目的是爬取网页“北京到上海机票预订 - 北京到上海机票预约 - 同程机票预订”(ly.com)中的机票信息。设计思路是:先使用 BeautifulSoup 库的 find_all() 函数获取所有 <div class="flight-item-head data-v-13439d30"> 标签的内容,再使用正则表达式提取更细致的信息。但是,在第一步调用 find_all() 之后,并没有获取到网页中所有符合要求的 div 标签的内容,想知道这是为什么。
import re
import urllib.error
import urllib.request
from tkinter import *

from bs4 import BeautifulSoup
def checkChinese(InPut):
    """Return True when every character of *InPut* has a code point above 255.

    Despite the name, the check is not specific to Chinese characters: any
    character outside the 0-255 code point range passes. A single character
    with a code point of 255 or below (ASCII letters, digits, punctuation,
    Latin-1 letters) makes the result False.

    An empty input contains no such character, so the result is True.
    """
    return all(ord(ch) > 255 for ch in InPut)
def checkDigit(InPut):
    """Return the result of ``InPut.isdigit()``.

    For a string this is True only when it is non-empty and consists solely
    of digit characters.
    """
    only_digits = InPut.isdigit()
    return only_digits
def askForUrl(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request is sent with a desktop Chrome User-Agent header.

    Returns:
        The decoded body, or an empty string when the request fails with
        ``urllib.error.URLError`` (which includes ``HTTPError``). In that
        case the HTTP status code and/or the failure reason are printed.

    Raises:
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    # Well-formed User-Agent: no spaces around "/" and a dotted Chrome version.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.122 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    return html
def setUrl(baseurl, locDict):
    """Build the flight-search URL from the input objects and a location table.

    Reads five values through ``.get()`` on the module-level objects ``e1``
    to ``e5`` (defined elsewhere; presumably tkinter entry widgets - confirm):
    departure place, arrival place, year, month and day.

    Args:
        baseurl: URL prefix that the route and query string are appended to.
        locDict: Mapping from the place name as entered to its lookup value.

    Returns:
        ``baseurl`` followed by ``<dep>-<arr>?date=<year>-<month>-<day>``.

    Raises:
        KeyError: if an entered place name is not a key of ``locDict``.
    """
    dep_name = str(e1.get())
    arr_name = str(e2.get())
    # The code is taken as characters 2..4 of the stringified lookup value.
    # NOTE(review): this presumably strips the "['" of a one-element list such
    # as ['BJS'] - verify against how locDict is built.
    dep_code = str(locDict[dep_name])[2:5]
    arr_code = str(locDict[arr_name])[2:5]
    year = str(e3.get())
    month = str(e4.get())
    day = str(e5.get())
    route_part = dep_code + '-' + arr_code
    date_part = '-'.join((year, month, day))
    return baseurl + route_part + '?date=' + date_part
def searchForBaseInfo(baseurl1, locDict1, flightNameList1, depTime1, ariTime1, depAirport1, ariAirport1, ifAddDays1):
    """Download the flight list page and append one entry per flight to each output list.

    The URL is built with ``setUrl`` and fetched with ``askForUrl``. Every
    ``<div>`` carrying the CSS class ``flight-item`` is converted to text and
    searched with the module-level patterns ``findFlightName``,
    ``findDepTime``, ``findAriTime``, ``findDepAirport``, ``findAriAirport``
    and ``findIfAddDays``.

    Args:
        baseurl1: URL prefix passed on to ``setUrl``.
        locDict1: Location lookup table passed on to ``setUrl``.
        flightNameList1: Output list; receives the first flight-name match.
        depTime1: Output list; receives the departure time as ``'<a>:<b>'``.
        ariTime1: Output list; receives the arrival time as ``'<a>:<b>'``.
        depAirport1: Output list; receives the first departure-airport match.
        ariAirport1: Output list; receives the first arrival-airport match.
        ifAddDays1: Output list; receives the first ``findIfAddDays`` match,
            or ``'n'`` when the pattern does not match.

    An item in which any of the five mandatory patterns fails to match is
    skipped as a whole, so the output lists always keep equal lengths.
    """
    url1 = setUrl(baseurl1, locDict1)
    html = askForUrl(url1)
    soup = BeautifulSoup(html, 'lxml')
    # NOTE(review): find_all() can only return elements present in `html`,
    # i.e. the raw HTTP response. If the site fills in the flight list with
    # JavaScript after the page loads, those elements are not in `html` and
    # cannot be found here - compare print(html) with the browser's DOM.
    # Also, class_='flight-item' matches the class token "flight-item" only;
    # an element whose class is "flight-item-head" is not matched by it.
    for item in soup.find_all('div', class_='flight-item'):
        item_html = str(item)
        print(item_html)

        # Extract every field before touching the output lists, so a partly
        # matching item cannot leave the lists with different lengths.
        flight_names = re.findall(findFlightName, item_html)
        dep_times = re.findall(findDepTime, item_html)
        ari_times = re.findall(findAriTime, item_html)
        dep_airports = re.findall(findDepAirport, item_html)
        ari_airports = re.findall(findAriAirport, item_html)
        if not (flight_names and dep_times and ari_times and dep_airports and ari_airports):
            print('skipped a flight item with missing fields')
            continue

        flightNameList1.append(flight_names[0])
        depTime1.append(dep_times[0][0] + ':' + dep_times[0][1])
        ariTime1.append(ari_times[0][0] + ':' + ari_times[0][1])
        depAirport1.append(dep_airports[0])
        ariAirport1.append(ari_airports[0])

        add_days = re.findall(findIfAddDays, item_html)
        ifAddDays1.append(add_days[0] if add_days else 'n')