from pyspider.libs.base_handler import *
import re
class Handler(BaseHandler):
    """pyspider handler that scrapes airport details from www.wego.cn.

    Crawl flow:
      1. ``on_start`` queues the alphabetical airport index pages.
      2. ``index_page`` extracts the per-airport links from each index page.
      3. ``detail_page`` extracts name, IATA code and coordinates from an
         airport page and returns them as the task result.
    """

    crawl_config = {
    }

    # Scheme + host shared by the seed URLs and the links found on index pages.
    _BASE_URL = "http://www.wego.cn"

    # Index pages are split by first letter; only these letters are crawled.
    _INDEX_LETTERS = "abcde"

    # Placeholder ("none") returned when a field's table row is not found.
    _MISSING = "无"

    # Captures the href of each hidden "extra-item" entry on an index page.
    # Raw strings keep the regex escapes (\s, \S) from being treated as
    # (invalid) Python string escapes.
    _LINK_PATTERN = re.compile(
        r'<li\sclass="extra-item\sis-hidden">\s+<a\shref="([\s+\S+]*?)"> \s+\S+'
    )

    # One two-cell table row: a 200px-wide label cell followed by the value
    # cell, whose content is captured non-greedily. ``%s`` is the label regex.
    _ROW_TEMPLATE = (
        r'<tr>\s+<td\swidth="200">%s</td>\s+<td>([\s+\S+]*?)</td>\s+</tr>'
    )

    # (result key, compiled row pattern) in the order the keys are emitted.
    # Labels: 全名 = full name, IATA 代码 = IATA code, 经度 = longitude,
    # 纬度 = latitude. The "Longtitude" spelling is kept on purpose so the
    # result schema stays compatible with already-stored results.
    _FIELD_PATTERNS = (
        ("AirportName", re.compile(_ROW_TEMPLATE % "全名")),
        ("IATA", re.compile(_ROW_TEMPLATE % r"IATA\s 代码")),
        ("Longtitude", re.compile(_ROW_TEMPLATE % "经度")),
        ("Latitude", re.compile(_ROW_TEMPLATE % "纬度")),
    )

    def __init__(self):
        """Build the list of seed index URLs (one per letter)."""
        # Seed URLs carry an explicit scheme so they do not depend on the
        # fetch backend guessing one, and so they match the URLs that
        # index_page builds.
        self.urls = [
            self._BASE_URL + "/airports/airport-name/" + letter
            for letter in self._INDEX_LETTERS
        ]

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue every seed index page; scheduled once every 24 hours."""
        for url in self.urls:
            # validate_cert=False: the crawl does not verify TLS certificates.
            self.crawl(url, callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue a detail-page crawl for each airport link on an index page.

        Args:
            response: pyspider response for an index page; only
                ``response.text`` is read.
        """
        for path in self._LINK_PATTERN.findall(response.text):
            # Captured hrefs are appended to the site root as-is.
            self.crawl(
                self._BASE_URL + path,
                callback=self.detail_page,
                validate_cert=False,
            )

    @config(priority=2)
    def detail_page(self, response):
        """Extract the airport fields from a detail page.

        Args:
            response: pyspider response for an airport page; ``response.text``
                and ``response.url`` are read.

        Returns:
            dict with keys ``url``, ``AirportName``, ``IATA``, ``Longtitude``
            and ``Latitude``. Each field holds the raw content of the first
            matching table cell, or ``"无"`` when the row is not present.
        """
        print("------enter matching--------")
        html = response.text
        result = {"url": response.url}
        for key, pattern in self._FIELD_PATTERNS:
            # Only the first occurrence is used, so search() is sufficient.
            found = pattern.search(html)
            result[key] = found.group(1) if found else self._MISSING
        return result