response 返回不了东西,运行结果只有一个 None。我打印了一下 data,
发现什么都没有。因为这个网站是外网,返回不到结果是不是与这个因素有关呢?
还希望大佬能跑一下,解答一下我的问题。
from urllib.parse import urljoin

import requests
from lxml import html
etree = html.etree
class News(object):
    """Crawler for a China Times newspaper listing.

    Starting at ``self.url``, fetches each listing page, extracts the
    headline title and link of every entry, prints them, and follows the
    "next page" link until there is none left.
    """

    # Site root that hrefs taken from the page are resolved against.
    BASE_URL = 'https://www.chinatimes.com'

    # Seconds to wait for the server; without a timeout requests.get can
    # block forever when the site is unreachable from the current network.
    TIMEOUT = 10

    def __init__(self):
        self.url = 'https://www.chinatimes.com/newspapers/260118'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }

    def _absolute_url(self, href):
        """Resolve *href* against the site root and return the full URL."""
        return urljoin(self.BASE_URL, href)

    def get_data(self, url):
        """Download *url* and return the raw response body as bytes.

        Raises:
            requests.exceptions.RequestException: on connection failure,
                timeout, or an HTTP error status (4xx/5xx).
        """
        response = requests.get(url, headers=self.headers, timeout=self.TIMEOUT)
        # Surface blocked/failed requests instead of parsing an error page
        # and silently ending up with no results.
        response.raise_for_status()
        return response.content

    def parse_data(self, data):
        """Extract headlines and the next-page URL from one listing page.

        Args:
            data: the page body as returned by ``get_data``.

        Returns:
            A ``(data_list, next_url)`` tuple. ``data_list`` is a list of
            ``{'title': ..., 'link': ...}`` dicts; ``next_url`` is the
            absolute URL of the next page, or ``None`` when no next-page
            link is found.
        """
        if not data:
            return [], None

        # Hand the body to lxml undecoded so the parser can apply the
        # charset declared by the page rather than assuming UTF-8.
        tree = etree.HTML(data)
        if tree is None:
            return [], None

        # Select the <a> itself: that is the element carrying @href. The
        # original path ended in /font, which has no href.
        # NOTE(review): the <font> wrapper was presumably injected by
        # in-browser translation and is absent from the served HTML - confirm.
        # NOTE(review): this absolute path is copied from the original and is
        # sensitive to any layout change on the site.
        anchors = tree.xpath(
            '/html/body/div[2]/div/div[2]/div/section/ul/li/div/div/div/h3/a'
        )

        data_list = []
        for anchor in anchors:
            href = anchor.get('href')
            if not href:
                continue
            data_list.append({
                # string(.) gathers the text of the anchor and any nested tags.
                'title': anchor.xpath('string(.)').strip(),
                'link': self._absolute_url(href),
            })

        # URL of the next page, if the pager exposes one.
        next_hrefs = tree.xpath(
            '/html/body/div[2]/div/div[2]/div/section/nav/ul/li[7]/a/@href'
        )
        next_url = self._absolute_url(next_hrefs[0]) if next_hrefs else None
        return data_list, next_url

    def save_data(self, data_list):
        """Print every extracted record."""
        for data in data_list:
            print(data)

    def run(self):
        """Crawl from ``self.url`` until no unseen next page remains."""
        next_url = self.url
        # Remember visited pages so a pager that links back to an earlier
        # page cannot trap the crawler in an endless loop.
        visited = set()
        while next_url is not None and next_url not in visited:
            visited.add(next_url)
            data = self.get_data(next_url)
            data_list, next_url = self.parse_data(data)
            self.save_data(data_list)
            print(next_url)
# Run the crawler only when this file is executed as a script.
# The original compared the undefined name `name`, which raises NameError.
if __name__ == '__main__':
    news = News()
    news.run()