最近在学习爬虫,参考了大佬的代码,但是爬取结果没有表头,而且第三个爬取的数据并没有插入 Excel 中。
import requests
from lxml import etree
from openpyxl import Workbook
import random
class tengxun():
    """Scrape course cards from ke.qq.com list pages into an Excel workbook.

    The spider downloads a fixed range of listing pages, extracts the title,
    link and enrolment text of every course card, and appends one worksheet
    row per course below a header row.
    """

    # Listing URL; ``{}`` is replaced with the page number.
    URL_TEMPLATE = 'https://ke.qq.com/course/list?mt=1001&page={}'

    # Headers sent with every request.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0",
        "Connection": "keep-alive",
    }

    # Column names, written once as the first worksheet row.
    COLUMNS = ['title', 'link', 'now_reader']

    # Where ``run`` stores the workbook when no path is given.
    OUTPUT_PATH = r'C:\Users\Administrator\Desktop\resource\UA_ls\demo_09 try.xlsx'

    def __init__(self):
        """Set up the request settings and a workbook that starts with the header row."""
        self.url = self.URL_TEMPLATE
        # Copy so that per-instance tweaks never leak into the class constant.
        self.header = dict(self.HEADERS)
        # One workbook for the whole crawl; every page appends to the same sheet.
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(list(self.COLUMNS))

    def geturl(self):
        """Return the listing URLs for pages 1 through 4."""
        return [self.url.format(page) for page in range(1, 5)]

    def prase_url(self, url):
        """Download ``url`` and return the response body as text.

        Raises whatever ``requests.get`` raises (for example on a timeout).
        """
        response = requests.get(url, headers=self.header, timeout=5)
        # NOTE(review): the body is decoded as GBK and undecodable bytes are
        # dropped. If the site actually serves UTF-8, Chinese text will come
        # out garbled - confirm the page charset before relying on this.
        return response.content.decode('gbk', 'ignore')

    def get_list(self, html_str):
        """Extract one dict per course card from a listing page.

        Args:
            html_str: HTML source of a listing page.

        Returns:
            A list of dicts with the keys ``title``, ``link`` and
            ``now_reader``. A field whose XPath matches nothing is ``''``.
        """
        if not html_str:
            return []
        html = etree.HTML(html_str)
        if html is None:
            # Nothing parseable in the document.
            return []

        connect_list = []
        for card in html.xpath("//li[@class ='course-card-item']"):
            title = ''.join(card.xpath("./h4/a[@class = 'item-tt-link']/text()"))
            link = ''.join(card.xpath("./a[@class = 'item-img-link']/@href"))
            # Exact ``@class`` equality on both the wrapper <div> and the
            # <span> stops matching as soon as either element carries an
            # extra class or different spacing. Matching the ``item-user``
            # token anywhere inside the card still finds every node the
            # stricter query found.
            # NOTE(review): confirm the selector against the live markup.
            now_reader = ''.join(
                card.xpath(".//span[contains(@class, 'item-user')]/text()")
            )
            connect_list.append({
                'title': title.strip(),
                'link': link.strip(),
                'now_reader': now_reader.strip(),
            })
        return connect_list

    def save_list(self, connects):
        """Append one worksheet row per scraped course.

        Rows go to the workbook created in ``__init__``; creating a new
        workbook here would throw away the header row and earlier pages.
        """
        for connect in connects:
            self.ws.append([connect['title'], connect['link'], connect['now_reader']])
        print('保存成功页招聘信息')

    def run(self, output_path=None):
        """Crawl every listing page and save the workbook once at the end.

        Args:
            output_path: Destination ``.xlsx`` path; defaults to
                ``OUTPUT_PATH`` when omitted.
        """
        for url in self.geturl():
            html_str = self.prase_url(url)
            self.save_list(self.get_list(html_str))
        self.wb.save(output_path or self.OUTPUT_PATH)
if __name__ == '__main__':
    # Script entry point: build the spider and crawl when run directly.
    tengxun().run()