from bs4 import BeautifulSoup
import xlwt
import urllib.request
import urllib.error
import re


def main():
    """Entry point: fetch the target index page.

    Saving is not implemented yet, so the saveData call stays commented
    out; `save` is the intended output path for when it is wired up.
    """
    baseurl = "http://www.xbiquge.la/1/1370/"  # page to crawl
    datalist = getData(baseurl)
    save = ".//电影信息.xls"  # intended output workbook path (unused until saveData works)
    # saveData(datalist)
    # askURl("http://58921.com/film/new")


def getData(baseurl):
    """Fetch the HTML of each index page and return the collected data list.

    Parsing is not implemented yet, so the returned list is always empty.

    NOTE(review): `baseurl + str(i)` appends "0" to a URL that already
    ends in "/" (".../1370/0") — confirm this is the intended page address.
    """
    datalist = []
    for i in range(0, 1):
        url = baseurl + str(i)
        html = askURl(url)  # page source; unused until parsing is implemented
        # soup = BeautifulSoup(html, "html.parser")
    return datalist


def askURl(url):
    """Request *url* with browser-like headers and return the decoded HTML.

    Returns an empty string on failure; HTTP status codes and error
    reasons are printed rather than raised, so callers never see an
    exception from here.

    NOTE(review): the Host header is hard-coded to www.xbiquge.la, which
    would be wrong for any other host (e.g. the commented 58921.com call)
    — verify before reusing this helper elsewhere.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image"
                  "/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Host": "www.xbiquge.la",
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # timeout keeps a dead server from hanging the crawl forever;
        # the with-statement guarantees the connection is closed.
        with urllib.request.urlopen(request, timeout=10) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def saveData(save):
    """Placeholder: persist the scraped data to *save* (not implemented)."""
    print("第一条")


if __name__ == "__main__":
    main()
# This is the code; the URL below is the site being scraped.