The code is as follows:
import time
import random
import urllib.request  # note: "import urllib" alone does not expose urllib.request
from bs4 import BeautifulSoup

headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
class UserAgent():
    def _get_UA(self, html):
        # parse the proxy table and collect "ip:port" strings
        soup = BeautifulSoup(html, "html.parser")
        ip_get = []
        ip_list = soup.find_all("tr")
        for i in range(1, len(ip_list)):
            ip_both = ip_list[i].find_all("td")
            front = ip_both[1].text + ':'
            ip_get.append(front + ip_both[2].text)
        time.sleep(random.randint(15, 20))
        return ip_get

    def _get_html(self, html):
        if html is None:
            this_html = urllib.request.urlopen('https://www.xicidaili.com/nn/1')
        else:
            soup = BeautifulSoup(html, "html.parser")
            next_page_url = soup.find("a", class_="next_page")
            print(next_page_url)
            # the traceback below points at this line
            html = urllib.request.urlopen('https://www.xicidaili.com' + next_page_url)
            this_html = html
        return this_html
The error is in the code inside the else branch of the _get_html method. The URL being passed in is fine; I can open https://www.xicidaili.com/nn/1 normally in a browser.
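For reference, soup.find("a", class_="next_page") returns a bs4 Tag when the link exists and None when it does not, and neither can be concatenated to a str directly. A defensive sketch of that else branch (just an illustration, not the final fix) might look like:

link = soup.find("a", class_="next_page")
if link is None or link.get('href') is None:
    # e.g. the site answered with a page that has no pagination link
    raise RuntimeError("next_page link not found")
this_html = urllib.request.urlopen('https://www.xicidaili.com' + link.get('href'))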
The main driver code is as follows:
n = User_Agent.UserAgent()
ip_html = n._get_html(None)
fake_ip = n._get_UA(ip_html)
ip_html = n._get_html(ip_html)
And the traceback reads:
Traceback (most recent call last):
File "E:\java4412\spider_demo\book_spider\main.py", line 21, in <module>
None
ip_html = n._get_html(ip_html)
File "E:\java4412\spider_demo\book_spider\User_Agent.py", line 35, in _get_html
html = urllib.request.urlopen('https://www.xicidaili.com'+next_page_url)
TypeError: Can't convert 'NoneType' object to str implicitly
Could some expert take a look and tell me what is wrong with this code? This newbie is about to go crazy...
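The TypeError itself only says that a str and None were concatenated, and the stray "None" interleaved in the traceback is apparently the print(next_page_url) output, which confirms that find() matched nothing. A minimal reproduction, assuming the server answered with a page that lacks the pagination link:

from bs4 import BeautifulSoup

# a page with no <a class="next_page"> link, e.g. an anti-crawler block page
soup = BeautifulSoup("<html><body>blocked</body></html>", "html.parser")
print(soup.find("a", class_="next_page"))  # prints None
# 'https://www.xicidaili.com' + None then raises exactly the TypeError above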
========================= divider =========================
Problem solved.
The cause was that I had been using one fixed header the whole time.
I found a collection of User-Agent strings someone else had gathered and now switch the header randomly in the code, roughly as in the sketch below.
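Headers here is the module wrapping that collection; roughly, getheaders() just returns one random User-Agent string per call, along the lines of this sketch (the pool below is a placeholder, the real collection is much larger):

import random

class Headers:
    # placeholder pool; substitute the full collected list
    _USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15",
        "Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0",
    ]

    @staticmethod
    def getheaders():
        # return one random User-Agent string per call
        return random.choice(Headers._USER_AGENTS)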
The updated code is as follows:
class UserAgent():
    def _get_UA(self, soup):
        # install a freshly randomized User-Agent before each batch of work
        headers = ("User-Agent", Headers.getheaders())
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
        # soup is already parsed, so no BeautifulSoup call is needed here
        ip_get = []
        ip_list = soup.find_all("tr")
        for i in range(1, len(ip_list)):
            ip_both = ip_list[i].find_all("td")
            front = ip_both[1].text + ':'
            ip_get.append(front + ip_both[2].text)
        time.sleep(random.randint(15, 20))
        return ip_get

    def _get_html_first(self):
        # fetch and parse the first page
        headers = ("User-Agent", Headers.getheaders())
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
        this_html = urllib.request.urlopen('https://www.xicidaili.com/nn/1')
        soup = BeautifulSoup(this_html, "html.parser")
        return soup

    def _get_soup(self, soup):
        # follow the next_page link; .get('href') extracts the URL string
        headers = ("User-Agent", Headers.getheaders())
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
        next_page_url = soup.find("a", class_="next_page").get('href')
        print(next_page_url)
        html = urllib.request.urlopen('https://www.xicidaili.com' + next_page_url)
        soup = BeautifulSoup(html, 'html.parser')
        return soup
With these changes it runs correctly. The print() calls are ones I added to verify the results.
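For completeness, a driver matching the original main.py flow, assuming the revised class and the Headers module are importable; note that the revised methods pass the parsed soup between calls instead of the raw urlopen response, so the response body is never read twice:

n = User_Agent.UserAgent()
soup = n._get_html_first()   # fetch and parse page 1
fake_ip = n._get_UA(soup)    # collect "ip:port" strings from page 1
soup = n._get_soup(soup)     # follow the next_page link to page 2
fake_ip += n._get_UA(soup)   # append the page 2 results
print(fake_ip)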