先贴上代码
def getInlinks(bsobj,inURL): #寻找网页的内链
inURL=urlparse(inURL).scheme+"://"+urlparse(inURL).netloc
inlinks=[]
for link in bsobj.findALL("a",href=re.compile("^(/|.*"+inURL+")")):
#寻找以/开头或者包含当前URL的链接
if link.attrs["href"] is not None:
if link.attrs["href"] not in inlinks:
if(link.attrs["href"].startswith('/')):
inlinks.append(inURL+link.attrs["href"])
else:
inlinks.append(link.attrs['href'])
return inlinks
def getOutlinks(bsobj, outURL):
    """Collect the external links of a page.

    An "external" link starts with "http" or "www" and does NOT contain the
    current site's host anywhere in it (enforced with a negative lookahead).

    :param bsobj: a BeautifulSoup document object for the fetched page
    :param outURL: the current site's host (e.g. urlparse(url).netloc)
    :return: list of external-link URLs, de-duplicated, in document order
    """
    # BUG FIX: find_all, not findALL — bs4 resolves an unknown attribute as a
    # tag search that returns None, producing the
    # "TypeError: 'NoneType' object is not callable" seen in the traceback.
    # re.escape prevents dots in the host from acting as regex wildcards.
    pattern = re.compile("^(http|www)((?!" + re.escape(outURL) + ").)*$")
    outlinks = []
    for link in bsobj.find_all("a", href=pattern):
        href = link.attrs['href']
        if href is not None and href not in outlinks:
            outlinks.append(href)
    return outlinks
def getlink(startURL):
    """Fetch startURL and return a random external link found on it.

    If the page has no external links, hop to a random internal link and
    recurse until an external link turns up.

    :param startURL: absolute URL of the page to fetch
    :return: one external-link URL chosen at random
    :raises IndexError: if a page has neither external nor internal links
    NOTE(review): there is no visited-set or depth limit, so a site whose
    pages never link outward can recurse indefinitely.
    """
    html = urlopen(startURL)
    bsobj = BeautifulSoup(html, "lxml")
    outlinks = getOutlinks(bsobj, urlparse(startURL).netloc)
    if not outlinks:
        print("没有可跳转的外部网站")
        domain = urlparse(startURL).scheme + "://" + urlparse(startURL).netloc
        inlinks = getInlinks(bsobj, domain)
        # random.choice replaces the manual randint indexing.
        return getlink(random.choice(inlinks))
    return random.choice(outlinks)
def followlinks(startURL, hops=5):
    """Follow a chain of random external links, making at most `hops` jumps.

    BUG FIX: the original version counted jumps in a module global `a` that
    was never initialized (instant NameError on first call) and, when seeded
    with a=0, actually printed six URLs instead of the intended five. The
    counter is now a parameter with a default, so `followlinks(url)` keeps
    working for existing callers and performs exactly five jumps.

    :param startURL: URL to start crawling from
    :param hops: remaining number of jumps to perform (default 5)
    """
    next_url = getlink(startURL)
    print("将要跳转的网站是" + next_url)
    if hops > 1:
        followlinks(next_url, hops - 1)
followlinks("http://www.baidu.com/")
报错的原因是:
Traceback (most recent call last):
File "C:/Users/18022863809/Desktop/python_work/爬虫.py", line 55, in
followlinks("http://www.baidu.com/")
File "C:/Users/18022863809/Desktop/python_work/爬虫.py", line 49, in followlinks
outlinks=getlink(startURL)
File "C:/Users/18022863809/Desktop/python_work/爬虫.py", line 39, in getlink
outlinks=getOut(bsobj,urlparse(startURL).netloc)
File "C:/Users/18022863809/Desktop/python_work/爬虫.py", line 26, in getOut
    for link in bsobj.findALL("a",href=re.compile("^(http|www)((?!"+outURL+").)*$")):
TypeError: 'NoneType' object is not callable

（原因：BeautifulSoup 没有 `findALL` 这个方法——正确写法是 `find_all`。访问不存在的属性时 bs4 会把它当作标签名去查找并返回 None，随后对 None 加括号调用就抛出了这个 TypeError。）
TypeError: 'NoneType' object is not callable