python3.2
requests.exceptions.MissingSchema: Invalid URL 'a': No scheme supplied. Perhaps you meant https://a/?
这是什么意思,在网上搜索都说是缺少http,或者代码不完全,但是我的是完整的啊root_url = 'http://www.baidu.com%27/
而且每次运行引号里面的单个字母要不然就是a,h,y还有/,帮帮忙,卡了一周了
爬虫部分
import re
from ruan.uilts import url_manager
import requests
from bs4 import BeautifulSoup
#设置入口
root_url = 'http://www.baidu.com'
urls=url_manager.urlmanager()
urls.add_new_urls(root_url)
fout=open("craw_all_pages.text","w")
while urls.has_new_url():
curr_url=urls.get_url()
r = requests.get(curr_url,timeout=3)
if r.status_code !=200:
print("error,return status_code is not 200",curr_url)
continue
soup=BeautifulSoup(r.text,"html_parser")
title=soup.title.string
fout.write("%s\t%s\n"%(curr_url.title))
print("succes:%s,%s,%s"%(curr_url.title,len(urls.new_urls)))
links= soup.find_all('a')
for link in links:
href=link["href"]
if href is None:
continue
pattern=r'^http://www.crazyant.net/\d+.html$'
if re.match(pattern,href):
urls.add_new_url(href)
fout.close()
url_manager部分
class urlmanager():
def __init__(self):
self.new_urls=set()#定义带爬取的url容器
self.old_urls=set()#定义已经被爬取的url容器
def add_new_url(self,url):#添加单个url
if url is None or len(url)==0:
return
if url in self.new_urls or url in self.old_urls:#判断url是否再新添加url里面或者再旧的url里面,都表示存在都返回
return
self.new_urls.add(url)
def add_new_urls(self,urls):#添加的url的集合,从中判断是否为新的或者旧的。新的添加入new_urls,反之添加入old_urls
if urls is None or len(urls)==0:
return
for url in urls:
self.add_new_url(url)
def get_url(self):
if self.has_new_url():
url=self.new_urls.pop()#pop方法可以从一个集合中移除一个元素并且返回
self.old_urls.add(url)#将移除的url标记成旧的url
return url
def has_new_url(self):#判断容器中是否还有url待爬取
return len(self.new_urls) > 0