第一次写爬虫,想把网页保存到本地,并把网页中图片的地址替换为本地图片。但用 BeautifulSoup 修改标签属性后似乎没有生效:下面是尝试爬取百度首页的代码,保存到本地之后发现 img 标签的 src 属性并没有发生改变。
import urllib
import urllib2
from bs4 import BeautifulSoup
from HTMLParser import HTMLParser
def callBack(a, b, c):
    """Print a progress percentage computed as ``100 * a * b / c``, capped at 100.

    The three-argument shape matches the ``reporthook`` callback of
    ``urllib.urlretrieve`` (block count, block size, total size), although
    nothing in this file registers it as one.

    Args:
        a: Multiplier of ``b``; presumably the number of blocks transferred.
        b: Presumably the size of one block in bytes.
        c: Total against which progress is measured. Zero or negative is
            treated as "unknown" and nothing is printed.
    """
    if c <= 0:
        # A reporthook may receive -1 when the size is unknown; a percentage
        # is meaningless then, and c == 0 would raise ZeroDivisionError.
        return
    per = min(100.0, 100.0 * a * b / c)
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("%.2f%%" % per)
def getFileData(addr):
    """Download ``addr`` into ``d://`` under the last segment of its path.

    The download is best effort: a failure is reported on stdout and is not
    raised to the caller.

    Args:
        addr: URL of the file to fetch.

    Returns:
        The text after the final ``/`` in ``addr``, which is also the local
        file name used. It is returned even when the download failed.
    """
    # Derive the name before the ``try`` so that both the error message and
    # the return statement can always reference it.
    fName = addr.split('/')[-1]
    print(fName)
    try:
        urllib.urlretrieve(addr, 'd://' + fName)
    except Exception as e:
        # Kept deliberately broad: a failed download must not abort the caller.
        print('Cannot download:%s:%s' % (fName, e))
    return fName
def getHtml(url, url0):
    """Save the page at ``url`` to ``d://baidu.html`` with its first image localised.

    The image at ``url0`` is downloaded through ``getFileData`` and the first
    ``<img>`` tag in the page gets that local file name as its ``src``.

    Args:
        url: Address of the HTML page to fetch.
        url0: Address of the image to download and reference from the page.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    bsObj = BeautifulSoup(html, "html.parser")
    fName = getFileData(url0)

    tag = bsObj.find(name="img")
    if tag is None:
        # find() returns None when nothing matches; the page is still saved.
        print('No <img> tag found in %s' % url)
    else:
        # A bare file name works as src because getFileData stores the image
        # in the same d:// location as the page written below.
        tag['src'] = fName

    # Serialise the modified tree. Writing ``html`` here would discard the
    # edit, because changes to bsObj never touch the string it was parsed from.
    # Encoding explicitly and writing bytes avoids any dependence on the
    # platform's default text encoding.
    with open("d://baidu.html", "wb") as f:
        f.write(bsObj.encode("utf-8"))
# Address of the image that the saved page should reference locally.
url0 = "https://ss0.bdstatic.com/5aV1bjqh_Q23odCf/static/superman/img/logo/bd_logo1_31bdc765.png"
# Page to fetch and store on disk.
url = "http://www.baidu.com"
getHtml(url=url, url0=url0)