Hi everyone~
The crawlers I wrote before each targeted one specific site, such as Sina Weibo. This time I need to search the whole web for sentences or articles related to "开灯" (turning on the light), but soup.findAll("li", {"class": "result"}) always comes back empty. Any help appreciated, thanks.
PS: I tried other examples from the web too, and they all fail at this same spot with an empty result...
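A minimal way to check whether the markup is even present in the raw HTML the crawler receives (a sketch only; it issues the same search request my code makes, with the keyword URL-encoded):

import urllib.request
import urllib.parse

# Fetch the search page exactly as the crawler would and grep the raw HTML.
url = 'http://news.baidu.com/ns?word=%s&tn=news&from=news&cl=2&rn=20&ct=1' % urllib.parse.quote('开灯')
html = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
print('class="result"' in html)  # False means the nodes never reach the crawler

If this prints False, the problem is not BeautifulSoup: the li.result nodes are simply not in the HTML the server returns.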
The full code follows:
# -*- coding: utf-8 -*-
__author__ = 'Daniel'
# Python 3.5
# Crawl news about "开灯" (turning on the light)
import re
import urllib.request
import urllib.parse
import chardet
from bs4 import BeautifulSoup
def remove_js_css(content):
    """Strip <script>, <style>, HTML comments, <meta> and <ins> blocks."""
    r = re.compile(r'''<script.*?</script>''', re.I | re.M | re.S)
    s = r.sub('', content)
    r = re.compile(r'''<style.*?</style>''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<!--.*?-->''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<meta.*?>''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<ins.*?</ins>''', re.I | re.M | re.S)
    s = r.sub('', s)
    return s
def remove_empty_line(content):
    """Collapse blank lines and runs of newlines."""
    r = re.compile(r'''^\s+$''', re.M | re.S)
    s = r.sub('', content)
    r = re.compile(r'''\n+''', re.M | re.S)
    s = r.sub('\n', s)
    return s
def remove_any_tag(s):
    """Strip every HTML tag and surrounding whitespace."""
    s = re.sub(r'''<[^>]+>''', '', s)
    return s.strip()

def remove_any_tag_but_a(s):
    """Return (length of text inside <a> tags, length of all text)."""
    text = re.findall(r'''<a[^r][^>]*>(.*?)</a>''', s, re.I | re.S)
    text_b = remove_any_tag(s)
    return len(''.join(text)), len(text_b)
def remove_image(s, n=50):
    """Replace each <img> tag with n filler characters."""
    image = 'a' * n
    r = re.compile(r'''<img.*?>''', re.I | re.M | re.S)
    s = r.sub(image, s)
    return s

def remove_video(s, n=1000):
    """Replace each <embed> tag with n filler characters."""
    video = 'a' * n
    r = re.compile(r'''<embed.*?>''', re.I | re.M | re.S)
    s = r.sub(video, s)
    return s
def sum_max(values):
    """Maximum-sum contiguous segment (Kadane-style); returns (left, right + 1)."""
    cur_max = values[0]
    glo_max = -99999
    left, right = 0, 0
    for index, value in enumerate(values):
        cur_max += value
        if cur_max > glo_max:
            glo_max = cur_max
            right = index
        elif cur_max < 0:
            cur_max = 0
    for i in range(right, -1, -1):
        glo_max -= values[i]
        if abs(glo_max) < 0.0001:  # was: abs(glo_max < 0.0001), a bug
            left = i
            break
    return left, right + 1
def method_1(content, k=1):
    """Score groups of k lines by (plain-text length - link-text length)
    and find the densest run, which should be the article body."""
    if not content:
        return None, None, None, None
    tmp = content.split('\n')
    group_value = []
    for i in range(0, len(tmp), k):
        group = '\n'.join(tmp[i:i + k])
        group = remove_image(group)
        group = remove_video(group)
        text_a, text_b = remove_any_tag_but_a(group)
        temp = (text_b - text_a) - 8
        group_value.append(temp)  # was: group_value.append(tmp), a bug
    left, right = sum_max(group_value)
    return left, right, len('\n'.join(tmp[:left])), len('\n'.join(tmp[:right]))
def extract(content):
    """Extract the main text block from a news page's HTML."""
    content = remove_empty_line(remove_js_css(content))
    left, right, x, y = method_1(content)
    return '\n'.join(content.split('\n')[left:right])
# Given a URL, extract the news page's body text and append it to a txt file.
def extract_news_content(web_url, file_name):
    request = urllib.request.Request(web_url)
    # Add a User-Agent header so the request looks like a browser.
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    opener = urllib.request.build_opener()
    html = opener.open(request).read()
    infoencode = chardet.detect(html)['encoding']  # detect the page encoding with a third-party module
    if html is not None and infoencode is not None:  # skip empty content or unknown encoding
        html = html.decode(infoencode, 'ignore')
        soup = BeautifulSoup(html, 'html.parser')
        content = soup.decode_contents()
        content_text = extract(content)  # extract the body text as one unbroken passage
        content_text = re.sub("&nbsp;", " ", content_text)
        content_text = re.sub("&gt;", "", content_text)
        content_text = re.sub("&quot;", '"', content_text)
        content_text = re.sub("<[^>]+>", "", content_text)
        content_text = re.sub("\n", "", content_text)
        with open(file_name, 'a', encoding='utf-8') as f:  # append
            f.write(content_text)
# Scrape Baidu News search results: Chinese search, first 10 pages; key_word is substituted into the url
def search(key_word):
    search_url = 'http://news.baidu.com/ns?word=key_word&tn=news&from=news&cl=2&rn=20&ct=1'
    # URL-encode the keyword, otherwise a Chinese query breaks the request
    req = urllib.request.urlopen(search_url.replace('key_word', urllib.parse.quote(key_word)))
    real_visited = 0
    for count in range(10):  # first 10 pages
        html = req.read()
        soup = BeautifulSoup(html, 'html.parser')
        content = soup.findAll("li", {"class": "result"})  # ResultSet object
        num = len(content)
        for i in range(num):
            # parse out each result's title, source, time and url first
            p_str = content[i].find('a')  # if there is no result this is a NoneType
            contenttitle = p_str.decode_contents()
            contenttitle = re.sub("<[^>]+>", "", contenttitle)
            contentlink = str(p_str.get("href"))
            # compare against the urls that were already crawled successfully
            try:
                with open(r'visited-cn.txt', 'r', encoding='utf-8') as visited_url:
                    visited_url_list = visited_url.readlines()
            except FileNotFoundError:
                visited_url_list = []
            exist = 0
            for item in visited_url_list:
                if contentlink == item.strip():
                    exist = 1
            if exist != 1:  # url not visited yet
                p_str2 = content[i].find('p').decode_contents()
                contentauthor = p_str2[:p_str2.find("  ")]  # source
                contenttime = p_str2[p_str2.find("  ") + len("  ") + 1:]  # time
                # one txt file per news item; the newscn directory must already exist
                real_visited += 1
                file_name = r"newscn\%d.txt" % (real_visited)
                with open(file_name, 'w', encoding='utf-8') as f:
                    f.write(contenttitle + '\n')
                    f.write(contentauthor + '\n')
                    f.write(contenttime + '\n')
                    f.write(contentlink + '\n')
                extract_news_content(contentlink, file_name)  # append the body text too
                visited_url_list.append(contentlink)
                # mark as visited on disk so the list survives a restart
                with open(r'visited-cn.txt', 'a', encoding='utf-8') as visited_url:
                    visited_url.write(contentlink + '\n')
            if len(visited_url_list) >= 120:
                break
        # parse the next page
        if count == 0:
            next_num = 0
        else:
            next_num = 1
        next_page = 'http://news.baidu.com' + soup('a', {'href': True, 'class': 'n'})[next_num]['href']  # pager link
        print(next_page)
        req = urllib.request.urlopen(next_page)
if __name__ == '__main__':
    # key_word = input('input key word:')
    key_word = 'helloworld'
    search(key_word)
2 answers:
- Go 旅城通票 2017-05-04 12:02
Sina Weibo loads its data dynamically via Ajax, so the displayed content is not in the page source. What a crawler receives is exactly what "view source" shows, which means content generated by Ajax/JS cannot be captured by a plain crawler (it is also bad for SEO).
You need to find Sina's Ajax data endpoint and request that interface directly for the data, instead of the URL shown in the address bar.
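A minimal sketch of that approach, assuming a hypothetical JSON endpoint (find the real path and parameters in the browser's F12 network panel; api_url and the response shape below are made up for illustration):

import json
import urllib.request

# Hypothetical Ajax endpoint, e.g. spotted in the network panel while the page loads.
api_url = 'http://example.com/api/search?q=%E5%BC%80%E7%81%AF&page=1'

req = urllib.request.Request(api_url)
req.add_header('User-Agent', 'Mozilla/5.0')  # some endpoints reject the default Python UA
with urllib.request.urlopen(req) as resp:
    data = json.loads(resp.read().decode('utf-8'))

# Walk whatever structure the endpoint actually returns; this shape is an assumption.
for item in data.get('results', []):
    print(item.get('title'), item.get('url'))

Since such an endpoint returns JSON, there is no HTML for BeautifulSoup to parse at all, which is exactly why findAll comes back empty on a JS-rendered page.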