ds1231h
ds1231h
2017-05-04 10:05

Python爬虫soup.findAll("li", {"class": "result"})为空

20
  • 爬虫
  • python
 大家好~
我之前写的爬虫是单独针对某个页面的,比如新浪微博。这次需要在全网范围查询关于“开灯”有关的句子或文章,但是soup.findAll("li", {"class": "result"})的结果一直是空。求助,谢谢。
PS:网上参考其他的例子,也都是在这个地方出问题,空的。。。
代码如下:
#-*-coding:utf-8-*-
__author__ = 'Daniel'
#python3.5
#'爬取关于开灯的新闻'

import re
import urllib
import urllib.request

import chardet
from bs4 import BeautifulSoup

def remove_js_css(content):
    """Strip <script>, <style>, HTML comment, <meta> and <ins> blocks from HTML.

    Bug fix: the original pattern ``<scrip.*?</scrip>`` required the literal
    closing sequence ``</scrip>`` and therefore never matched a real
    ``</script>`` tag, leaving all scripts in the page.

    :param content: raw HTML string
    :return: HTML with the above elements removed
    """
    s = content
    for pattern in (r'<script.*?</script>',
                    r'<style.*?</style>',
                    r'<!--.*?-->',
                    r'<meta.*?>',
                    r'<ins.*?</ins>'):
        s = re.sub(pattern, '', s, flags=re.I | re.M | re.S)
    return s

def remove_empty_line(content):
    """Drop whitespace-only lines, then collapse runs of newlines into one."""
    blank_line = re.compile(r'''^\s+$''', re.M | re.S)
    newline_run = re.compile(r'''\n+''', re.M | re.S)
    return newline_run.sub('\n', blank_line.sub('', content))

def remove_any_tag(s):
    """Delete every HTML tag from *s* and trim surrounding whitespace."""
    return re.sub(r'''<[^>]+>''', '', s).strip()

def remove_any_tag_but_a(s):
    """Return (total length of text inside <a> elements, length of the
    fully tag-stripped text of *s*).

    Note: despite the name, this returns two lengths used for scoring,
    not a filtered string.
    """
    # Helper inlined: strip all tags, then trim, exactly like remove_any_tag.
    anchor_texts = re.findall(r'''<a[^r][^>]*>(.*?)</a>''', s, re.I | re.S)
    plain = re.sub(r'''<[^>]+>''', '', s).strip()
    return len(''.join(anchor_texts)), len(plain)

def remove_image(s, n=50):
    """Replace each <img ...> tag with a run of *n* placeholder characters,
    so images count as roughly n characters of content when scoring."""
    placeholder = 'a' * n
    return re.sub(r'''<img.*?>''', placeholder, s, flags=re.I | re.M | re.S)

def remove_video(s, n=1000):
    """Replace each <embed ...> tag with a run of *n* placeholder characters,
    so embedded video counts heavily as content when scoring."""
    placeholder = 'a' * n
    return re.sub(r'''<embed.*?>''', placeholder, s, flags=re.I | re.M | re.S)

def sum_max(values):
    cur_max = values[0]
    glo_max = -99999
    left, right = 0, 0
    for index, value in enumerate(values):
        cur_max += value
        if(cur_max > glo_max):
            glo_max = cur_max
            right = index
        elif(cur_max < 0):
            cur_max = 0

    for i in range(right, -1, -1):
        glo_max -= values[i]
        if abs(glo_max < 0.0001):
            left = i
            break
    return left, right + 1

def method_1(content, k = 1):
    """Score each group of *k* lines and find the densest run of text.

    Each group's score is (plain-text length - anchor-text length - 8);
    images/videos are first replaced by placeholder runs so they count as
    content. The max-sum run of scores (via ``sum_max``) is taken as the
    article body.

    :param content: HTML string (newline-separated lines)
    :param k: group size in lines
    :return: (left, right, char offset of left, char offset of right),
             in group units (line units when k == 1), or
             (None, None, None, None) for empty input.

    Bug fix: the original appended ``tmp`` (the whole line list) to
    ``group_value`` instead of the computed score ``temp``, which made
    ``sum_max`` compare lists (TypeError on Python 3) and left the score
    unused.
    """
    if not content:
        return None, None, None, None
    lines = content.split('\n')
    scores = []
    for i in range(0, len(lines), k):
        group = '\n'.join(lines[i:i + k])
        group = remove_image(group)
        group = remove_video(group)
        anchor_len, text_len = remove_any_tag_but_a(group)
        scores.append((text_len - anchor_len) - 8)
    left, right = sum_max(scores)
    return left, right, len('\n'.join(lines[:left])), len('\n'.join(lines[:right]))

def extract(content):
    """Pull the main body text out of a raw HTML page.

    Strips scripts/styles and blank lines, scores the lines with
    ``method_1``, and returns the winning run of lines joined by newlines.
    """
    cleaned = remove_empty_line(remove_js_css(content))
    left, right, _, _ = method_1(cleaned)
    return '\n'.join(cleaned.split('\n')[left:right])

# Given a URL, append the extracted article body of the news page to a txt file.
def extract_news_content(web_url, file_name):
    """Fetch *web_url*, extract its article body, and append it to *file_name*.

    Spoofs a browser User-Agent (many sites reject the default Python one),
    auto-detects the page encoding with chardet, strips markup and common
    HTML entities, and writes the flattened text. No return value.

    Bug fixes vs. the original (which claimed Python 3.5):
    - ``urllib.Request`` / ``urllib.build_opener`` do not exist on Python 3;
      they live in ``urllib.request``.
    - ``soup.renderContents()`` returns bytes, which crashed ``extract``'s
      str regexes; ``str(soup)`` is used instead.
    - The output file is closed via a context manager and opened with an
      explicit UTF-8 encoding.
    """
    request = urllib.request.Request(web_url)
    # Add a header so the request looks like it comes from a browser.
    request.add_header('User-Agent','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    opener = urllib.request.build_opener()
    html = opener.open(request).read()
    # Third-party chardet sniffs the page encoding from the raw bytes.
    infoencode = chardet.detect(html)['encoding']
    if html and infoencode:  # only proceed when we fetched something decodable
        text = html.decode(infoencode, 'ignore')
        soup = BeautifulSoup(text)
        content_text = extract(str(soup))  # article body as one block of lines
        # Normalize leftover entities/markup and flatten to a single line.
        content_text = re.sub("&nbsp;", " ", content_text)
        content_text = re.sub("&gt;", "", content_text)
        content_text = re.sub("&quot;", '""', content_text)
        content_text = re.sub("<[^>]+>", "", content_text)
        content_text = re.sub("\n", "", content_text)
        with open(file_name, 'a', encoding='utf-8') as out:  # append
            out.write(content_text)

# Crawl Baidu News search results: Chinese search, first 10 pages, url key=keyword.
def search(key_word):
    """Crawl Baidu News results for *key_word* (up to 10 pages, 120 URLs).

    For each result not yet recorded in ``visited-cn.txt``: writes
    title/source/time/link to ``newscn\\N.txt``, appends the article body via
    ``extract_news_content``, and records the link so reruns skip it.

    Bug fixes vs. the original (which claimed Python 3.5):
    - ``urllib.urlopen`` for the next page does not exist on Python 3.
    - Bytes were written to a text-mode file (``f.write(x.encode(...))``)
      and str methods were called on the bytes from ``renderContents()``;
      everything is decoded once up front now.
    - ``visited-cn.txt`` opened with ``'r'`` crashed when absent on a first
      run; now treated as an empty history.
    - Visited comparison used ``link == item`` where ``item`` keeps its
      trailing newline from ``readlines()``, so nothing ever matched.
    """
    search_url = 'http://news.baidu.com/ns?word=key_word&tn=news&from=news&cl=2&rn=20&ct=1'
    req = urllib.request.urlopen(search_url.replace('key_word', key_word))
    real_visited = 0
    for count in range(10):  # first 10 result pages
        html = req.read()
        soup = BeautifulSoup(html)
        results = soup.findAll("li", {"class": "result"})  # ResultSet; empty if Baidu changed markup or blocked us
        visited_list = []
        for result in results:
            # Title, source, time, and URL for one news hit.
            link_tag = result.find('a')
            if link_tag is None:
                continue
            title = link_tag.renderContents().decode('utf-8', 'ignore')
            title = re.sub("<[^>]+>", "", title)
            link = str(link_tag.get("href"))
            # Persistent history of crawled URLs; may not exist on first run.
            try:
                with open(r'visited-cn.txt', 'r') as visited_file:
                    visited_list = visited_file.readlines()
            except IOError:
                visited_list = []
            if (link + '\n') not in visited_list:  # not crawled before
                meta = result.find('p').renderContents().decode('utf-8', 'ignore')
                sep = meta.find("&nbsp;&nbsp")  # source and time are separated by this entity pair
                author = meta[:sep]
                news_time = meta[sep + len("&nbsp;&nbsp") + 1:]
                real_visited += 1
                file_name = r"newscn\%d.txt" % (real_visited)  # one txt per article
                with open(file_name, 'w', encoding='utf-8') as out:
                    out.write(title)
                    out.write('\n')
                    out.write(author)
                    out.write('\n')
                    out.write(news_time)
                    out.write('\n' + link + '\n')
                extract_news_content(link, file_name)  # append the article body
                visited_list.append(link + '\n')
                # Record immediately so progress survives a crash/stop.
                with open(r'visited-cn.txt', 'a') as visited_file:
                    visited_file.write(link + '\n')
            if len(visited_list) >= 120:
                break
        # Follow the pagination link ("next page"); on the first page the
        # nav bar has no "previous", so the next link is at a different index.
        next_num = 0 if count == 0 else 1
        next_page = 'http://news.baidu.com' + soup('a', {'href': True, 'class': 'n'})[next_num]['href']
        print(next_page)
        req = urllib.request.urlopen(next_page)

if __name__=='__main__':
    # Hard-coded query; swap in input('input key word:') for interactive use.
    search('helloworld')


  • 点赞
  • 回答
  • 收藏
  • 复制链接分享

2条回答