Hi everyone~
The crawlers I wrote before each targeted one specific site, such as Sina Weibo. This time I need to search the whole web for sentences or articles related to "开灯" (turning on the light), but soup.findAll("li", {"class": "result"}) always comes back empty. Any help appreciated, thanks.
PS: I tried other examples from the web too, and they all fail at this same spot with an empty result...
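A minimal way to check whether the markup is even present in the raw HTML the crawler receives (a sketch only; it issues the same search request my code makes, with the keyword URL-encoded):

import urllib.request
import urllib.parse

# Fetch the search page exactly as the crawler would and grep the raw HTML.
url = 'http://news.baidu.com/ns?word=%s&tn=news&from=news&cl=2&rn=20&ct=1' % urllib.parse.quote('开灯')
html = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
print('class="result"' in html)  # False means the nodes never reach the crawler

If this prints False, the problem is not BeautifulSoup: the li.result nodes are simply not in the HTML the server returns.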
The full code follows:
# -*- coding: utf-8 -*-
__author__ = 'Daniel'
# Python 3.5
# Crawl news about "开灯" (turning on the light)
import re
import urllib.request
import urllib.parse
import chardet
from bs4 import BeautifulSoup
def remove_js_css(content):
    """Strip <script>, <style>, HTML comments, <meta> and <ins> blocks."""
    r = re.compile(r'''<script.*?</script>''', re.I | re.M | re.S)
    s = r.sub('', content)
    r = re.compile(r'''<style.*?</style>''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<!--.*?-->''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<meta.*?>''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<ins.*?</ins>''', re.I | re.M | re.S)
    s = r.sub('', s)
    return s
def remove_empty_line(content):
    """Collapse blank lines and runs of newlines."""
    r = re.compile(r'''^\s+$''', re.M | re.S)
    s = r.sub('', content)
    r = re.compile(r'''\n+''', re.M | re.S)
    s = r.sub('\n', s)
    return s
def remove_any_tag(s):
    """Strip every HTML tag and surrounding whitespace."""
    s = re.sub(r'''<[^>]+>''', '', s)
    return s.strip()

def remove_any_tag_but_a(s):
    """Return (length of text inside <a> tags, length of all text)."""
    text = re.findall(r'''<a[^r][^>]*>(.*?)</a>''', s, re.I | re.S)
    text_b = remove_any_tag(s)
    return len(''.join(text)), len(text_b)
def remove_image(s, n=50):
    """Replace each <img> tag with n filler characters."""
    image = 'a' * n
    r = re.compile(r'''<img.*?>''', re.I | re.M | re.S)
    s = r.sub(image, s)
    return s

def remove_video(s, n=1000):
    """Replace each <embed> tag with n filler characters."""
    video = 'a' * n
    r = re.compile(r'''<embed.*?>''', re.I | re.M | re.S)
    s = r.sub(video, s)
    return s
def sum_max(values):
    """Maximum-sum contiguous segment (Kadane-style); returns (left, right + 1)."""
    cur_max = values[0]
    glo_max = -99999
    left, right = 0, 0
    for index, value in enumerate(values):
        cur_max += value
        if cur_max > glo_max:
            glo_max = cur_max
            right = index
        elif cur_max < 0:
            cur_max = 0
    for i in range(right, -1, -1):
        glo_max -= values[i]
        if abs(glo_max) < 0.0001:  # was: abs(glo_max < 0.0001), a bug
            left = i
            break
    return left, right + 1
def method_1(content, k=1):
    """Score groups of k lines by (plain-text length - link-text length)
    and find the densest run, which should be the article body."""
    if not content:
        return None, None, None, None
    tmp = content.split('\n')
    group_value = []
    for i in range(0, len(tmp), k):
        group = '\n'.join(tmp[i:i + k])
        group = remove_image(group)
        group = remove_video(group)
        text_a, text_b = remove_any_tag_but_a(group)
        temp = (text_b - text_a) - 8
        group_value.append(temp)  # was: group_value.append(tmp), a bug
    left, right = sum_max(group_value)
    return left, right, len('\n'.join(tmp[:left])), len('\n'.join(tmp[:right]))
def extract(content):
    """Extract the main text block from a news page's HTML."""
    content = remove_empty_line(remove_js_css(content))
    left, right, x, y = method_1(content)
    return '\n'.join(content.split('\n')[left:right])
# Given a URL, extract the news page's body text and append it to a txt file.
def extract_news_content(web_url, file_name):
    request = urllib.request.Request(web_url)
    # Add a User-Agent header so the request looks like a browser.
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    opener = urllib.request.build_opener()
    html = opener.open(request).read()
    infoencode = chardet.detect(html)['encoding']  # detect the page encoding with a third-party module
    if html is not None and infoencode is not None:  # skip empty content or unknown encoding
        html = html.decode(infoencode, 'ignore')
        soup = BeautifulSoup(html, 'html.parser')
        content = soup.decode_contents()
        content_text = extract(content)  # extract the body text as one unbroken passage
        content_text = re.sub("&nbsp;", " ", content_text)
        content_text = re.sub("&gt;", "", content_text)
        content_text = re.sub("&quot;", '"', content_text)
        content_text = re.sub("<[^>]+>", "", content_text)
        content_text = re.sub("\n", "", content_text)
        with open(file_name, 'a', encoding='utf-8') as f:  # append
            f.write(content_text)
# Scrape Baidu News search results: Chinese search, first 10 pages; key_word is substituted into the url
def search(key_word):
    search_url = 'http://news.baidu.com/ns?word=key_word&tn=news&from=news&cl=2&rn=20&ct=1'
    # URL-encode the keyword, otherwise a Chinese query breaks the request
    req = urllib.request.urlopen(search_url.replace('key_word', urllib.parse.quote(key_word)))
    real_visited = 0
    for count in range(10):  # first 10 pages
        html = req.read()
        soup = BeautifulSoup(html, 'html.parser')
        content = soup.findAll("li", {"class": "result"})  # ResultSet object
        num = len(content)
        for i in range(num):
            # parse out each result's title, source, time and url first
            p_str = content[i].find('a')  # if there is no result this is a NoneType
            contenttitle = p_str.decode_contents()
            contenttitle = re.sub("<[^>]+>", "", contenttitle)
            contentlink = str(p_str.get("href"))
            # compare against the urls that were already crawled successfully
            try:
                with open(r'visited-cn.txt', 'r', encoding='utf-8') as visited_url:
                    visited_url_list = visited_url.readlines()
            except FileNotFoundError:
                visited_url_list = []
            exist = 0
            for item in visited_url_list:
                if contentlink == item.strip():
                    exist = 1
            if exist != 1:  # url not visited yet
                p_str2 = content[i].find('p').decode_contents()
                contentauthor = p_str2[:p_str2.find("  ")]  # source
                contenttime = p_str2[p_str2.find("  ") + len("  ") + 1:]  # time
                # one txt file per news item; the newscn directory must already exist
                real_visited += 1
                file_name = r"newscn\%d.txt" % (real_visited)
                with open(file_name, 'w', encoding='utf-8') as f:
                    f.write(contenttitle + '\n')
                    f.write(contentauthor + '\n')
                    f.write(contenttime + '\n')
                    f.write(contentlink + '\n')
                extract_news_content(contentlink, file_name)  # append the body text too
                visited_url_list.append(contentlink)
                # mark as visited on disk so the list survives a restart
                with open(r'visited-cn.txt', 'a', encoding='utf-8') as visited_url:
                    visited_url.write(contentlink + '\n')
            if len(visited_url_list) >= 120:
                break
        # parse the next page
        if count == 0:
            next_num = 0
        else:
            next_num = 1
        next_page = 'http://news.baidu.com' + soup('a', {'href': True, 'class': 'n'})[next_num]['href']  # pager link
        print(next_page)
        req = urllib.request.urlopen(next_page)
if __name__ == '__main__':
    # key_word = input('input key word:')
    key_word = 'helloworld'
    search(key_word)
2 answers:
- Go 旅城通票 2017-05-04 12:02
Sina Weibo loads its data dynamically via Ajax, so the displayed content is not in the page source. What a crawler receives is exactly what "view source" shows, which means content generated by Ajax/JS cannot be captured by a plain crawler (it is also bad for SEO).
You need to find Sina's Ajax data endpoint and request that interface directly for the data, instead of the URL shown in the address bar.
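A minimal sketch of that approach, assuming a hypothetical JSON endpoint (find the real path and parameters in the browser's F12 network panel; api_url and the response shape below are made up for illustration):

import json
import urllib.request

# Hypothetical Ajax endpoint, e.g. spotted in the network panel while the page loads.
api_url = 'http://example.com/api/search?q=%E5%BC%80%E7%81%AF&page=1'

req = urllib.request.Request(api_url)
req.add_header('User-Agent', 'Mozilla/5.0')  # some endpoints reject the default Python UA
with urllib.request.urlopen(req) as resp:
    data = json.loads(resp.read().decode('utf-8'))

# Walk whatever structure the endpoint actually returns; this shape is an assumption.
for item in data.get('results', []):
    print(item.get('title'), item.get('url'))

Since such an endpoint returns JSON, there is no HTML for BeautifulSoup to parse at all, which is exactly why findAll comes back empty on a JS-rendered page.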