weixin_45243233 2020-04-30 12:42 采纳率: 0%
浏览 191
已结题

Python 的返回值 news_detail4 无法被 get_equal_rate_1 当作字符串处理,请问如何才能让 news_detail4 也参与比较?

我这里是先获取新闻网页内容,然后进行比较,前三个爬取返回值可以进行比较,第四个不行,请问该怎么办?
import difflib
from xml.etree.ElementTree import tostring

import requests
from lxml import etree
import time
from gne import GeneralNewsExtractor
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

def get_chinanew_data():
    """Fetch one fixed chinanews.com article and return its fields as a dict.

    Returns:
        dict: keys ``pub_time``, ``author``, ``title``, ``content``,
        ``site_url`` and ``site_name``.

    Raises:
        requests.RequestException: on a network failure or timeout.
        IndexError: if an XPath query matches nothing (e.g. the page layout
            changed), because each result list is indexed with ``[0]``.
    """
    # Single source of truth for the article URL (requested and reported).
    site_url = 'http://www.chinanews.com/gn/2020/04-20/9162019.shtml'
    site_name = '中国新闻网'

    cookies = {
        'Hm_lvt_0da10fbf73cda14a786cd75b91f6beab': '1587367903',
        'Hm_lpvt_0da10fbf73cda14a786cd75b91f6beab': '1587375545',
    }

    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }

    # Without a timeout requests may block indefinitely on a stalled server.
    response = requests.get(
        site_url,
        headers=headers,
        cookies=cookies,
        verify=False,
        timeout=10,
    )
    # Undecodable bytes are dropped rather than raising UnicodeDecodeError.
    html = response.content.decode(errors='ignore')
    etree_html = etree.HTML(html)

    main = etree_html.xpath('//div[@id="cont_1_1_2"]')[0]
    title = main.xpath('./h1/text()')[0]
    pub_time = main.xpath(".//div[3]/div[@class='left-t']/text()")[0]
    # Drop the last two characters, then keep the segment after the first ':'.
    author = main.xpath('./div[5]/div[2]/div/span/text()')[0][:-2].split(':')[1]

    # Keep only the first two whitespace-separated tokens of the raw text
    # (presumably date and time - verify against the live page).
    pub_time_parts = pub_time.split()
    pubtime = pub_time_parts[0] + ' ' + pub_time_parts[1]

    content = ''.join(main.xpath('./div[@class="left_zw"]/p/text()')).strip()

    news_detail = {
        # Rewrite the Chinese date markers into a dash-separated date.
        'pub_time': pubtime.replace('年', '-').replace('月', '-').replace('日', ''),
        'author': author,
        'title': title,
        # Remove ideographic spaces (U+3000) from the body text.
        'content': content.replace('\u3000', ''),
        'site_url': site_url,
        'site_name': site_name,
    }
    return news_detail

def selenium_download_data():
    """Scrape three fixed news pages with headless Chrome and GNE.

    Each page is loaded in the browser, its rendered source is passed to
    ``GeneralNewsExtractor.extract`` and the result is normalised into a dict
    with the keys ``pub_time``, ``author``, ``title``, ``content``,
    ``site_url`` and ``site_name``.

    Returns:
        tuple: ``(news_detail2, news_detail3, news_detail4)`` for the Sina,
        ifeng and Baidu articles, in that order.
    """
    # (url, site_name) pairs, in the order the results are returned.
    sources = [
        ('https://news.sina.com.cn/gov/xlxw/2020-04-20/doc-iircuyvh8766402.shtml',
         '新浪新闻'),
        ('https://news.ifeng.com/c/7vovtvQ2gVc',
         '凤凰网新闻'),
        ('https://baijiahao.baidu.com/s?id=1664460259411900230&wfr=spider&for=pc',
         '百度新闻'),
    ]

    options = Options()
    options.add_argument('--headless')
    driver = Chrome(options=options,executable_path=r"C:\Users\常乐添\AppData\Local\Google\Chrome\Application\chromedriver.exe")

    details = []
    try:
        for url, site_name in sources:
            driver.get(url)
            # Fixed pause before reading the page source (presumably to let
            # script-rendered content load - confirm it is long enough).
            time.sleep(3)
            extractor = GeneralNewsExtractor()
            result = extractor.extract(driver.page_source)
            details.append({
                'pub_time': result['publish_time'].replace('T', ' '),
                'author': result['author'],
                'title': result['title'],
                'content': result['content'].replace('\n', ''),
                'site_url': url,
                'site_name': site_name,
            })
    finally:
        # Always shut the browser down, even if a page load or extraction
        # fails; otherwise the headless Chrome process is left running.
        driver.quit()

    news_detail2, news_detail3, news_detail4 = details
    return news_detail2, news_detail3, news_detail4

def get_all_data():
    """Collect all four scraped articles.

    Calls ``get_chinanew_data`` first, then ``selenium_download_data``.

    Returns:
        tuple: four news-detail dicts, the chinanews one first, followed by
        the three returned by ``selenium_download_data`` in their order.
    """
    chinanews_detail = get_chinanew_data()
    sina_detail, ifeng_detail, baidu_detail = selenium_download_data()
    return chinanews_detail, sina_detail, ifeng_detail, baidu_detail

def get_equal_rate_1(str1, str2):
    """Return a fast similarity estimate for two sequences, in [0, 1].

    Uses ``difflib.SequenceMatcher.quick_ratio``, which is an upper bound on
    ``ratio()``: it can report a higher similarity than the real one (for
    example, reordered characters still score 1.0).

    Args:
        str1: first sequence (normally a ``str``).
        str2: second sequence (normally a ``str``).

    Returns:
        float: 1.0 for identical inputs, 0.0 when nothing is shared.
    """
    matcher = difflib.SequenceMatcher(a=str1, b=str2)
    return matcher.quick_ratio()

if __name__ == '__main__':
    # Manual smoke test: scrape all four articles and print the result.
    # The guard must test the module attribute ``__name__``; a bare ``name``
    # is undefined and raises NameError.
    print(get_all_data())
图片说明
这是显示的结果:目前只有前三个返回值两两比较的结果。我希望显示 4 个返回值两两比较得到的全部 6 个结果。
def show(request):
    """Render ``show.html`` with pairwise content similarities of four news rows.

    Takes the first four ``News`` rows ordered by ``pub_time`` and computes
    ``get_equal_rate_1`` for each of the six pairs.

    Args:
        request: the incoming request object, passed through to ``render``.

    Returns:
        The response produced by ``render``.

    Raises:
        IndexError: if fewer than four ``News`` rows exist, since the rows
            are fetched by index.
    """
    news_list = News.objects.order_by('pub_time').all()[:4]
    news1 = news_list[0]
    news2 = news_list[1]
    news3 = news_list[2]
    news4 = news_list[3]

    # Explicit template context. The keys are exactly the names that
    # ``locals()`` exposed before, so the template sees the same variables.
    context = {
        'request': request,
        'news_list': news_list,
        'news1': news1,
        'news2': news2,
        'news3': news3,
        'news4': news4,
        's1_s2': get_equal_rate_1(news1.content, news2.content),
        's2_s3': get_equal_rate_1(news2.content, news3.content),
        's1_s3': get_equal_rate_1(news1.content, news3.content),
        's1_s4': get_equal_rate_1(news1.content, news4.content),
        's2_s4': get_equal_rate_1(news2.content, news4.content),
        's3_s4': get_equal_rate_1(news3.content, news4.content),
    }
    return render(request, 'show.html', context)
这是展示结果部分的代码。
在show.html文件中只能显示出这个
图片说明
显示不出后面三项比较值

  • 写回答

1条回答 默认 最新

  • threenewbee 2020-04-30 13:34
    关注

    return difflib.SequenceMatcher(None, str1, str2).quick_ratio()
    ->
    return str(difflib.SequenceMatcher(None, str1, str2).quick_ratio())

    评论

报告相同问题?

悬赏问题

  • ¥15 Arcgis相交分析无法绘制一个或多个图形
  • ¥15 seatunnel-web使用SQL组件时候后台报错,无法找到表格
  • ¥15 fpga自动售货机数码管(相关搜索:数字时钟)
  • ¥15 用前端向数据库插入数据,通过debug发现数据能走到后端,但是放行之后就会提示错误
  • ¥30 3天&7天&&15天&销量如何统计同一行
  • ¥30 帮我写一段可以读取LD2450数据并计算距离的Arduino代码
  • ¥15 飞机曲面部件如机翼,壁板等具体的孔位模型
  • ¥15 vs2019中数据导出问题
  • ¥20 云服务Linux系统TCP-MSS值修改?
  • ¥20 关于#单片机#的问题:项目:使用模拟iic与ov2640通讯环境:F407问题:读取的ID号总是0xff,自己调了调发现在读从机数据时,SDA线上并未有信号变化(语言-c语言)