Scootchen_CSDN 2022-10-27 20:34 采纳率: 50%
浏览 30

未使用 import语句from lxml import etree ?

之前都没什么问题，突然就报一个「未使用 import 语句 from lxml import etree」的错误，这是为什么呢？有没有朋友帮忙看看哪里错了？

/Users/chenyuhui/Desktop/截屏2022-10-27 20.33.17.png

import requests
import re
from lxml import etree
import csv
import time
import asyncio
import aiohttp
import logging
import pandas as pd
import numpy as np
import random
import ssl
import certifi

# TLS context that trusts the certifi CA bundle; passed to aiohttp via ssl=.
ssl_context = ssl.create_default_context()
ssl_context.load_verify_locations(certifi.where())

# Douban movie short-comments entry page; the subject id is embedded in the path.
URL1='https://movie.douban.com/subject/35131346/comments?status=P'
# Output CSV name template; {index} is filled with the subject id parsed from URL1.
file_name='短评-{index}.csv'
# Request headers with a logged-in session cookie (required by Douban to view comments).
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
    'cookie': 'll="108298"; bid=bQ20QaMu-Hc; push_noty_num=0; push_doumail_num=0; __utmv=30149280.12857; ct=y; gr_user_id=15663cd4-0204-441c-af43-e8b1a7d2d5f2; __gads=ID=4ed9362f7037ff69-22a328ce7fd70077:T=1666667493:RT=1666667493:S=ALNI_Mb0n4LajrFJVVuT5rLrXwGQO4aPfw; __utmz=30149280.1666760296.17.6.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/128572766/; ap_v=0,6.0; __gpi=UID=00000b6c11ffabc2:T=1666667493:RT=1666831710:S=ALNI_MbUWI8jPIZJO0A34AwiFDfTsDqfyQ; __utma=30149280.1432797394.1664965576.1666830229.1666834156.23; __utmc=30149280; __utmt=1; dbcl2="128572766:obLNF0XC3Tw"; ck=61Xh; __utmb=30149280.17.10.1666834156'
}

# Basic logging configuration: INFO level with timestamped messages.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')
# Wall-clock start time; the elapsed time is printed at the end of the script.
start=time.time()

# URL1[0:51] keeps 'https://movie.douban.com/subject/35131346/comments?' (51 chars),
# so URL2 is the paginated comments URL with a {start} offset placeholder.
URL2=URL1[0:51]+'start={start}&limit=20&status=P&sort=new_score'
# Template for a commenter's profile page URL.
URL3='https://www.douban.com/people/{authorid}/'

# Cap the number of simultaneous requests with an asyncio semaphore.
CONCURRENCY = 5
semaphore = asyncio.Semaphore(CONCURRENCY)

# Create/overwrite the per-subject CSV and write the header row.
# newline='' is required by the csv module for file objects passed to
# csv.writer — without it Windows emits a blank line between every row.
with open(file_name.format(index=re.findall('subject/(.*?)/comments', URL1)[0]),
          'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['序号','评论者', '评论', 'IP', '常驻地','有用量','网址'])

# Load the proxy pool from the spreadsheet's IPPORTS column.
# pd.read_excel already returns a DataFrame; the original pd.DataFrame(...)
# wrapper was a redundant copy.
df = pd.read_excel('./代理IP.xlsx')
IPPORTS = list(df.loc[:, "IPPORTS"])
# Deduplicated proxy list. The original comprehension
# [IPPORTS for IPPORT in IPPORTS] (note the typo) built n copies of the whole
# list before np.unique flattened it — same result, O(n^2) memory; this is the
# intended O(n) form.
proxys = np.unique(IPPORTS)


async def scrape_api(url):
    """Fetch *url* through the shared aiohttp session, bounded by ``semaphore``.

    A random proxy from the module-level ``proxys`` pool is used per request.
    Returns the response body as text, or ``None`` on a client error
    (errors are logged with traceback, not raised).
    """
    async with semaphore:
        try:
            logging.info('scraping %s', url)
            proxy = random.choice(proxys)
            # BUG FIX: ``session.proxies = proxies`` is a ``requests`` idiom;
            # aiohttp's ClientSession has no ``proxies`` attribute, so the
            # original assignment was silently ignored and every request went
            # out without a proxy. aiohttp takes the proxy per request via the
            # ``proxy=`` keyword argument.
            async with session.get(url, headers=headers,
                                   proxy='http://' + proxy,
                                   ssl=ssl_context) as response:
                return await response.text()

        except aiohttp.ClientError:
            logging.error('error occurred while scraping %s', url, exc_info=True)

async def scrape_index(page):
    """Fetch one comments-list page; *page* is the ``start`` offset for URL2."""
    return await scrape_api(URL2.format(start=page))

async def scrape_detail(authorid):
    """Fetch a commenter's profile page identified by *authorid*."""
    return await scrape_api(URL3.format(authorid=authorid))

async def main():
    """Entry coroutine: scrape Douban short comments and append them to the CSV.

    Side effects: creates a module-global aiohttp ClientSession (never closed —
    NOTE(review): consider ``async with aiohttp.ClientSession()``), performs a
    blocking ``requests.get`` inside the event loop, and appends one CSV row
    per parsed comment.
    """
    global session
    session = aiohttp.ClientSession()
    # Blocking fetch of the first page, used only to read the total review count.
    r1=requests.get(URL1,headers=headers)
    web1=r1.text
    # print(r1.status_code)
    print(web1)
    # Total number of comments, parsed from the "看过(N)" tab label in the HTML.
    review_counts=int(re.findall('看过\((.*?)\)',web1)[0])
    print("共{}条短评".format(review_counts))
    # NOTE(review): ``range(0, review_counts//20, 20)`` mixes "number of pages"
    # with "start offset" and only covers a fraction of the comments;
    # ``range(0, review_counts, 20)`` was probably intended — confirm against
    # URL2's start/limit parameters.
    scrape_index_tasks = [asyncio.ensure_future(scrape_index(page)) for page in range(0,review_counts//20,20)]
    results = await asyncio.gather(*scrape_index_tasks)
    for page,web2 in enumerate(results):
        # NOTE(review): the two bare '(.*?)' patterns below have no surrounding
        # literals and can only ever match empty strings — the HTML tag context
        # appears to have been stripped when this code was pasted; restore the
        # original patterns before running.
        reviews=re.findall('(.*?)', web2)
        youyongs=re.findall('(.*?)', web2)
        authorids=re.findall('https://www.douban.com/people/(.*?)/" class="">',web2)
        authors = re.findall('class="">(.*?)', web2)

        for i,[review,youyong,author,authorid] in enumerate(zip(reviews,youyongs,authors,authorids)):

            # Append one row per comment; the 'IP' and '常驻地' columns are
            # placeholders ("/") — the profile-page scrape is not wired in yet.
            with open(file_name.format(index=re.findall('subject/(.*?)/comments', URL1)[0]), 'a',encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow([i+1+page*20, author, review,"/","/",youyong,URL3.format(authorid=authorid)])


if __name__ == '__main__':
    # asyncio.run() creates, runs, and closes an event loop in one call;
    # asyncio.get_event_loop() outside a running loop is deprecated since
    # Python 3.10 and run_until_complete leaves the loop open.
    asyncio.run(main())

    # Report total elapsed wall-clock time (``start`` is set at import time).
    end = time.time()
    print("用时{}s".format(end - start))

















  • 写回答

1条回答 默认 最新

  • CSDN-Ada助手 CSDN-AI 官方账号 2022-10-27 22:48
    关注
    评论

报告相同问题?

问题事件

  • 创建了问题 10月27日

悬赏问题

  • ¥15 传染病最优控制问题,控制为什么没起作用
  • ¥15 请问一个软件连接不上服务器了怎么办呀
  • ¥15 ue5.3 pico打包失败 求帮助
  • ¥15 请教,这是用chatgpt写的CAD LISPD脚本,需求是画一个由双直线组成的矩形
  • ¥50 微信小程序 成功包奶茶啊
  • ¥15 计算机博弈的六子棋代码
  • ¥15 在Fetch API中传送的参数为何不起作用?
  • ¥15 问题遇到的现象和发生背景 360导航页面千次ip是20元,但是我们是刷量的 超过100ip就不算量了,假量超过100就不算了 这是什么逻辑呢 有没有人能懂的 1000元红包感谢费
  • ¥30 计算机硬件实验报告寻代
  • ¥15 51单片机写代码,要求是图片上的要求,请大家积极参与,设计一个时钟,时间从12:00开始计时,液晶屏第一行显示time,第二行显示时间