Black--Cat 2021-08-12 20:06 Acceptance rate: 50%

Python data scraping and processing question: write the code according to the comments!

Complete the code below according to the comments.

import requests
from lxml import etree
import time
import csv
import re

class Crawl(object):
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        }
        # Create a file data.csv to store the data. (1 point)
        # TODO: one plausible completion, assuming UTF-8 output and csv-friendly newlines:
        self.f = open('data.csv', 'w', encoding='utf-8', newline='')

        row = "NAME,SCORE,VOTES,RELEASE_DATE,REGIONS,GENRES,DIRECTOR,PPROTAGONIST\n"
        self.f.write(row)
        self.page = 0

    def parse(self, url):
        response = requests.get(url=url, headers=self.headers)
        parse_html = etree.HTML(response.text)

        # Check whether the page has data. (1 point)
        # TODO: one plausible XPath, assuming the Douban Top 250 list markup:
        li_list = parse_html.xpath('//ol[@class="grid_view"]/li')

        if len(li_list):
            self.parse_next(parse_html)

            # Build the URL for the next page. (1 point)
            # TODO: each page lists 25 movies, so advance `start` by 25:
            self.page += 25
            url = "https://movie.douban.com/top250?start={}&filter=".format(self.page)

            time.sleep(1)
            self.parse(url)
        else:
            return

    def parse_next(self, parse_html):
        # Parse the page and extract Title, RatingNum, Number, Info1. (4 points)
        # TODO: plausible XPaths, assuming the current Douban Top 250 markup:
        Title = parse_html.xpath('//div[@class="hd"]/a/span[1]/text()')
        RatingNum = parse_html.xpath('//span[@class="rating_num"]/text()')
        Number = parse_html.xpath('//div[@class="star"]/span[4]/text()')
        Info1 = parse_html.xpath('//div[@class="bd"]/p[1]/text()')

        # Each movie contributes two text nodes: a director/cast line and a
        # year/region/genre line; drop blank nodes, then pair them up.
        Info = [x.strip() for x in Info1 if x.strip() != '']
        Infos1 = [Info[i] for i in range(0, len(Info), 2)]
        Infos2 = [Info[i] for i in range(1, len(Info), 2)]

        for title, ratingNum, number, info1, info2 in zip(Title, RatingNum, Number, Infos1, Infos2):
            item = {}
            info_all = info2.strip().split('/')
            info_time, info_state = info_all[:2]
            info_type = info_all[-1]
            item['NAME'] = title

            # Clean up the data for item['SCORE']. (1 point)
            # TODO: the rating text only needs surrounding whitespace removed:
            item['SCORE'] = ratingNum.strip()

            item['VOTES'] = re.findall(r'(\d+)', number)[0]
            item['RELEASE_DATE'] = re.findall(r'(\d+)', info_time)[0]
            item['REGIONS'] = "".join(re.findall(r'([^\d()])', info_state)).strip()
            item['GENRES'] = info_type.strip()
            item['DIRECTOR'] = "".join(re.findall(r'导演:(.*)\xa0', info1)).strip()
            item['PPROTAGONIST'] = "".join(re.findall(r'主演:(.*)', info1))
            self.save_data(item)

    def save_data(self, item):
        f = csv.writer(self.f)
        f.writerow(item.values())

    def main(self):
        url = "https://movie.douban.com/top250?start={}&filter=".format(self.page)
        self.parse(url)
        self.f.close()
        print("\n===== Data collection complete =====")

if __name__ == "__main__":
    crawl = Crawl()
    crawl.main()
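
The pairing and regex logic in parse_next can be checked in isolation. Below is a minimal sanity check using illustrative strings shaped like the two text nodes Douban renders per movie; the sample values are assumptions for demonstration, not scraped output:

import re

# Illustrative text nodes (assumed shape: cast line, then year/region/genre line):
Info1 = [
    '\n 导演: 弗兰克·德拉邦特 Frank Darabont\xa0\xa0\xa0主演: 蒂姆·罗宾斯 Tim Robbins\n',
    '\n 1994\xa0/\xa0美国\xa0/\xa0犯罪 剧情\n',
]
Info = [x.strip() for x in Info1 if x.strip() != '']
info1, info2 = Info[0], Info[1]            # cast line, year/region/genre line
info_time, info_state = info2.split('/')[:2]
print(re.findall(r'(\d+)', info_time)[0])                     # -> 1994
print("".join(re.findall(r'([^\d()])', info_state)).strip())  # -> 美国
print("".join(re.findall(r'导演:(.*)\xa0', info1)).strip())    # -> 弗兰克·德拉邦特 Frank Darabont

Note that str.strip() with no arguments also removes the non-breaking spaces (\xa0) Douban uses as separators, which is why the greedy 导演:(.*)\xa0 capture cleans up correctly.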


1 answer (newest first)

  • m0_58995603 2021-08-12 22:55

    Private message me.
