m0_46585205 2021-11-23 10:33 采纳率: 16.7%
浏览 30
已结题

保存到文件后乱码,而且没有那么多条数据

问题遇到的现象和发生背景

原地址:https://mp.weixin.qq.com/s/54dcJO2KR9aMTQJh5OlSzg
爬取数据时控制台输出没有问题,可是保存到文件时就出问题。

问题相关代码,请勿粘贴截图
import requests
import requests
import csv
import time
import json
import requests
from lxml import etree
from pymongo import MongoClient
from bs4 import BeautifulSoup
import pandas as pd

def getOnepage(url, timeout=10):
    """Request *url* with browser-like headers and return the parsed JSON body.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        Whatever ``response.json()`` decodes from the response body.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status.
        requests.exceptions.Timeout: No answer arrived within *timeout*.
        ValueError: The body is not valid JSON.
    """
    # NOTE(review): the Cookie below looks like a personal browser session
    # copied verbatim; consider loading it from configuration instead of
    # keeping it in source.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'Cookie': 'bid=NX8M2NI7rfg; douban-fav-remind=1; __yadk_uid=TEJSv3vlFpxrnShgBGXWW51qExiqLCiD; __gads=ID=35ed214a9a0f04f3-229b40ae8cc5003e:T=1609826897:RT=1609826897:S=ALNI_MbjrOjiMxJC6bra_BWqa1z6LwJvFA; ll="118267"; viewed="1007305"; gr_user_id=38f71f7e-49a3-4463-a3e9-dd3f77625dad; _ga=GA1.2.1355032985.1609826898; _vwo_uuid_v2=D6A8A09C6232AC6A436C3775284DBE348|dadd79d0f01552308d196454329600a7; dbcl2="247324733:YcdnDsCblB0"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.24732; ck=MMTN; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1637395735%2C%22https%3A%2F%2Fwww.gameres.com%2F%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.1355032985.1609826898.1637309781.1637395736.17; __utmc=30149280; __utmz=30149280.1637395736.17.14.utmcsr=gameres.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; _pk_id.100001.8cb4=b87f2f239f7dd7c8.1609826897.15.1637395743.1637309781.; __utmb=30149280.4.10.1637395736',
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail with a clear HTTP error rather than an opaque JSON decode error
    # when the server rejects the request.
    response.raise_for_status()
    return response.json()

# 解析一页的数据
def parseOnepage(res):
    """Flatten the entries of ``res['games']`` into a list of row dicts.

    Each row carries the keys ``name``, ``star``, ``rating``, ``platforms``,
    ``n_ratings``, ``genres`` and ``content`` (the latter taken from the
    game's ``review`` entry). Every row is printed as it is built.

    Args:
        res: Mapping with a ``'games'`` sequence of per-game mappings.

    Returns:
        A list with one dict per game, in the order they appear in *res*.
    """
    rows = []
    for game in res['games']:
        row = {
            'name': game['title'],
            'star': game['star'],
            'rating': game['rating'],
            'platforms': game['platforms'],
            'n_ratings': game['n_ratings'],
            'genres': game['genres'],
            'content': game['review']['content'],
        }
        print(row)
        rows.append(row)
    return rows
def savaData(item):
    """Append parsed rows to ``douban.csv``, writing the header only once.

    The file is opened in append mode so that calling this once per page
    accumulates rows instead of overwriting the previous page. It is encoded
    as ``utf-8-sig`` (UTF-8 with a BOM) so spreadsheet programs such as Excel
    detect the encoding instead of showing garbled text.

    Note that rows are also appended to a ``douban.csv`` left over from an
    earlier run; delete the file first for a fresh export.

    Args:
        item: Iterable of dicts whose keys are the CSV field names.
    """
    fieldnames = ['name', 'star', 'rating', 'platforms', 'n_ratings',
                  'genres', 'content']
    with open('douban.csv', 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        # In append mode the position starts at the end of the file, so a
        # position of 0 means the file is new or empty and needs a header.
        if f.tell() == 0:
            writer.writeheader()
        writer.writerows(item)
def main():
    """Fetch result pages 1 through 99 and save the parsed rows of each."""
    base_url = "https://www.douban.com/j/ilmen/game/search?genres=&platforms=&q=&sort=rating&more="
    for page in range(1, 100):
        page_json = getOnepage(base_url + str(page))
        rows = parseOnepage(page_json)
        savaData(rows)
if __name__ == '__main__': # run main() only when executed as a script
    main()

运行结果及报错内容

img

img

我的解答思路和尝试过的方法
我想要达到的结果
  • 写回答

3条回答 默认 最新

  • 辉煌仪奇 2021-11-23 11:21
    关注

    你这是爬取了多少个页面呀,我跑了好久,记得采纳
    我去掉了提取数据中多余的回车,并处理了乱码现象。另外,提取的数据里有些字符是特殊字符,如果你想要保留并显示它们,建议换一种方法存数据。我还修改了存文件的函数,具体改了哪些地方你可以对照代码再看看。

    import requests
    
    
    def getOnepage(url, timeout=10):
        """Request *url* with browser-like headers and return the parsed JSON body.

        Args:
            url: Address to fetch.
            timeout: Seconds to wait for the server. Without a timeout
                ``requests`` can block indefinitely on a stalled connection.

        Returns:
            Whatever ``response.json()`` decodes from the response body.

        Raises:
            requests.exceptions.HTTPError: The server answered with a 4xx/5xx
                status.
            requests.exceptions.Timeout: No answer arrived within *timeout*.
            ValueError: The body is not valid JSON.
        """
        # NOTE(review): the Cookie below looks like a personal browser session
        # copied verbatim; consider loading it from configuration instead of
        # keeping it in source.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
            'Cookie': 'bid=NX8M2NI7rfg; douban-fav-remind=1; __yadk_uid=TEJSv3vlFpxrnShgBGXWW51qExiqLCiD; __gads=ID=35ed214a9a0f04f3-229b40ae8cc5003e:T=1609826897:RT=1609826897:S=ALNI_MbjrOjiMxJC6bra_BWqa1z6LwJvFA; ll="118267"; viewed="1007305"; gr_user_id=38f71f7e-49a3-4463-a3e9-dd3f77625dad; _ga=GA1.2.1355032985.1609826898; _vwo_uuid_v2=D6A8A09C6232AC6A436C3775284DBE348|dadd79d0f01552308d196454329600a7; dbcl2="247324733:YcdnDsCblB0"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.24732; ck=MMTN; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1637395735%2C%22https%3A%2F%2Fwww.gameres.com%2F%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.1355032985.1609826898.1637309781.1637395736.17; __utmc=30149280; __utmz=30149280.1637395736.17.14.utmcsr=gameres.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; _pk_id.100001.8cb4=b87f2f239f7dd7c8.1609826897.15.1637395743.1637309781.; __utmb=30149280.4.10.1637395736',
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        # Fail with a clear HTTP error rather than an opaque JSON decode error
        # when the server rejects the request.
        response.raise_for_status()
        return response.json()
    
    
    # 解析一页的数据
    def parseOnepage(res):
        """Turn the entries of ``res['games']`` into a list of all-string rows.

        Every text field has ``'\\r\\n'`` sequences removed; ``n_ratings`` is
        converted with ``str()``. Rows carry the keys ``name``, ``star``,
        ``rating``, ``platforms``, ``n_ratings``, ``genres`` and ``content``,
        in that order.

        Args:
            res: Mapping with a ``'games'`` sequence of per-game mappings.

        Returns:
            A list with one dict per game, in the order they appear in *res*.
        """
        def _strip_crlf(text):
            # Only the two-character CRLF sequence is removed.
            return text.replace('\r\n', '')

        rows = []
        for game in res['games']:
            rows.append({
                'name': _strip_crlf(game['title']),
                'star': _strip_crlf(game['star']),
                'rating': _strip_crlf(game['rating']),
                'platforms': _strip_crlf(game['platforms']),
                'n_ratings': str(game['n_ratings']),
                'genres': _strip_crlf(game['genres']),
                'content': _strip_crlf(game['review']['content']),
            })
        return rows
    
    
    def savaData(item):
        """Append rows to ``douban.csv`` as properly quoted, UTF-8 encoded CSV.

        The header is written only when the file is new or empty. Rows are
        written with the ``csv`` module, so fields containing commas, quotes
        or line breaks stay in a single column. The file uses ``utf-8-sig``
        (UTF-8 with a BOM) so every character can be stored and spreadsheet
        programs such as Excel detect the encoding. Write errors propagate to
        the caller instead of silently dropping the row.

        Args:
            item: Iterable of dicts whose keys are the CSV field names.
        """
        import csv  # imported locally so the function is self-contained

        fieldnames = ['name', 'star', 'rating', 'platforms', 'n_ratings',
                      'genres', 'content']
        with open('douban.csv', 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            # In append mode the position starts at the end of the file, so
            # a position of 0 means the file is new or empty.
            if f.tell() == 0:
                writer.writeheader()
            writer.writerows(item)
    
    
    def main():
        """Fetch result pages 1 through 99 and save the parsed rows of each."""
        for page in range(1, 100):
            page_url = f"https://www.douban.com/j/ilmen/game/search?genres=&platforms=&q=&sort=rating&more={page}"
            page_json = getOnepage(page_url)
            rows = parseOnepage(page_json)
            savaData(rows)
    
    
    if __name__ == '__main__':  # run main() only when executed as a script
        main()
    
    

    img

    img

    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论
查看更多回答(2条)

报告相同问题?

问题事件

  • 已结题 (查看结题原因) 11月23日
  • 已采纳回答 11月23日
  • 修改了问题 11月23日
  • 创建了问题 11月23日

悬赏问题

  • ¥100 支付宝网页转账系统不识别账号
  • ¥15 基于单片机的靶位控制系统
  • ¥15 AT89C51控制8位八段数码管显示时钟。
  • ¥15 真我手机蓝牙传输进度消息被关闭了,怎么打开?(关键词-消息通知)
  • ¥15 下图接收小电路,谁知道原理
  • ¥15 装 pytorch 的时候出了好多问题,遇到这种情况怎么处理?
  • ¥20 IOS游览器某宝手机网页版自动立即购买JavaScript脚本
  • ¥15 手机接入宽带网线,如何释放宽带全部速度
  • ¥30 关于#r语言#的问题:如何对R语言中mfgarch包中构建的garch-midas模型进行样本内长期波动率预测和样本外长期波动率预测
  • ¥15 ETLCloud 处理json多层级问题