大雄是个程序员 2019-06-02 16:24 采纳率: 0%
浏览 411

求助:Python 3.7 中 URL 含中文的问题

import json
import string
import time
import urllib
import urllib.parse
import urllib.request
from quopri import quote

# strftime pattern used for the timestamps the script prints while crawling.
ISOTIMEFORMAT = '%Y-%m-%d %X'

# Semicolon-separated output file; the first line is the column header.
outputFile = 'douban_movie.txt'
# Open explicitly as UTF-8: the records contain Chinese movie titles, and the
# platform default encoding is not guaranteed to be able to represent them.
# The handle is left open here because later parts of the script keep
# writing to it and close it when the crawl is finished.
fw = open(outputFile, 'w', encoding='utf-8')
fw.write('id;title;url;cover;rate\n')

# Browser-like request headers captured from a Chrome session.
# NOTE(review): nothing in the visible script passes this dict to a Request.
# If it is ever wired in, be aware that "Accept-Encoding" advertises gzip, so
# the response body may need to be decompressed before json.loads().
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4,ja;q=0.2",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": 'bid="LJSWKkSUfZE"; ll="108296"; __utmt=1; regpop=1; _pk_id.100001.4cf6=32aff4d8271b3f15.1442223906.2.1442237186.1442224653.; _pk_ses.100001.4cf6=*; __utmt_douban=1; __utma=223695111.736177897.1442223906.1442223906.1442236473.2; __utmb=223695111.0.10.1442236473; __utmc=223695111; __utmz=223695111.1442223906.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=30149280.674845100.1442223906.1442236473.1442236830.3; __utmb=30149280.4.9.1442237186215; __utmc=30149280; __utmz=30149280.1442236830.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap=1',
    "Host": "movie.douban.com",
    "Referer": "http://movie.douban.com/",
    # Kept as a string so every header value has the same type.
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
}

# Fetch the list of movie tags from Douban's search_tags endpoint.
# The JSON reply is expected to carry the tag names under the 'tags' key.
request = urllib.request.Request(url="http://movie.douban.com/j/search_tags?type=movie")
# Use the response as a context manager so the connection is closed as soon
# as the body has been read, even if parsing fails.
with urllib.request.urlopen(request) as response:
    tags = json.loads(response.read())['tags']

# Start crawling: for every tag, page through the search results 20 movies
# at a time and append one semicolon-separated record per movie to fw.

print("********** START **********")
print(time.strftime(ISOTIMEFORMAT, time.localtime()))

for tag in tags:
    print("Crawl movies with tag: " + tag)
    print(time.strftime(ISOTIMEFORMAT, time.localtime()))

    start = 0
    while True:
        # Tags are Chinese text. They must be percent-encoded (as UTF-8)
        # before being placed in the query string; concatenating
        # tag.encode("utf-8") with str raises TypeError on Python 3.
        page_url = (
            "http://movie.douban.com/j/search_subjects?type=movie&tag="
            + urllib.parse.quote(tag, safe="")
            + "&page_limit=20&page_start=" + str(start)
        )
        request = urllib.request.Request(url=page_url)
        # Close each response once its body has been read.
        with urllib.request.urlopen(request) as response:
            movies = json.loads(response.read())['subjects']

        # An empty page means this tag has no more results.
        if not movies:
            break

        for item in movies:
            rate = item['rate']
            title = item['title']
            movie_url = item['url']
            cover = item['cover']
            movieId = item['id']
            record = ';'.join(
                [str(movieId), title, movie_url, cover, str(rate)]
            ) + '\n'
            # fw is a text-mode file, so write the str itself; passing
            # encoded bytes raises TypeError on Python 3.
            fw.write(record)
            print(tag + '\t' + title)

        # Advance to the next page of 20 results.
        start += 20

fw.close()

图片说明
图片说明

图片说明
图片说明

  • 写回答

1条回答

  • 7*24 工作者 2019-06-03 14:30
    关注

    你需要导入 urllib.request 模块(import urllib.request),而不是只导入 urllib

    评论

报告相同问题?

悬赏问题

  • ¥15 微信公众号自制会员卡没有收款渠道啊
  • ¥15 stable diffusion
  • ¥100 Jenkins自动化部署—悬赏100元
  • ¥15 关于#python#的问题:求帮写python代码
  • ¥20 MATLAB画图图形出现上下震荡的线条
  • ¥15 关于#windows#的问题:怎么用WIN 11系统的电脑 克隆WIN NT3.51-4.0系统的硬盘
  • ¥15 perl MISA分析p3_in脚本出错
  • ¥15 k8s部署jupyterlab,jupyterlab保存不了文件
  • ¥15 ubuntu虚拟机打包apk错误
  • ¥199 rust编程架构设计的方案 有偿