import requests
from lxml import etree
import pymysql
import re
import time
conn=pymysql.connect(host='localhost',user='root',passwd='123456',db='mydb',port='3306',charset='utf8')
cursor=conn.cursor()#连接数据库及光标
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
def get_movie_url(url):
html=requests.get(url,headers=headers)
selector=etree.HTML(html.text)
movie_hrefs=selector.xpath('//div[@class="hd"/a/@href')
for movie_href in movie_hrefs:
get_movie_info(movie_href)
def get_movie_info(url):
html = requests.get(url, headers=headers)
selector = etree.HTML(html.text)
try:
name=selector.xpath('//div[@id="content"]/h1/span/text()')[0]
director=selector.xpath('//div[@id="info"]/span[1]/span[2]/a/text()')[0]
actors=selector.xpath('//div[@id="info"]/span[3]/span[2]/text()')[0]
actor=actors.xpath('string(.)')
style=re.findall('<span property="v:genre">(.*?)</span>',html.text,re.S)[0]
country=re.findall('<span class="pl">制片国家/地区:</span>(.*?)<br>',html.text,re.S)[0]
release_time=re.findall('上映日期:</span>.*?>(.*?)</span>',html.text,re.S)[0]
time=re.findall('片长:</span>.*?>(.*?)</span>',html.text,re.S)[0]
score=selector.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()"')[0]
cursor.execute(
"insert into doubanmovie (name,director,actor,style,country,release_time,time,score) values(%s,%s,%s,%s,%s,%s,%s,%s)",
(str(name),str(director),str(actor),str(style),str(country),str(release_time),str(time),str(score)))
except IndexError:
pass
if __name__=='__main__':
urls=['https://movie.douban.com/top250?start={}'.format(str(i)) for i in range(0,250,25)]
for url in urls:
get_movie_url(url)
time.sleep(2)
conn.commit()
爬取豆瓣电影存入数据库,报错TypeError: %d format: a number is required, not str
- 写回答
- 好问题 0 提建议
- 追加酬金
- 关注问题
- 邀请回答
-
3条回答 默认 最新
- threenewbee 2018-12-01 16:06关注
urls=['https://movie.douban.com/top250?start={}'.format(str(i)) for i in range(0,250,25)]
->
urls=['https://movie.douban.com/top250?start={}'.format(i) for i in range(0,250,25)]问题如果解决,请点下我回答左上角的采纳,谢谢
解决 无用评论 打赏 举报
悬赏问题
- ¥15 虚拟机打包apk出现错误
- ¥30 最小化遗憾贪心算法上界
- ¥15 用visual studi code完成html页面
- ¥15 聚类分析或者python进行数据分析
- ¥15 逻辑谓词和消解原理的运用
- ¥15 三菱伺服电机按启动按钮有使能但不动作
- ¥15 js,页面2返回页面1时定位进入的设备
- ¥50 导入文件到网吧的电脑并且在重启之后不会被恢复
- ¥15 (希望可以解决问题)ma和mb文件无法正常打开,打开后是空白,但是有正常内存占用,但可以在打开Maya应用程序后打开场景ma和mb格式。
- ¥20 ML307A在使用AT命令连接EMQX平台的MQTT时被拒绝