import logging
import os
import re
import time

import pymysql
import requests
from lxml import etree
# Database connection shared by the whole script.
# Every setting can be overridden through an environment variable so that
# credentials do not have to be edited into the source; the literals below
# are fallbacks that keep the previous behaviour when nothing is set.
# NOTE(review): the fallback password is still hard-coded here -- rotate it
# and remove the default once DOUBAN_DB_PASSWORD is provided by the
# environment.
conn = pymysql.connect(
    host=os.environ.get('DOUBAN_DB_HOST', 'localhost'),
    user=os.environ.get('DOUBAN_DB_USER', 'root'),
    passwd=os.environ.get('DOUBAN_DB_PASSWORD', '52xzy1314@'),
    db=os.environ.get('DOUBAN_DB_NAME', 'mydb'),
    port=int(os.environ.get('DOUBAN_DB_PORT', '3306')),
    charset='utf8',
)
cursor = conn.cursor()

# Browser-like User-Agent passed as `headers` to every requests.get call below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}
def get_movie_url(url):
    """Download one listing page and process every movie linked from it.

    Each ``href`` found on an ``<a>`` directly under ``<div class="hd">`` is
    handed to ``get_movie_info``.

    Args:
        url: Address of the listing page to download.

    Returns:
        None. A page that cannot be downloaded is logged and skipped so the
        remaining pages are still processed.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        logging.warning('failed to fetch listing page %s: %s', url, exc)
        return

    selector = etree.HTML(response.text)
    movie_hrefs = selector.xpath('//div[@class="hd"]/a/@href')
    for movie_href in movie_hrefs:
        get_movie_info(movie_href)
def get_movie_info(url):
    """Scrape one movie detail page and insert a row into ``doubanmovie``.

    The name, director, actor, style (genre), country, release time, running
    time and score are extracted with XPath queries and regular expressions,
    then written through the module-level ``cursor``. The insert is not
    committed here; committing is left to the caller.

    Args:
        url: Address of the movie detail page.

    Returns:
        None. A page that cannot be downloaded, a page missing any of the
        fields, or a failed insert is logged and skipped instead of being
        silently ignored.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        logging.warning('failed to fetch movie page %s: %s', url, exc)
        return

    page = response.text
    selector = etree.HTML(page)
    if selector is None:
        # Nothing to query if the parser produced no document tree.
        logging.warning('skipping %s: page could not be parsed', url)
        return

    try:
        name = selector.xpath(".//div[@id='content']/h1/span[1]/text()")[0]
        director = selector.xpath(".//div[@id='info']/span[1]/span[@class='attrs']/a/text()")[0]
        # NOTE(review): this selects the span with class 'pl' inside the
        # actor span; if that element is only the field label, the stored
        # value is the label rather than the cast -- confirm against a live
        # page before changing it.
        actors = selector.xpath(".//div[@id='info']/span[@class='actor']/span[@class='pl']")[0]
        actor = actors.xpath('string(.)')
        # The earlier pattern required literal spaces around "=", so markup
        # written as property="v:genre" never matched and the whole record
        # was dropped; whitespace around "=" is now optional.
        style = re.findall(r'<span\s+property\s*=\s*"v:genre">(.*?)</span>', page, re.S)[0]
        country = re.findall('<span class="pl">制片国家/地区:</span>(.*?)<br/>', page, re.S)[0]
        release_time = re.findall('上映日期:</span>.*?>(.*?)</span>', page, re.S)[0]
        # Named `runtime` so the imported `time` module is not shadowed.
        runtime = re.findall('片长:</span>.*?>(.*?)</span>', page, re.S)[0]
        score = selector.xpath(".//strong[@class='ll rating_num']/text()")[0]
    except IndexError:
        # One of the [0] lookups hit an empty result list: a field is absent.
        logging.warning('skipping %s: a required field was not found', url)
        return

    try:
        # Values are converted with str() before being bound as parameters.
        cursor.execute(
            "insert into doubanmovie (name,director,actor,style,country,release_time,time,score) values(%s,%s,%s,%s,%s,%s,%s,%s)",
            (str(name), str(director), str(actor), str(style), str(country),
             str(release_time), str(runtime), str(score)),
        )
    except pymysql.MySQLError as exc:
        logging.warning('failed to store %s: %s', url, exc)
if __name__ == '__main__':
    # Listing pages are addressed by a `start` offset: 0, 25, ..., 225.
    urls = ['https://movie.douban.com/top250?start={}'.format(offset)
            for offset in range(0, 250, 25)]
    try:
        for url in urls:
            get_movie_url(url)
            # Commit after every listing page so rows already inserted are
            # kept even if a later page raises.
            conn.commit()
            # Pause between listing pages.
            time.sleep(2)
    finally:
        # Release the database resources whether or not the crawl finished.
        cursor.close()
        conn.close()