import requests
from lxml import etree
from multiprocessing import Pool
from bs4 import BeautifulSoup
import lxml
import pymysql
def prepare_url(page_count=10, page_size=25):
    """Build the list of Douban Top 250 listing-page URLs.

    Args:
        page_count: Number of listing pages to generate. Defaults to 10.
        page_size: Step between consecutive ``start`` offsets. Defaults to 25.

    Returns:
        A list of URL strings whose ``start`` query values are
        ``0, page_size, 2 * page_size, ...``; empty when ``page_count`` is 0.
    """
    base_url = "https://movie.douban.com/top250?start="
    # Build the full URL for every page offset.
    return [base_url + str(page * page_size) for page in range(page_count)]
def get_html(url):
    """Download a page and return its HTML text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded as text, or ``None`` when the request
        fails (connection error, timeout, or HTTP error status).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT)'}
    try:
        # A timeout keeps a stalled connection from hanging the worker forever.
        response = requests.get(url=url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException:
        print("爬取失败")
        return None
    # The original never returned the text, so every caller received None.
    return response.text
def parse_html(html):
    """Extract title, rating and detail URL for each movie on a listing page.

    Args:
        html: HTML text of a listing page; ``None`` or an empty string is
            accepted and yields an empty result.

    Returns:
        A list of dicts with the keys ``'movie'``, ``'rating_num'`` and
        ``'url'``, one per ``<div class="info">`` element that contains the
        expected markup. Entries missing the link, title or rating are skipped.
    """
    if not html:
        # Nothing to parse (e.g. the download failed upstream).
        return []

    soup = BeautifulSoup(html, 'lxml')
    message_list = []
    for div in soup.find_all('div', {'class': 'info'}):
        link = div.find('a')
        # find() returns a single tag; find_all() returns a ResultSet that has
        # no .find method, which made the original raise AttributeError.
        star = div.find('div', {'class': 'star'})
        title_tag = link.find('span') if link is not None else None
        score_tag = (
            star.find('span', {'class': 'rating_num'}) if star is not None else None
        )
        if title_tag is None or score_tag is None:
            # Skip entries that do not have the expected structure.
            continue
        message_list.append({
            'movie': title_tag.text,
            'rating_num': score_tag.text,
            'url': link.get('href'),
        })
    return message_list
def get_data(url):
    """Fetch one listing page and return the movies parsed from it.

    Args:
        url: Address of the listing page.

    Returns:
        The list produced by ``parse_html`` for the page, or an empty list
        when ``get_html`` returned ``None``.
    """
    html = get_html(url)
    if html is None:
        # Download failed or produced nothing; there is nothing to parse.
        return []
    # The original dropped the parsed result, so pool.map collected only None.
    return parse_html(html)
if __name__ == '__main__':
    # Script entry point: scrape every listing page in parallel and print
    # the per-page results returned by get_data.
    full_urls = prepare_url()
    # The context manager shuts the worker processes down once map() is done.
    with Pool(10) as pool:
        data = pool.map(get_data, full_urls)
    print(data)
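# Error reported when running this script:
#   TypeError: object of type 'NoneType' has no len()
# It is raised when get_html() yields None and that value is passed to BeautifulSoup.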