import requests
from multiprocessing import Pool
from bs4 import BeautifulSoup
import pymysql
import time
def prepare_url(pages=10, page_size=25):
    """Build the paginated douban top250 list URLs.

    Args:
        pages: number of result pages to generate (default 10, i.e. top 250).
        page_size: items per page used for the ``start`` offset (default 25).

    Returns:
        list[str]: full URLs like ``...top250?start=0``, ``...?start=25``, ...
    """
    base_url = "https://movie.douban.com/top250?start="
    # Each page is addressed by its item offset: page i starts at i * page_size.
    return [base_url + str(i * page_size) for i in range(pages)]
def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns ``None`` when the request fails (network error, timeout, or a
    non-2xx HTTP status).  The original version used a bare ``except`` and then
    fell through to ``return html`` with ``html`` never assigned, raising
    ``UnboundLocalError`` on any failure.

    Args:
        url: the page to download.

    Returns:
        str | None: decoded HTML, or None on failure.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 Edg/85.0.564.68"}
    try:
        # Timeout prevents a worker from hanging forever on a stalled socket.
        response = requests.get(url=url, headers=headers, timeout=10)
        response.raise_for_status()  # treat 4xx/5xx as failures too
        return response.content.decode('utf-8')
    except requests.RequestException:
        print("爬取失败")
        return None
def parse_html(html):
    """Extract movie title, rating and detail-page URL from a list page.

    Args:
        html: HTML text of one ``top250`` list page.

    Returns:
        list[dict]: one dict per movie with keys ``movie``, ``rating_num``
        and ``url``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    div_list = soup.find_all('div', {'class': 'info'})
    message_list = []
    for div in div_list:
        # First <span> inside the first <a> holds the (Chinese) title.
        title = div.find('a').find('span').text
        score = div.find('div', {'class': 'star'}).find('span', {'class': 'rating_num'}).text
        movie_url = div.find('a')['href']
        # NOTE: original code bound this to the name `dict`, shadowing the builtin.
        message_list.append({'movie': title, 'rating_num': score, 'url': movie_url})
    return message_list
def prepareDatabase():
    """Create the ``data`` table in the ``test`` database if it doesn't exist.

    Uses keyword arguments for ``pymysql.connect`` (positional arguments are
    deprecated/removed in modern PyMySQL) and declares utf8 directly on the
    table instead of running ``ALTER TABLE ... CONVERT TO CHARACTER SET`` on
    every call — this function is invoked once per worker process.
    """
    db = pymysql.connect(host="localhost", user='root', password='123456',
                         database='test', charset='utf8')
    try:
        with db.cursor() as cursor:
            sql = """
            CREATE TABLE IF NOT EXISTS data (
            url VARCHAR(255) NOT NULL ,
            movie_name VARCHAR(255) NOT NULL,
            rating_num FLOAT NOT NULL,
            PRIMARY KEY(url)
            ) DEFAULT CHARSET=utf8 COLLATE=utf8_general_ci
            """
            cursor.execute(sql)
        db.commit()
    finally:
        # Always release the connection, even if table creation fails.
        db.close()
def save_data(url, movie, rating_num):
    """Insert one movie row into the ``data`` table.

    Uses a parameterized query instead of ``str.format`` — the original built
    SQL by string interpolation, which breaks on titles containing quotes and
    is an SQL-injection vector.  Failed inserts (e.g. duplicate primary key
    ``url``) are rolled back and ignored, preserving the original best-effort
    behaviour.

    Args:
        url: detail-page URL (primary key).
        movie: movie title.
        rating_num: rating score (string or float).
    """
    db = pymysql.connect(host="localhost", user='root', password='123456',
                         database='test', charset='utf8')
    try:
        with db.cursor() as cursor:
            sql = 'INSERT INTO data(url,movie_name,rating_num) VALUES (%s,%s,%s)'
            cursor.execute(sql, (url, movie, rating_num))
        db.commit()
    except pymysql.MySQLError:
        db.rollback()
    finally:
        db.close()
# Return the stored movies sorted by rating, descending.
def sort():
    """Fetch all rows from ``data`` ordered by ``rating_num`` descending.

    The original version executed the SELECT but never fetched the results —
    a SELECT with ORDER BY does not reorder the table itself; the ordering
    only exists in the result set, which must be read with ``fetchall()``.
    (``commit``/``rollback`` are also meaningless for a read-only query.)

    Returns:
        tuple: rows as ``(url, movie_name, rating_num)`` tuples, highest
        rating first; empty tuple if the table is empty.
    """
    db = pymysql.connect(host="localhost", user='root', password='123456',
                         database='test', charset='utf8')
    try:
        with db.cursor() as cursor:
            cursor.execute("SELECT * FROM data order by rating_num desc")
            return cursor.fetchall()
    finally:
        db.close()
def get_data(url):
    """Worker entry point: download one list page and persist its movies.

    Runs inside a multiprocessing worker, so it creates the table itself
    (idempotent via ``IF NOT EXISTS``) before inserting.

    Args:
        url: one paginated list-page URL from :func:`prepare_url`.
    """
    html = get_html(url)
    if html is None:
        # Download failed; nothing to parse or store for this page.
        return
    prepareDatabase()
    # Iterate the parsed records directly instead of indexing by range(len(...)).
    for record in parse_html(html):
        save_data(record['url'], record['movie'], record['rating_num'])
if __name__ == '__main__':
    start_time = time.time()
    full_urls = prepare_url()
    # One worker per page; the context manager closes and joins the pool
    # (the original Pool(19) was oversized for 10 tasks and never closed).
    with Pool(len(full_urls)) as pool:
        pool.map(get_data, full_urls)
    # Fetch and display the rows ordered by rating — the ordering lives in the
    # query result, not in the table, so it must be printed to be seen.
    rows = sort()
    if rows:
        for row in rows:
            print(row)
    end_time = time.time()
    print("Run time = " + str(end_time - start_time) + " Second.")
# NOTE: (original trailing question, answered) "The program runs without errors,
# but the data saved to the database never comes out sorted — can't Python sort
# database data?"  Python can: `SELECT ... ORDER BY rating_num DESC` returns the
# rows in sorted order, but a SELECT never reorders the table itself.  The bug
# was that sort() executed the query and discarded the result set; you must call
# cursor.fetchall() after execute() and use/print those rows to see the ordering.