import requests
import re
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl.styles import Alignment #文字对齐方式
def top250():
wb = Workbook() #得到一个全新的workbook
ws=wb['Sheet'] #通过sheet名字获取到sheet
num=0
num1=0
lst=[]
name_lst=[]
dy_lst=[]
zy_lst=[]
time_lst=[]
country_lst = []
leixing_lst = []
pj_lst = []
people_lst = []
quote_lst = []
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36 Edg/81.0.416.68",
}
while num <= 225:
url='https://movie.douban.com/top250?start='+str(num)+'&filter='
with requests.get(url=url,headers=headers) as r: #分段下载
if r.status_code==200:
r.encoding=r.apparent_encoding #编码
soup=BeautifulSoup(r.text,'html.parser') #解析方法
ol=soup.find('ol')
li=ol.find_all('li')
for i in li:
name=i.find('span').text
p = i.find('p')
p = str(p).split('<br/>', 1)
fst = p[0].split('>', 1)[1]
sec = p[1].split('<', 1)[0]
daoyan = fst.split('主', 1)[0]
daoyan = daoyan.replace('导演:', '')
daoyan = daoyan.replace('\xa0', '')
daoyan = daoyan.replace('\n ', '')
try:
zhuyan=fst.split('主演:',1)[1]
except:
zhuyan=''
time=sec.split('/',2)[0]
time=time.replace('\xa0','')
time = time.replace('\n ', '')
time=re.findall(r'\d{4}',time)[-1]
country=sec.split('\xa0/\xa0',2)[1]
country=country.replace('/xa0','')
type=sec.split('\xa0/\xa0',2)[2]
type=type.replace('\xa0','')
type = type.replace('\n', '')
star=i.find('div',attrs={'class': "star"})
span=star.find_all('span')
pingjia=span[1].text
people=span[3].text.split('评价',1)[0]
try:
quote=i.find('p',attrs={'class': "quote"}).text
quote=quote.replace('\n','')
except:
quote=''
name_lst.append(name)
dy_lst.append(daoyan)
zy_lst.append(zhuyan)
time_lst.append(time)
country_lst.append(country)
leixing_lst.append(type)
pj_lst.append(pingjia)
people_lst.append(people)
quote_lst.append(quote)
num1+=1
print('第{}页爬取完毕!'.format(num1))
if num==225:
print('爬取结束,开始写入excel。。。')
paiming = list(range(1, 251))
lst.append(paiming)
lst.append(name_lst)
lst.append(dy_lst)
lst.append(zy_lst)
lst.append(time_lst)
lst.append(country_lst)
lst.append(leixing_lst)
lst.append(pj_lst)
lst.append(people_lst)
lst.append(quote_lst)
head=['排名', '电影名称', '导演', '主演', '年份', '地区', '类型', '评分', '评价人数', '一句简介']
ws.append(head)
for i in range(len(lst[0])):
for j in range(len(lst[0])):
ws.cell(j + 2, i + 1).value = lst[i][j]
print('写入excel完成!')
for cell in ws['1']:
cell.alignment=Alignment(horizontal='center',vertical='center') #横向居中,纵向居中
for cell in ws['A']:
cell.alignment = Alignment(horizontal='center', vertical='center')
for cell in ws['B']:
cell.alignment = Alignment(horizontal='center', vertical='center')
for cell in ws['E']:
cell.alignment = Alignment(horizontal='center', vertical='center')
for cell in ws['H']:
cell.alignment = Alignment(horizontal='center', vertical='center')
for cell in ws['I']:
cell.alignment = Alignment(horizontal='center', vertical='center')
ws.column_dimensions['B'].width=25
ws.column_dimensions['I'].width = 13
wb.save('豆瓣电影top250.xlsx')
num+=25
else:
print('失败!')
if __name__=='__main__':
print('开始爬取!')
top250()
请问这个怎么调试啊,请你们上传下调试截图