import pymongo
import requests
from lxml import etree
import time
import re
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36 Edg/93.0.961.52'
}
client = pymongo.MongoClient('localhost', 27017) # 连接数据库
mydb = client['mydb'] # 创建库mydb
musictop = mydb['musictop250'] # 创建表
# 连接数据库
def get_url_music(url):
"""获取每个音乐的链接"""
web_data = requests.get(url,headers=headers).content.decode()
html = etree.HTML(web_data)
selectors = html.xpath('//a[@class="nbg"]/@href')
for selector in selectors:
get_info(selector)
def get_info(url):
"""获取网页信息"""
web_data = requests.get(url,headers=headers).content.decode()
html = etree.HTML(web_data)
songs = html.xpath('//*[@id="wrapper"]/h1/span/text()')[0]
singers = re.findall(' 表演者:.*?<a href=".*?">(.*?)</a>',web_data,re.S)
styles = re.findall('<span class="pl">流派:</span> (.*?)<br />',web_data,re.S)
publish_time = re.findall(' <span class="pl">发行时间:</span> (.*?)<br />',web_data,re.S)
rates = html.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()'[0])
if len(styles)==0:
styles='未知'
else:
styles=styles[0].strip()
if len(publish_time)==0:
publish_time='未知'
else:
publish_time=publish_time[0].strip()
if len(singers)==0:
singers='未知'
else:
singers=singers[0].strip()
print(songs,singers,publish_time,rates)
data = {
'song':songs,
'singer':singers,
'style':styles,
'publish_time':publish_time,
'rate':rates
}
musictop.insert_one(data)
if __name__ == '__main__':
urls = ['https://music.douban.com/top250?start={}'.format(str(i)) for i in range(0, 250, 25)]
for url in urls:
get_url_music(url)
time.sleep(2)
疑惑的问题
- 每次运行的结果不一样
- 有时候出现错误有时候没有错误
- 有时候数据打印的时候都没打印完全就结束进程了
我都感觉是不是我电脑出问题了,希望大家能够帮我解决,谢谢