import json
import random
import time

import requests
from fake_useragent import UserAgent
from lxml import etree
def get_url(url, timeout=10):
    """Fetch *url* with a random User-Agent and return the page text.

    Sleeps a random whole number of seconds between 3 and 9 before every
    request (presumably to keep the request rate low for the target site).

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The response body decoded as UTF-8 when the status code is 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: On network failures,
            including ``Timeout`` when *timeout* is exceeded.
    """
    headers = {"User-Agent": UserAgent().random}
    time.sleep(random.randint(3, 9))
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    # Force UTF-8 instead of relying on the encoding requests guesses.
    response.encoding = 'utf8'
    return response.text
def parse_index(html):
    """Extract the detail-page URLs from a film listing page.

    Args:
        html: Markup of the listing page, or ``None``/empty when the page
            could not be fetched.

    Returns:
        A list of absolute detail-page URLs, one per
        ``div.movie-item.film-channel`` link found; an empty list when
        *html* is ``None`` or empty.
    """
    # A failed fetch yields None; there is nothing to parse in that case,
    # and handing None to etree.HTML would raise.
    if not html:
        return []
    e = etree.HTML(html)
    movie_base_url = "https://maoyan.com{}?catId=3&showType=3"
    all_url = e.xpath('//div[@class="movie-item film-channel"]/a/@href')
    return [movie_base_url.format(url) for url in all_url]
def parse_info(html):
    """Collect the nodes of a movie detail page that describe the movie.

    Args:
        html: Markup of a single movie detail page.

    Returns:
        A dict with the keys ``"name"``, ``"type"``, ``"contary_duration"``,
        ``"year"`` and ``"introduce"``. Each value is the raw result list of
        the corresponding XPath query, i.e. the matched element nodes (not
        their text); the list is empty when nothing matched.
    """
    tree = etree.HTML(html)
    # (result key, XPath expression) pairs, evaluated in this order.
    queries = (
        ("name", '//h1[@class="name"]'),
        ("type", '//li[@class="ellipsis"][1]'),
        ("contary_duration", '//li[@class="ellipsis"][2]'),
        ("year", '//li[@class="ellipsis"][3]'),
        ("introduce", '//span[@class="dra"]'),
    )
    return {field: tree.xpath(expression) for field, expression in queries}
def _node_text(node):
    """Return the whitespace-stripped text of one XPath result item."""
    if isinstance(node, str):
        return node.strip()
    # lxml elements expose itertext(); joining its pieces also picks up
    # text that sits inside nested tags.
    return "".join(node.itertext()).strip()


def _format_record(movie_info):
    """Render one parse_info() result as a single JSON line."""
    record = {
        field: [_node_text(node) for node in nodes]
        for field, nodes in movie_info.items()
    }
    # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file.
    return json.dumps(record, ensure_ascii=False) + "\n"


def main():
    """Scrape the first film listing page and append each movie to movie.txt.

    Fetches the listing page at offset 0, follows every detail link found
    by ``parse_index`` and appends one JSON object per movie (one per line)
    to ``movie.txt``. Pages that ``get_url`` could not fetch (it returned
    ``None``) are skipped instead of being passed to the parsers.
    """
    base_url = "https://maoyan.com/films?catId=3&showType=3&offset=0"
    html = get_url(base_url)
    if html is None:
        # Listing page unavailable: nothing to crawl.
        return
    movie_urls = parse_index(html)
    for movie_url in movie_urls:
        movie_html = get_url(movie_url)
        if movie_html is None:
            continue
        movie_info = parse_info(movie_html)
        # Re-open in append mode per movie so every record is flushed to
        # disk before the next (slow, throttled) request starts.
        with open('movie.txt', 'a', encoding='utf8') as f:
            # write() accepts exactly one str argument, so the dict has to
            # be serialised first.
            f.write(_format_record(movie_info))
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()