完整代码如下
#多进程爬取豆瓣TOP250
import requests
from bs4 import BeautifulSoup
import time
from multiprocessing import Process,Queue
def open_url(url):
proxies = {'http':'121.230.210.200:3256','http':'203.82.253.47:80','http':'118.117.189.17:3256'}
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.62'}
html = requests.get(url,headers = headers,proxies = proxies).text
return html
def parse_url(url):
html = BeautifulSoup(open_url(url),features="html.parser")
#获取每一页所有电影名
movies = []
targets = html.find_all('span',class_="title")
for each in targets:
movies.append(each.text)
#获取每一页所有电影得分
marks = []
targets = html.find_all('span',class_="rating_num",property="v:average")
for each in targets:
marks.append(each.text)
#将数据储存到队列中
for i in range(len(movies)):
dataqueue.put(movies[i]+' mark:'+marks[i])
def main():
start_t = time.time()
num = 1
dataqueue = Queue()
process_list = []
#循环创建10个进程
for i in range(10):
url = 'https://movie.douban.com/top250?start=%d&filter=' % i*25
process = Process(target = parse_url,args = url)
process.start()
process_list.append(process)
#父进程等待所有子进程结束再进行下面的语句
for process in process_list:
process.join()
#输出内容
while not dataqueue.empty():
print('排名:',num,' title:',dataqueue.get())
num+=1
end_t = time.time()
print('用时:',end_t - start_t)
if __name__ == '__main__':
main()
但是每次运行都会报错,每一个进程都会出现以下异常
Traceback (most recent call last):
Traceback (most recent call last):
File "D:\下载\python\lib\multiprocessing\process.py", line 313, in _bootstrap
self.run()
File "D:\下载\python\lib\multiprocessing\process.py", line 108, in run
self._target(*self._args, **self._kwargs)
TypeError: parse_url() takes 1 positional argument but 1175 were given
为什么会出现如此多参数
该怎样解决