"""
各位大佬,小白学习 Python,想用多线程做 IO 爬取,并且想让每个线程的列表名不重复,所以又弄了 names = locals()。但是运行报错,显示 AttributeError: 'NoneType' object has no attribute 'append',意思是 names.get("this_"+str(thread_num)).append(xxxx) 没有 append 这个方法。可是我打印了 names.get("this_"+str(thread_num)) 的类型,明明是 list 啊。
1.封装函数:只爬一页,网页发生变化时,请在函数内更改xpath规则
"""
def spideronepage(page_num, thread_num):
    """Scrape one listing page and return its game entries.

    If the site layout changes, update the XPath expressions in this
    function.

    Args:
        page_num: Number of the listing page to fetch; it is substituted
            into the page URL.
        thread_num: Number of the calling thread. Kept so existing callers
            keep working; it is not needed to keep results apart, because
            every call builds its own local result list.

    Returns:
        A list with one ``[title, link, image_url]`` entry per ``<article>``
        element found on the page (empty when the page has none).

    Raises:
        IndexError: If an article lacks the title, link or image attribute.
        requests.RequestException: If the HTTP request fails or times out.
    """
    import requests
    from lxml import etree

    url = "https://xxxxx525.com/page/{}".format(page_num)
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35"
    }
    # A timeout keeps a stalled connection from blocking the worker forever.
    resp = requests.get(url=url, headers=headers, timeout=30)
    tree = etree.HTML(resp.text)

    # A plain local list is already private to each call (and therefore to
    # each thread). The earlier locals()-based dynamic name was stored under
    # 'gameinfos_<n>' but read back under 'this_<n>', which yielded None and
    # caused "'NoneType' object has no attribute 'append'".
    gameinfos = []
    for article in tree.xpath("*//article"):
        game_name = article.xpath(".//h2/a/@title")[0]
        game_nextlink = article.xpath(".//h2/a/@href")[0]
        game_imgurl = article.xpath(".//div/a/img/@data-src")[0]
        gameinfos.append([game_name, game_nextlink, game_imgurl])

    # Return the list itself; list.append() always returns None.
    return gameinfos
"""
2.封装函数下载图片到文件夹,包含创建文件夹
"""
def imgdownload(img_url):
    """Download one image into the local picture folder.

    The folder is created when it does not exist yet.

    Args:
        img_url: URL of the image to download. The file name is taken from
            the first ``<word>.jpg`` or ``<word>.png`` found in the URL.

    Returns:
        The local path of the file that was written.

    Raises:
        ValueError: If ``img_url`` contains no ``.jpg``/``.png`` file name.
        requests.RequestException: If the HTTP request fails or times out.
    """
    import os
    import re

    import requests

    path = '../爬取图片switch520'  # folder that receives the downloaded images

    # Work out the file name before downloading so a URL without a usable
    # name fails fast with a clear message instead of crashing on
    # ``None.group()`` after the bytes were already fetched.
    name_match = re.search(r"(\w+\.jpg)|(\w+\.png)", img_url)
    if name_match is None:
        raise ValueError(
            "no .jpg/.png file name found in URL: {!r}".format(img_url)
        )
    img_name = name_match.group()

    # exist_ok=True already covers the "folder exists" case.
    os.makedirs(path, exist_ok=True)

    # A timeout keeps a stalled connection from blocking the caller forever.
    download_jpg = requests.get(img_url, timeout=30)

    local_path = os.path.join(path, img_name)
    with open(local_path, "wb") as f:
        f.write(download_jpg.content)
    return local_path
"""
3.封装函数:向excel写入爬取结果
"""
def writeexcel(data_list):
    """Write the scraped rows into a new Excel workbook.

    Args:
        data_list: Iterable of rows; each row is handed to the worksheet's
            ``append`` method unchanged.

    The workbook is saved to ``../switch520xxx.xlsx``.
    """
    from openpyxl import Workbook

    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "switch520网站爬取"
    # NOTE(review): _current_row is a private openpyxl attribute; setting it
    # to 1 presumably makes the first appended row land on row 2 and leaves
    # row 1 blank. Confirm against the installed openpyxl version.
    sheet._current_row = 1
    for row in data_list:
        sheet.append(row)
    workbook.save('../switch520xxx.xlsx')
"""
4.多线程模块封装,IO并发技术,最大利用带宽,该函数调用单页爬取模块spideronepage
"""
def multi_thread(page_nums):
    """Scrape pages 1..page_nums concurrently, one thread per page.

    Args:
        page_nums: Number of pages to scrape. Pages are numbered from 1, and
            a value below 1 scrapes nothing.

    Returns:
        A dict mapping each page number to whatever ``spideronepage``
        returned for that page. A page whose worker raised an exception has
        no entry; the exception is reported by the threading machinery and
        does not stop the other workers.
    """
    import threading

    results = {}

    def _crawl(page_num):
        # Thread targets cannot hand back a return value, so store it.
        # Each worker stores under its own page number, so workers never
        # overwrite each other's entry.
        results[page_num] = spideronepage(page_num, page_num)

    threads = [
        threading.Thread(target=_crawl, args=(page_num,))
        for page_num in range(1, page_nums + 1)
    ]
    for thread in threads:
        thread.start()
    # Wait for every worker before handing the results to the caller.
    for thread in threads:
        thread.join()
    return results
"""
!!!主函数 多页爬取,多线程版!!!!
"""
def main():
    """Scrape the first 20 listing pages concurrently and print each page's rows."""
    page_nums = 20
    # Variables created inside another function live in that call's own
    # frame, so they never show up in this function's locals(). Read the
    # pages from multi_thread's return value instead. ``or {}`` keeps the
    # loop working when multi_thread returns nothing.
    results = multi_thread(page_nums) or {}
    for page_num in range(1, page_nums + 1):
        # Prints None for a page that produced no result.
        print(results.get(page_num))


if __name__ == "__main__":
    main()
```python