## Spider code
import scrapy
class MiddleSpider(scrapy.Spider):
    """Minimal spider used to verify the downloader middleware: it fetches a
    Baidu search results page and dumps the raw HTML to disk for inspection.
    """

    name = "middle"
    # allowed_domains = ["www.xxx.com"]
    start_urls = ["http://www.baidu.com/s?wd=ip"]

    def parse(self, response):
        """Persist the downloaded page to ip.html so the effect of the
        User-Agent / proxy middleware can be checked by hand."""
        html = response.text
        with open("ip.html", "w", encoding="utf-8") as fh:
            fh.write(html)
## Scrapy downloader middleware
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
import random
class MiddleproDownloaderMiddleware:
    """Downloader middleware that attaches a random User-Agent to every
    request and, when a download fails with an exception, retries the
    request through a proxy matching the request's scheme.

    NOTE(review): the proxy addresses below are public free proxies and are
    very likely dead; verify them before relying on the fallback.
    """

    # Pool of User-Agent strings; one is chosen at random per request.
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
    ]
    # Proxy pools keyed by scheme.
    # NOTE(review): the PROXY_http entries are missing a ":port" suffix, so a
    # retry through them will still fail until ports are added.
    PROXY_http = ["114.231.42.244", "183.236.232.160"]
    PROXY_https = ["120.83.49.90:9000", "95.189.112.214:35508"]

    def process_request(self, request, spider):
        """Intercept each outgoing request and set a random User-Agent.

        BUG FIX: the original code also forced *every* request through the
        hard-coded proxy ``http://182.139.110.18`` (no port, unreachable),
        which is what caused the reported
        ``TCP connection timed out: 10060`` retry failures. The forced proxy
        is removed; a proxy is now only applied in :meth:`process_exception`
        as a fallback after a download error.
        """
        request.headers["User-Agent"] = random.choice(self.user_agent_list)
        # Returning None lets Scrapy continue normal downloader processing.
        return None

    def process_response(self, request, response, spider):
        # Pass every response through unchanged.
        return response

    def process_exception(self, request, exception, spider):
        """On a download error, reassign a random proxy that matches the
        request's URL scheme and reschedule the request."""
        if request.url.split(":")[0] == "http":
            request.meta["proxy"] = "http://" + random.choice(self.PROXY_http)
        else:
            request.meta["proxy"] = "https://" + random.choice(self.PROXY_https)
        # Returning the request object resubmits it through the downloader.
        return request
Running the spider produces the following error (a TCP 10060 timeout means the host the request was sent to — here the hard-coded proxy — never responded):
2023-05-30 18:45:14 [scrapy.downloadermiddlewares.retry] ERROR: Gave up retrying <GET http://www.baidu.com/s?wd=ip> (failed 3 times): TCP connection timed out: 10060: 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。