import re
import urllib.request
fh=open('C:\\Users\\Hear-H\\Desktop\\汽车企业数据\\新建文件夹\\298.txt','w',encoding='utf-8')
area='<li><span>公司地区</span>(.*?)</li>'
area1=area.encode('utf-8')
time='<span>成立时间</span>(.*?)</li>'
time1=time.encode('utf-8')
address='<span>地址</span>(.*?)</li>'
address1=address.encode('utf-8')
client='<p id=\"maintypicClient\">(.*?)</p>'
product='<p id=\"product\">(.*?)</p>'
i=0
pat='<a target=\"_blank\" href=\"(http://i.gasgoo.com/supplier/.*?)\">'
headers=('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36')
opener=urllib.request.build_opener()
opener.addheaders=[headers]
while i<100:
i+=1
url="http://i.gasgoo.com/supplier/c-298/index-"+str(i)+".html"
web=opener.open(url).read().decode('utf-8')
rst=re.compile(pat).findall(web)
rst1=list()
for a in rst:
if a not in rst1:
rst1.append(a)
rst1.pop(0)
for b in rst1:
pat1=b+'\">(.*?)</a>'
name=re.compile(pat1).findall(web)
name_d=''.join(name)
url1=b
website1=opener.open(url1).read().decode('utf-8').encode('utf-8')
website2=opener.open(url1).read().decode('utf-8')
result1=re.compile(area1).findall(website1)
for c in result1:
result1_d=c.decode('utf-8')
result2=re.compile(time1).findall(website1)
for d in result2:
result2_d=d.decode('utf-8')
result3=re.compile(address1).findall(website1)
for e in result3:
result3_d=e.decode('utf-8')
result4=re.compile(client).findall(str(website2))
result4_d=''.join(result4)
result5=re.compile(product).findall(str(website2))
result5_d=''.join(result5)
print(name_d+'?'+result1_d+'?'+result2_d+'?'+result3_d+'?'+result4_d+'?'+result5_d+'\n')
fh1=fh.write(name_d+'?'+result1_d+'?'+result2_d+'?'+result3_d+'?'+result4_d+'?'+result5_d+'\n')
fh.close
就是我在爬取汽车企业数据网站的时候出现了HTTPError: Internal Server Error的问题,但是我上网查的时候一般说Internal Server Error出现的时候都会有500之类的数字提示,这里也没有,所以请问各位大佬一下,出现这种情况是不是只能用代理了呢?或者还有其他的方法
Traceback (most recent call last):
File "<ipython-input-1-7c05d0a2c578>", line 1, in <module>
runfile('C:/Users/Hear-H/Desktop/汽车企业数据/汽车企业数据挖掘.py', wdir='C:/Users/Hear-H/Desktop/汽车企业数据')
File "D:\Anaconda\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 786, in runfile
execfile(filename, namespace)
File "D:\Anaconda\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/Hear-H/Desktop/汽车企业数据/汽车企业数据挖掘.py", line 39, in <module>
website1=opener.open(url1).read().decode('utf-8').encode('utf-8')
File "D:\Anaconda\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "D:\Anaconda\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "D:\Anaconda\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "D:\Anaconda\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "D:\Anaconda\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
HTTPError: Internal Server Error