python程序:
#!/usr/bin/env python
# coding=utf-8
#import importlib,sys
#import sys
#sys.setdefaultencoding('gbk')
import sys
import imp
import jieba
import json
from bs4 import BeautifulSoup
import urllib.request
import urllib3
import re
import os,os.path
import codecs
import requests
'''def getHtml(url):
global html
page = urllib.request.urlopen(url)
html = page.read()
return html
'''
def download_file(download_url,file_name):
print(download_url)
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1)AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
#req = urllib.request(download_url,headers=headers)
response = urllib.request.urlopen(download_url)#(req)
file = open(file_name, 'wb')
file.write(response.read())
file.close()
print(file_name)
print("Completed : .... %d ..." % x)
save_path = 'E:\\2345Downloads'
#url = 'https://www.lfd.uci.edu/'
#html = getHtml(url)
html='''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<!-- saved from url=(0048)https://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml -->
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<li><a id="aiohttp"></a><strong><a href="https://github.com/KeepSafe/aiohttp/">Aiohttp</a></strong>: a http client/server for asyncio.
<ul>
<li><a href="javascript:;" onclick=" javascript:dl([101,111,51,46,118,106,97,54,53,100,95,119,52,110,56,45,99,104,112,116,47,113,108,109,105,50,115], "IH4DA37BC5G0@BBA>1262H>?A1=>?A1=>:G<95F86;2:@E"); "javascript: dl("" title="[631 KB] [Oct 10, 2019]">aiohttp‑3.6.2‑cp38‑cp38‑win_amd64.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,118,53,99,112,47,113,51,106,105,45,54,116,110,50,56,111,104,119,46,108,115,97], "D=75301;4E8?@;;396B:B=9236>9236>9A8<6=BA@C"); "javascript: dl("" title="[600 KB] [Oct 10, 2019]">aiohttp‑3.6.2‑cp38‑cp38‑win32.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,118,47,100,45,97,109,52,99,115,53,104,106,108,54,116,51,55,95,105,46,110,112,50,111,119,113], "8F;IE09>14BG:>>E3?C=CF37E?@37E?@53HBDA452=6CH:<"); "javascript: dl("" title="[624 KB] [Oct 10, 2019]">aiohttp‑3.6.2‑cp37‑cp37m‑win_amd64.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,118,99,45,104,112,116,119,47,113,109,97,110,50,106,111,105,115,108,46,53,55,51,54], "@<=840C57:?>35542EBFB<214ED214ED926?;E<B63A"); "javascript: dl("" title="[596 KB] [Oct 10, 2019]">aiohttp‑3.6.2‑cp37‑cp37m‑win32.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,50,109,54,104,47,112,52,115,51,99,110,108,118,119,97,46,95,113,53,100,45,106,116,111,105], "70EA5<BF4>HG3FF5D8?2?0D9582D95821D=H:@>1C26?=3;"); "javascript: dl("" title="[617 KB] [Oct 10, 2019]">aiohttp‑3.6.2‑cp36‑cp36m‑win_amd64.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,113,111,54,115,47,109,46,45,104,118,119,50,105,99,108,97,106,110,53,116,112,51], "3;@0D9BC4?<18CCD7E626;7=DE27=DE257:<AE;6:8>"); "javascript: dl("" title="[590 KB] [Oct 10, 2019]">aiohttp‑3.6.2‑cp36‑cp36m‑win32.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,115,51,118,52,113,50,110,97,95,54,112,100,104,47,119,53,106,109,46,105,108,111,116,45,99], "05@4:2?F=H:1?=7CE<FF:G1B9B5GH:1?GH:1?AG>C687A;93B><D"); "javascript: dl("" title="[612 KB] [Oct 10, 2019]">aiohttp‑3.6.2‑cp35‑cp35m‑win_amd64.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,106,54,118,108,104,116,110,111,105,97,99,113,115,50,47,112,51,46,119,45,109,53], "<=0;?2E5>:?@E>987455?C@A1A=C:?@EC:?@EDCB86@=AB43"); "javascript: dl("" title="[584 KB] [Oct 10, 2019]">aiohttp‑3.6.2‑cp35‑cp35m‑win32.whl</a></li>
'''
print('html done')
#html.decode('utf-8')
#print(html)
'''
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1)AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
r = requests.get(url, headers = headers)
r.encoding = "utf-8"
soup = BeautifulSoup(r.text, "html.parser")
#html_mod=re.sub(pattern=".",repl=".",string=html.decode('utf-8'))
for link in soup.find_all('a'): #soup.find_all返回的为列表
print(link.get('href'))
#name_list+=link
'''
name_list = html#soup.find_all('a')#re.findall(r']">*-cp38-win_amd64.whl',html.decode('utf-8'))
x=1
files=os.listdir(save_path)
print(files)
print(type(name_list))
name_list=str(name_list)
name_list1=[]
#print(name_list)
#for name in name_list:
global k
k=0
# name[k]=str(name1[k])
for i in range(len(name_list)):
j=0
if name_list[i-2:i+1]==']">':
name_list1.append(name_list[i+1:i+50])
global m
if k<len(name_list1):
for l in range(len(name_list1[k])):
if l-9>=0:
if name_list1[k][l-9:l]=='amd64.whl':
j=1
m=l
if j==1:
name_list1[k]=name_list1[k][0:m]
k+=1
'''if j==0:
name_list.remove(name)'''
#file_name = os.path.join(save_path ,name)
#print(name)
print(name_list1)
for name in name_list1:
if name in files:
continue
print('no:'+str(x))
print('\ndownload'+name)
# importlib.reload(sys)
imp.reload(sys)
download_file('https://download.lfd.uci.edu/pythonlibs/s2jqpv5t/'+name,save_path)
x=x+1
print(name_list)
print('finished')
报错:
Traceback (most recent call last):
File "E:\2345Downloads\111 - 副本.py", line 123, in
download_file('https://download.lfd.uci.edu/pythonlibs/s2jqpv5t/'+name,save_path)
File "E:\2345Downloads\111 - 副本.py", line 30, in download_file
response = urllib.request.urlopen(download_url)#(req)
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 502, in _call_chain
result = func(*args)
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 1360, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 1317, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1230, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1241, in _send_request
self.putrequest(method, url, **skips)
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1096, in putrequest
self._output(self._encode_request(request))
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python38\lib\http\client.py", line 1176, in _encode_request
return request.encode('ascii')
UnicodeEncodeError: 'ascii' codec can't encode character '\u2011' in position 32: ordinal not in range(128)
我在百度查,百度上的解决办法:https://www.cnblogs.com/jfdwd/p/11459936.html
import sys
reload(sys)
sys.setdefaultencoding('utf8')
只适用于python2(2.7等)
用于我的版本(python3.8)会提示:
Traceback (most recent call last):
File "e:/2345Downloads/111 - 副本.py", line 5, in
sys.setdefaultencoding('gbk')
AttributeError: module 'sys' has no attribute 'setdefaultencoding'
我查了这些语句在python3(3.8等)中的替代,发现可以用
https://blog.csdn.net/icehui2012/article/details/76635534
import requests, re, sys
reload(sys)
sys.setdefaultencoding("utf-8")
但是结果和没用一样
报错:
UnicodeEncodeError: 'ascii' codec can't encode character '\u2011' in position 32: ordinal not in range(128)
我又用了
import sys
import imp
imp.reload(sys)
发现还是不行
报错:
UnicodeEncodeError: 'ascii' codec can't encode character '\u2011' in position 32: ordinal not in range(128)
求高手解决