我想要达到的结果
能爬下来百度学术上关于我给的关键字相关的文献
问题相关代码,请勿粘贴截图
import sys
import re
import urllib.request
import xlwt
from bs4 import BeautifulSoup
#定义正则表达式筛选规则
findLink=re.compile(r'''<a href="(.*?)" data-click="{'button_tp':'title'}" target="_blank">''',re.S)#视频链接
#获取对应url网页的数据
def get_url(url):
head = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.34"}#模拟计算机访问网页
request = urllib.request.Request(url,headers=head)#以headers身份访问url网页
html=''
try:
reponse = urllib.request.urlopen(request)
html = reponse.read().decode('utf-8')
except urllib.error.URLError as e:
if hasattr(e,'code'):
print(e.code)
if hasattr(e,'reason'):
print(e.reason)
return html
#调用get_url函数获取指定网页数据,以html形式存储
def get_data(baseurl):
data_list=[]
html=get_url(baseurl)#获取get_url爬到的数据
soup=BeautifulSoup(html,'html.parser')#定义使用html解读器解读数据的变量soup
links=soup.select('div.content .img a')
for item in links:
href=item.attrs['href']
print(href)
data_list.append(href)
return data_list
text = input("你想要搜索什么")
url = "https://xueshu.baidu.com/s?wd="+text
get_data(url)
运行结果及报错内容
D:\python\python.exe D:/python爬虫作业/爬虫(改版)卢琮文.py
你想要搜索什么血液
Traceback (most recent call last):
File "D:\python爬虫作业\爬虫(改版)卢琮文.py", line 36, in <module>
get_data(url)
File "D:\python爬虫作业\爬虫(改版)卢琮文.py", line 26, in get_data
html=get_url(baseurl)#获取get_url爬到的数据
File "D:\python爬虫作业\爬虫(改版)卢琮文.py", line 15, in get_url
reponse = urllib.request.urlopen(request)
File "D:\python\lib\urllib\request.py", line 214, in urlopen
return opener.open(url, data, timeout)
File "D:\python\lib\urllib\request.py", line 517, in open
response = self._open(req, data)
File "D:\python\lib\urllib\request.py", line 534, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "D:\python\lib\urllib\request.py", line 494, in _call_chain
result = func(*args)
File "D:\python\lib\urllib\request.py", line 1389, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "D:\python\lib\urllib\request.py", line 1346, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "D:\python\lib\http\client.py", line 1257, in request
self._send_request(method, url, body, headers, encode_chunked)
File "D:\python\lib\http\client.py", line 1268, in _send_request
self.putrequest(method, url, **skips)
File "D:\python\lib\http\client.py", line 1106, in putrequest
self._output(self._encode_request(request))
File "D:\python\lib\http\client.py", line 1186, in _encode_request
return request.encode('ascii')
UnicodeEncodeError: 'ascii' codec can't encode characters in position 10-11: ordinal not in range(128)
进程已结束,退出代码为 1