I want to scrape every question and answer containing the keyword "绿色" (green) posted between 2010-01-01 and 2023-12-31 from the SZSE interactive investor-relations platform 互动易 (irm.cninfo.com.cn). Below is the code I am running, but it always crashes partway through the crawl and I can't figure out what is going wrong. Any help would be appreciated.
import requests
import time
import csv
import re
key = "绿色"

# write the CSV header row
with open('深交所关键词:{0}.csv'.format(key), 'w', newline='', encoding="utf-8") as csv_file:
    data = csv.writer(csv_file, delimiter=',')
    data.writerow(["time", "name", "code", "Question", "Answer"])

# first request: only used to read the total number of records
url = "http://irm.cninfo.com.cn/newircs/search/searchResult?stockCodes=&keywords={}&startDate=2010-01-01+00%3A0000%3A00%3A00&endDate=2023-12-31+23%3A5923%3A59%3A59&onlyAttentionCompany=2&pageNum=1&pageSize=10".format(key)
res = requests.get(url).json()
if 'data' in res and 'totalRecord' in res['data']:
    total_record = res['data']['totalRecord']
    end_page = (total_record // 1000) + 1

page = 1
while page <= end_page:
    print("\nThe No.{} page start {}/{}...".format(page, page, end_page))
    api_url = "http://irm.cninfo.com.cn/newircs/search/searchResult?stockCodes=&keywords={}&infoTypes=1%2C2%2C3%2C4%2C5%2C6%2C7%2C11&startDate=2010-01-01+00%3A0000%3A00%3A00&endDate=2023-12-31+23%3A5923%3A59%3A59&onlyAttentionCompany=2&pageNum={}&pageSize=1000".format(key, page)
    try:
        response = requests.get(api_url, timeout=8).json()
    except TimeoutError:
        print("retry No.{} page...".format(page))
        continue
    time.sleep(1)
    if 'data' in response and 'results' in response['data']:
        for element in response['data']['results']:
            mainContent = element.get('mainContent', '无')            # question text
            stockCode = '*' + str(element.get('stockCode', '无'))     # company stock code
            companyShortName = element.get('companyShortName', '无')  # company short name
            attachedContent = element.get('attachedContent', '无')    # answer text
            # strip the <em> highlight tags and their contents with a regex
            mainContent = re.sub(r'<em>.*?</em>', '', mainContent)
            attachedContent = re.sub(r'<em>.*?</em>', '', attachedContent)
            # convert the millisecond timestamps to dates
            pubDate = element.get('pubDate', 0)
            updateDate = element.get('updateDate', 0)
            if pubDate and updateDate:
                pubDate = int(pubDate) // 1000    # cast to int, then integer-divide to seconds
                updateDate = int(updateDate) // 1000
                pubDate = time.localtime(pubDate)
                updateDate = time.localtime(updateDate)
                date = str(time.strftime("%Y-%m-%d", pubDate))
            row = [date, companyShortName, stockCode, mainContent, attachedContent]
            print(row)
            with open('深交所关键词:{0}.csv'.format(key), 'a', newline='', encoding="utf-8") as csv_file:
                data = csv.writer(csv_file, delimiter=',')
                data.writerow(row)
    page = page + 1
Here is the error output:
Traceback (most recent call last):
  File "C:\Users\ASUS\Desktop\pycharmproject\.venv\lib\site-packages\requests\models.py", line 974, in json
    return complexjson.loads(self.text, **kwargs)
  File "D:\pi\lib\json\__init__.py", line 357, in loads
    return _default_decoder.decode(s)
  File "D:\pi\lib\json\decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "D:\pi\lib\json\decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\ASUS\Desktop\pycharmproject\shen.py", line 26, in <module>
    response = requests.get(api_url, timeout=8).json()
  File "C:\Users\ASUS\Desktop\pycharmproject\.venv\lib\site-packages\requests\models.py", line 978, in json
    raise RequestsJSONDecodeError(e.msg, e.doc, e.pos)
requests.exceptions.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
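My own guess from the traceback (not confirmed): "Expecting value: line 1 column 1 (char 0)" means the response body was not JSON at all, for example an empty body or an HTML error/throttling page, and my except TimeoutError: clause never catches it because requests reports timeouts as requests.exceptions.Timeout and bad JSON as requests.exceptions.JSONDecodeError, not as the built-in TimeoutError. Here is a retry wrapper I am considering, a sketch only (get_json is just a name I made up, and the retry counts are arbitrary):

import time
import requests

def get_json(url, retries=3, timeout=8, pause=2):
    """Fetch url and return parsed JSON, retrying on timeouts and non-JSON bodies."""
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, timeout=timeout)
            resp.raise_for_status()  # turn HTTP 4xx/5xx into an exception instead of parsing an error page
            return resp.json()       # raises requests.exceptions.JSONDecodeError if the body is not JSON
        except (requests.exceptions.RequestException, ValueError) as exc:
            print("attempt {}/{} failed: {}".format(attempt, retries, exc))
            time.sleep(pause * attempt)  # back off a little more on every retry
    return None  # all retries failed; the caller decides whether to skip the page or abort

In the main loop I would then call response = get_json(api_url) and skip (or re-queue) the page when it returns None. Is that the right direction, or is something else wrong? I also noticed that my current continue after a failure neither sleeps nor increments page, so the script could hit the server in a tight loop.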