问题遇到的现象和发生背景
想在 Jupyter Notebook 中用爬虫抓取以下链接中的信息:希望“发行日期”每次自动填入上周六至本周五的日期范围,然后点击查询;如有结果,将查询到的全部信息自动导出到 Excel 文档中;如果没有查询到相关结果,则直接提示 "no result"。请问怎么实现?感谢~
网站链接:https://www.shclearing.com.cn/IssuerServicePlateform/view/client/search/ISIN_search_do.jsp
我想要达到的结果
、
想在 Jupyter Notebook 中用爬虫抓取以下链接中的信息:希望“发行日期”每次自动填入上周六至本周五的日期范围,然后点击查询;如有结果,将查询到的全部信息自动导出到 Excel 文档中;如果没有查询到相关结果,则直接提示 "no result"。请问怎么实现?感谢~
网站链接:https://www.shclearing.com.cn/IssuerServicePlateform/view/client/search/ISIN_search_do.jsp
、
比较笨的办法
import calendar
import datetime
import re
import time

import openpyxl
import parsel as parsel
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# Start Chrome through a local chromedriver binary and open the ISIN search page.
url = 'https://www.shclearing.com.cn/IssuerServicePlateform/view/client/search/ISIN_search_do.jsp'
service = Service(r"D:\Softwares\chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.get(url=url)

# For both date inputs: drop the `readonly` attribute, then set `maxlength`
# to 10 (presumably so a YYYY-MM-DD string can be typed with send_keys).
# The four scripts run in the order start/readonly, start/maxlength,
# end/readonly, end/maxlength.
for field_id in ('startDate', 'endDate'):
    for js_call in ("removeAttribute('readonly')", "setAttribute('maxlength', 10)"):
        driver.execute_script(f"document.getElementById('{field_id}').{js_call}")
def query_date_range(today=None):
    """Return the issue-date range to query as two ``YYYY-MM-DD`` strings.

    Weeks are counted Monday (0) to Sunday (6).  The range starts on the
    Saturday of the previous week and ends on the Friday of the current week,
    so the two dates are always six days apart.

    The arithmetic is done on calendar dates rather than by adding multiples
    of 86400 seconds to an epoch timestamp, so the result cannot shift by a
    day around a DST change, and both ends derive from one reference date.

    Args:
        today: Reference ``datetime.date``.  Defaults to the current local
            date.

    Returns:
        A ``(last_saturday, this_friday)`` tuple of ISO-formatted strings.
    """
    if today is None:
        today = datetime.date.today()
    weekday = today.weekday()  # 0 = Monday ... 6 = Sunday
    last_saturday = today - datetime.timedelta(days=weekday + 2)
    this_friday = today + datetime.timedelta(days=4 - weekday)
    return last_saturday.isoformat(), this_friday.isoformat()


# Dates typed into the "startDate" / "endDate" inputs further down.
last_sat, cur_fri = query_date_range()
# Type the computed range into the two date inputs, pausing 2 s in between.
driver.find_element(By.ID, "startDate").send_keys(last_sat)
time.sleep(2)
driver.find_element(By.ID, "endDate").send_keys(cur_fri)

# Click the query button, then wait a fixed 3 s before the page is read.
# (The reset button has id "button2"; it is not used in this flow.)
driver.find_element(By.XPATH, '//*[@id="button"]').click()
time.sleep(3)
# Column titles written as the first row of the worksheet.
fieldnames = ['序号', '产品代码', 'ISIN编码', '发行日', '产品中文全称', '产品中文简称', '产品英文简称']

# One table row made of exactly seven plain <td> cells; captures each cell.
_ROW_PATTERN = re.compile(
    '<td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>('
    '.*?)</td></tr>'
)
# Page count as shown in the pagination element of the result page.
_PAGE_TOTAL_PATTERN = re.compile('<span class="gray_text12"> 共(.*?)</span>')


def _read_page_total(page_html):
    """Return the page count found in *page_html*, or 1 when none is shown.

    Raises:
        ValueError: if the captured text is not an integer.
    """
    found = _PAGE_TOTAL_PATTERN.search(page_html)
    return int(found.group(1)) if found else 1


def _extract_rows(page_html):
    """Return the data rows of the ``#ISINS`` table as lists of seven strings.

    The first and last ``<tr>`` are skipped (presumably the header row and
    the pagination row -- TODO confirm against the live page).  Rows that do
    not match the seven-cell pattern are ignored, so a page without results
    yields an empty list instead of raising ``IndexError``.
    """
    table_rows = parsel.Selector(page_html).css('#ISINS > tbody > tr').getall()
    matches = (_ROW_PATTERN.search(tr_html) for tr_html in table_rows[1:-1])
    return [list(match.groups()) for match in matches if match]


try:
    page_total = _read_page_total(driver.page_source)

    # Walk through every result page and collect its rows.
    rows = []
    for page in range(1, page_total + 1):
        for row in _extract_rows(driver.page_source):
            print(row)
            rows.append(row)
        if page < page_total:
            time.sleep(3)
            # Go to the next page and give it a fixed 5 s to load.
            driver.find_element(by=By.LINK_TEXT, value='下一页').click()
            time.sleep(5)

    if not rows:
        # Serialized <tr> markup is never an empty string, so emptiness is
        # judged by whether any data row could be parsed.
        print("no result")
    else:
        # A new Workbook already contains one active sheet; no extra sheet
        # is created.
        work_book = openpyxl.Workbook()
        work_sheet = work_book.active
        work_sheet.append(fieldnames)
        for row in rows:
            work_sheet.append(row)
        # NOTE(review): saving was disabled in the original script and is
        # left disabled here; uncomment to actually write the Excel file.
        # work_book.save('test.xlsx')
finally:
    # End the whole WebDriver session even if scraping failed part-way.
    driver.quit()