jjyyyyaa 2022-12-24 19:37 Acceptance rate: 50%
244 views
Closed

Web scraping with Selenium

I'm using the code from this post: https://blog.csdn.net/feinisite/article/details/114290245?ops_request_misc=&request_id=&biz_id=102&utm_term=%E7%88%AC%E5%8F%96%E4%B8%8A%E5%B8%82%E5%85%AC%E5%8F%B8%E5%85%AC%E5%91%8A&utm_medium=distribute.pc_search_result.none-task-blog-2~all~sobaiduweb~default-1-114290245.142^v68^js_top,201^v4^add_ask,213^v2^t3_control2&spm=1018.2226.3001.4187

Some of the Selenium syntax has changed, so I updated it; now it seems to fail at the step that clicks the next-page button. How do I fix it?

"""
 巨潮资讯网数据挖掘实战--获取套期保值公告:
 1.搜索多个关键字
 2.实现翻页功能
 3.正则提取
 4.数据清洗
 5.存储到excel
"""
from selenium import webdriver
from selenium.webdriver.common.by import By  # this import was missing in the original snippet
import time
import re

def tao_bao(keyword):
    # 1. Headless browsing (optional)
    # chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')
    # browser = webdriver.Chrome(options=chrome_options)
    browser = webdriver.Chrome()  # drive a Chrome browser
    browser.maximize_window()  # maximize the window
    url = 'http://www.cninfo.com.cn/new/fulltextSearch?notautosubmit=&keyWord=' + keyword
    browser.get(url)
    time.sleep(3)
    browser.find_element(By.XPATH, '//*[@id="calendar"]/div/span/div/div/input[1]').send_keys('2014-01-01')  # start date
    browser.find_element(By.XPATH, '//*[@id="calendar"]/div/span/div/div/input[2]').send_keys('2020-01-01')  # end date
    browser.find_element(By.XPATH, '//*[@id="calendar"]/button/i').click()  # click the search button
    time.sleep(2)

    # Work out the total number of result pages
    data = browser.page_source
    p_count = '</div> <span class="total-box" style="">共 (.*?) 条 当前显示.*?条</span></div>'
    count = re.findall(p_count, data)[0]  # total number of hits, e.g. <span class="total-box">共 23 条 当前显示21-23条</span>
    pages = int(int(count) / 10)  # number of result pages (10 hits per page)
    if pages > 100:  # cap at 100 pages
        pages = 100
    # print(data)

    # 2. Page through the results and collect each page's source
    datas = []
    datas.append(data)
    for i in range(pages):
        browser.find_element(By.XPATH, '//*[@id="fulltext-search"]/div/div[1]/div[2]/div[4]/div[2]/div/button[2]/i').click()  # click the "next page" button
        time.sleep(2)
        data = browser.page_source
        datas.append(data)
        time.sleep(2)
    alldata = "".join(datas)  # join the collected page sources into one string
    # browser.quit()

    # 3. Extract fields with regular expressions
    p_title = '<a target="_blank".*?class="r-title">(.*?)</span>'
    p_href = '<a target="_blank" href="(.*?)" data-id="'
    p_shares = '<a target="_blank".*?data-seccode="(.*?)" class='  # stock code
    p_date = '<a target="_blank" href=".*?;announcementTime=(.*?)" data-id="'  # announcement date
    title = re.findall(p_title,alldata)
    href = re.findall(p_href,alldata)
    shares = re.findall(p_shares,alldata)
    date = re.findall(p_date,alldata)
    # print(title)
    # print(len(title))
    # print(href)
    # print(len(href))
    # print(shares)
    # print(len(shares))
    # print(date)
    # print(len(date))

    # 4. Data cleaning
    for i in range(len(title)):
        title[i] = re.sub('<.*?>', '', title[i])  # strip HTML tags from the title
        href[i] = 'http://www.cninfo.com.cn' + href[i]  # the original had a typo here ("cnifo")
        href[i] = re.sub('amp;', '', href[i])  # undo the &amp; HTML escaping
        # print(str(i+1) + '.' + shares[i] +'-'+ title[i] + '-' + date[i])
        # print(href[i])

    # 5. Write the results to a CSV file
    file1 = open('/Users/hsx/Desktop/爬虫/套期保值公告爬取.csv', 'a')  # append so multiple keywords accumulate in one file
    file1.write(keyword + '公告completed' + '\n' + '\n')
    for i in range(len(title)):
        file1.write(str(i+1) + '/' + shares[i] + '/' + title[i] + '/' + date[i] + href[i])
        file1.write('----------' + '\n')
    file1.close()

# 6. Define the keywords and call the function
keywords = ['套保', '套期保值']
for i in keywords:
    tao_bao(i)

The error is:


NoSuchElementException                    Traceback (most recent call last)
Input In [18], in <cell line: 89>()
     88 keywords = ['向特定对象发行股票']
     89 for i in keywords:
---> 90         tao_bao(i)

Input In [18], in tao_bao(keyword)
     43 datas.append(data)
     44 for i in range(pages):
---> 45     browser.find_element(By.XPATH,r'//*[@id="fulltext-search"]/div/div[1]/div[2]/div[4]/div[2]/div/button[2]/i').click()   # 点击下一页按钮
     46     time.sleep(2)
     47     data = browser.page_source

File D:\anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py:861, in WebDriver.find_element(self, by, value)
    858     by = By.CSS_SELECTOR
    859     value = '[name="%s"]' % value
--> 861 return self.execute(Command.FIND_ELEMENT, {"using": by, "value": value})["value"]

File D:\anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py:444, in WebDriver.execute(self, driver_command, params)
    442 response = self.command_executor.execute(driver_command, params)
    443 if response:
--> 444     self.error_handler.check_response(response)
    445     response["value"] = self._unwrap_value(response.get("value", None))
    446     return response

File D:\anaconda3\lib\site-packages\selenium\webdriver\remote\errorhandler.py:249, in ErrorHandler.check_response(self, response)
    247         alert_text = value["alert"].get("text")
    248     raise exception_class(message, screen, stacktrace, alert_text)  # type: ignore[call-arg]  # mypy is not smart enough here
--> 249 raise exception_class(message, screen, stacktrace)

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="fulltext-search"]/div/div[1]/div[2]/div[4]/div[2]/div/button[2]/i"}
  (Session info: chrome=108.0.5359.125)
Stacktrace:
Backtrace:
    (No symbol) [0x0041E563]
    (No symbol) [0x003A7FC1]
    (No symbol) [0x0029D04D]
    (No symbol) [0x002CC0B0]
    (No symbol) [0x002CC22B]
    (No symbol) [0x002FE612]
    (No symbol) [0x002E85D4]
    (No symbol) [0x002FC9EB]
    (No symbol) [0x002E8386]
    (No symbol) [0x002C163C]
    (No symbol) [0x002C269D]
    GetHandleVerifier [0x006B9B82+2658722]
    GetHandleVerifier [0x006ACB84+2605476]
    GetHandleVerifier [0x004C825A+620666]
    GetHandleVerifier [0x004C6E80+615584]
    (No symbol) [0x003B05EC]
    (No symbol) [0x003B5958]
    (No symbol) [0x003B5A45]
    (No symbol) [0x003C050B]
    BaseThreadInitThunk [0x75D16739+25]
    RtlGetFullPathName_UEx [0x77428AFF+1215]
    RtlGetFullPathName_UEx [0x77428ACD+1165]
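
The NoSuchElementException suggests the hard-coded next-page XPath no longer matches the rendered pager, or the button has not loaded (or there is no further page) when find_element runs. Below is a minimal sketch of a more defensive click, assuming the pager is an Element UI-style control exposing a button.btn-next element; that selector is an assumption and should be checked against the live page.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def click_next_page(browser, timeout=10):
    # 'button.btn-next' is a guess at the pager's next-page button; inspect the
    # page and adjust the selector if the markup differs.
    try:
        btn = WebDriverWait(browser, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'button.btn-next'))
        )
    except TimeoutException:
        return False  # no pager found on the page
    if btn.get_attribute('disabled'):
        return False  # already on the last page
    browser.execute_script("arguments[0].click();", btn)  # JS click avoids overlay/offset issues
    return True

In the paging loop, replacing the browser.find_element(...).click() line with "if not click_next_page(browser): break" stops cleanly on the last page instead of raising.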


7 answers

  • cjh4312 2022-12-24 20:10
    Awarded a ¥2.10 bounty for this question

    You're overcomplicating it. All you need is this, and you get all the data:

    import requests

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.27'}

    url = 'http://www.cninfo.com.cn/new/fulltextSearch/full?searchkey=%E5%90%91%E7%89%B9%E5%AE%9A%E5%AF%B9%E8%B1%A1%E5%8F%91%E8%A1%8C%E8%82%A1%E7%A5%A8&sdate=&edate=&isfulltext=false&sortName=pubdate&sortType=desc&pageNum=1&type='
    d = requests.get(url, headers=headers).json()  # the search backend returns the results as JSON
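
    For reference, a sketch of how that JSON endpoint could be paged and written to CSV. The key names (announcements, secCode, announcementTitle, announcementTime, adjunctUrl) and the static.cninfo.com.cn PDF base are assumptions taken from commonly observed responses of this endpoint; verify them against an actual reply before relying on them.

    import csv
    import time
    import requests

    BASE = 'http://www.cninfo.com.cn/new/fulltextSearch/full'
    headers = {'User-Agent': 'Mozilla/5.0'}

    def fetch_announcements(keyword, max_pages=100):
        rows = []
        for page in range(1, max_pages + 1):
            params = {'searchkey': keyword, 'sdate': '', 'edate': '', 'isfulltext': 'false',
                      'sortName': 'pubdate', 'sortType': 'desc', 'pageNum': page, 'type': ''}
            data = requests.get(BASE, params=params, headers=headers, timeout=10).json()
            items = data.get('announcements') or []  # key name assumed, check a real response
            if not items:
                break  # past the last page of results
            for a in items:
                rows.append([a.get('secCode'), a.get('announcementTitle'), a.get('announcementTime'),
                             'http://static.cninfo.com.cn/' + (a.get('adjunctUrl') or '')])  # PDF base assumed
            time.sleep(1)  # be polite to the server
        return rows

    rows = fetch_announcements('套期保值')
    with open('announcements.csv', 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['code', 'title', 'time', 'url'])
        writer.writerows(rows)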
    
    


