Zman777 2024-04-11 07:23 采纳率: 25%
浏览 10

关于#python#的爬虫Xpath定位问题,请各位专家解答!

最近在自学Python爬虫,在获取一个页面里的弹窗详情时遇到了问题:

# Script fragment: for pages 1..4, double-click every row of the "tscVeh" table,
# open the detail popup of each row flagged "未处理" (unhandled), read eight
# values from the popup into a dict, collect the dicts in all_data and finally
# write them to an Excel file.
# driver, flip_page, pd, sleep and the Selenium names are not defined in this
# excerpt.
# NOTE(review): as pasted, this first line sits at column 0 while everything
# below is indented — presumably the fragment was lifted from inside a function.
all_data = []
    for page in range(1, 4 + 1):  # replace 4 with the actual total number of pages
        print(f"正在提取第{page}页数据...")
        flip_page(driver, page)  # navigate to the given page number
        try:
            # Wait for the table rows to finish loading
            rows = WebDriverWait(driver, 10).until(
                EC.visibility_of_all_elements_located(
                    (By.XPATH, '//*[@id="tscVeh"]/tbody/tr')
                )
            )
            # Process only if at least one row exists
            if rows:
                for index, row in enumerate(rows, start=1):
                    # Assumes the element to double-click is the first td of each
                    # row; adjust the XPath to match the real page
                    double_click_element_xpath = (
                        f'//*[@id="tscVeh"]/tbody/tr[{index}]/td[1]'
                    )
                    double_click_element = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable(
                            (By.XPATH, double_click_element_xpath)
                        )
                    )
                    # Perform the double-click
                    action = ActionChains(driver)
                    action.move_to_element(
                        double_click_element
                    ).double_click().perform()

                    try:
                        # Collect every red <span> whose text contains "未处理"
                        # (unhandled); each one marks a row whose details are read
                        unhandled_rows = WebDriverWait(driver, 10).until(
                            EC.presence_of_all_elements_located(
                                (
                                    By.XPATH,
                                    "//span[contains(text(), '未处理') and contains(@style, 'color: red')]",
                                )
                            )
                        )
                        # NOTE: this loop variable reuses the name `row` from the
                        # enclosing enumerate(rows) loop.
                        for row in unhandled_rows:
                            # Find the tr element that contains the span
                            tr_element = row.find_element(By.XPATH, "..").find_element(
                                By.XPATH, ".."
                            )  # span -> td -> tr
                            view_details_button = tr_element.find_element(
                                By.XPATH, ".//td[last()]/a[@class='view']"
                            )
                            # Click the "view details" button
                            view_details_button.click()
                            wait = WebDriverWait(driver, 10)
                            right_div = wait.until(
                                EC.visibility_of_element_located(
                                    (
                                        By.XPATH,
                                        "//*[@id='info']/div",
                                    )
                                )
                            )

                            # NOTE(review): an XPath that starts with "//" is
                            # evaluated against the whole document even when it is
                            # called on right_div; a leading "." would be needed to
                            # scope the search to that element.
                            title2_elements = right_div.find_elements(
                                By.XPATH, "//*[@class='info']/div"
                            )
                            # NOTE(review): `element` is never used inside this
                            # loop. Every pass re-reads the same eight absolute
                            # XPaths under id='info' and appends one more dict, so
                            # one popup contributes len(title2_elements) rows that
                            # are presumably identical. This matches the duplicated
                            # rows in the printed DataFrame.
                            for element in title2_elements:
                                row2_data = {}
                                for col_num in range(1, 9):  # assumes the first 8 columns are wanted
                                    xpath = f"//*[@id='info']/div[{col_num}]/span[2]"
                                    # Use an explicit wait until the element is visible
                                    try:
                                        ch_element = wait.until(
                                            EC.visibility_of_element_located(
                                                (By.XPATH, xpath)
                                            )
                                        )
                                        row2_data[f"Column{col_num}"] = ch_element.text
                                        print(ch_element.text)
                                    except TimeoutException:
                                        print(f"Timed out waiting for element {xpath}")
                                        continue  # on timeout, skip the rest of this iteration and move to the next column
                                    # NOTE(review): these two statements repeat the
                                    # assignment and print already done in the try
                                    # body above, so each value is printed twice.
                                    row2_data[f"Column{col_num}"] = ch_element.text
                                    print(ch_element.text)

                                all_data.append(row2_data)
                                # The close button is looked up on every pass of this
                                # loop but only clicked once, after the loop ends.
                                close_button = WebDriverWait(driver, 10).until(
                                    EC.element_to_be_clickable(
                                        (By.CSS_SELECTOR, ".aui_close")
                                    )
                                )
                            # NOTE(review): close_button is only assigned inside the
                            # loop above; if title2_elements is empty it is not
                            # assigned during this iteration.
                            driver.execute_script("arguments[0].click();", close_button)
                            print("已关闭弹窗")
                            # Rebuild the DataFrame from everything collected so far
                            df = pd.DataFrame(all_data)
                            print(df)
                        # Wait for the back button to become clickable
                        back_button = WebDriverWait(driver, 10).until(
                            EC.element_to_be_clickable(
                                (
                                    By.XPATH,
                                    "//*[@id='mem-content']/div[1]/div/h3/a",
                                )
                            )
                        )
                        # Click the back button
                        back_button.click()
                        print("已点击返回按钮,回到A页面处理下一条数据")
                        # print("all_data", all_data)
                    # NOTE(review): this handler catches a timeout from any wait in
                    # the try body above (popup, close button, back button), not
                    # only the search for "未处理" spans.
                    except TimeoutException:
                        # Wait for the back button to become clickable
                        back_button = WebDriverWait(driver, 10).until(
                            EC.element_to_be_clickable(
                                (By.XPATH, "//*[@id='mem-content']/div[1]/div/h3/a")
                            )
                        )
                        # Click the back button
                        back_button.click()
                        print("已点击返回按钮,回到A页面处理下一条数据")
                        print("未找到包含'未处理'字样的行返回A页面")
        # Reached when the table-row wait, the double-click target wait, or the
        # back-button wait inside the inner handler times out; the rest of the
        # current page is skipped.
        except TimeoutException:
            print("未找到返回按钮,无法回到Aaa页面")
    # NOTE(review): as pasted, the lines below sit at the same indent as the
    # `for page` loop, so they run once after the loop rather than once per
    # page — confirm the intended indentation.
    sleep(3)
    print(f"第{page}页数据提取完成。")

    # NOTE(review): df is only assigned after a popup has been read; if no popup
    # was processed, df is unbound at this point.
    df.to_excel("SJSD违章明细2.xlsx", index=False, engine="openpyxl")
    print("所有数据提取完成并已保存到Excel文件")
    driver.quit()

整个代码运行过程中,以下代码块在执行时输出了重复信息:

 # Click the "view details" button
                            # (Excerpt from inside a loop: view_details_button,
                            # driver and all_data are bound earlier, outside this
                            # excerpt.)
                            view_details_button.click()
                            wait = WebDriverWait(driver, 10)
                            right_div = wait.until(
                                EC.visibility_of_element_located(
                                    (
                                        By.XPATH,
                                        "//*[@id='info']/div",
                                    )
                                )
                            )

                            # NOTE(review): an XPath that starts with "//" is
                            # evaluated against the whole document even when it is
                            # called on right_div; a leading "." would be needed to
                            # scope the search to that element.
                            title2_elements = right_div.find_elements(
                                By.XPATH, "//*[@class='info']/div"
                            )
                            # NOTE(review): `element` is never used inside this
                            # loop. Every pass re-reads the same eight absolute
                            # XPaths under id='info' and appends one more dict, so
                            # one popup contributes len(title2_elements) rows that
                            # are presumably identical.
                            for element in title2_elements:
                                row2_data = {}
                                for col_num in range(1, 9):  # assumes the first 8 columns are wanted
                                    xpath = f"//*[@id='info']/div[{col_num}]/span[2]"
                                    # Use an explicit wait until the element is visible
                                    try:
                                        ch_element = wait.until(
                                            EC.visibility_of_element_located(
                                                (By.XPATH, xpath)
                                            )
                                        )
                                        row2_data[f"Column{col_num}"] = ch_element.text
                                        print(ch_element.text)
                                    except TimeoutException:
                                        print(f"Timed out waiting for element {xpath}")
                                        continue  # on timeout, skip the rest of this iteration and move to the next column
                                    # NOTE(review): these two statements repeat the
                                    # assignment and print already done in the try
                                    # body above, so each value is printed twice.
                                    row2_data[f"Column{col_num}"] = ch_element.text
                                    print(ch_element.text)

                                all_data.append(row2_data)
                                # The close button is looked up on every pass of this
                                # loop but only clicked once, after the loop ends.
                                close_button = WebDriverWait(driver, 10).until(
                                    EC.element_to_be_clickable(
                                        (By.CSS_SELECTOR, ".aui_close")
                                    )
                                )
                            # NOTE(review): close_button is only assigned inside the
                            # loop above; if title2_elements is empty it is not
                            # assigned by this excerpt.
                            driver.execute_script("arguments[0].click();", close_button)
                            print("已关闭弹窗")
                            # Rebuild the DataFrame from everything collected so far
                            df = pd.DataFrame(all_data)
                            print(df)

print(df) 输出以下内容:单条内容重复了15次,但我只希望成功获取其中一条内容就可以了。

Column1  Column2           Column3  ...       Column6 Column7 Column8

0 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
1 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
2 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
3 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
4 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
5 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
6 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
7 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
8 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
9 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
10 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
11 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
12 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
13 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
14 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
15 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50

以下是要获取内容页面的结构

<div class="right" style="float: left;width: 50%">
                <div style="font-size: 14px;font-weight: 600">信息详情</div>
                <div class="info" id="info" style="margin-top: 20px">
                    <div>
                        <span class="title"></span>
                        <span class="title2 hphm">获取值1:</span>
                    </div>
                    <div>
                        <span class="title"></span>
                        <span class="title2 hpzlStr">小型</span>
                    </div>
                    <div>
                        <span class="title"></span>
                        <span class="title2 wfsj">2024-03-23 21:51</span>
                    </div>
                    <div>
                        <span class="title"></span>
                        <span class="title2 wfdz">获取值4</span>
                    </div>
                    <div>
                        <span class="title"></span>
                        <span class="title2 wfms">获取值5</span>
                    </div>
                    <div>
                        <span class="title"></span>
                        <span class="title2 cjjgmc">获取值6</span>
                    </div>
                    <div>
                        <span class="title"></span>
                        <span class="title2 wfjfs">3</span>
                    </div>
                    <div>
                        <span class="title"></span>
                        <span class="title2 fkje">50</span>
                    </div>
                </div>
            </div>
        </div>
    </div>

请各位Python专家们帮忙看看,总感觉自己找的这几个XPath有问题,但又不知道怎么解决:

 By.XPATH, "//*[@id='info']/div",
By.XPATH, "//*[@class='info']/div"
xpath = f"//*[@id='info']/div[{col_num}]/span[2]"

是不是有更好的定位方式去解决?
最后跑完整个代码,保存到XLSX表中的数据也是不全的:总共有40多组信息,但表格里只有8组*15次的重复,也不知道到底是哪里出现了问题,求救!
  • 写回答

8条回答 默认 最新

  • Zman777 2024-04-13 09:59
    关注

    尝试了以上几位给的代码,都没有得到想要的结果。我自己再次修改后,还是未得到想要的结果:测试输出时,所有 row_data、all_data 都会重复输出8次,不知道是什么原因。感觉是 title2_elements 这里的XPath写错了,但又不知道怎么改,求解!以下是修改后的代码:

    # Revised attempt: for every row of the "applyList" table, click the link in
    # the 9th cell, read eight values from the popup under id='info', append
    # them to all_data, close the popup and rebuild the DataFrame.
    # rowsB, driver, all_data and pd are bound outside this excerpt.
    if rowsB:
                                for index, rowB in enumerate(rowsB, start=1):
                                    view_click_element_xpath = (
                                        f'//*[@id="applyList"]/tbody/tr[{index}]/td[9]/a'
                                    )
                                    view_click_element = WebDriverWait(driver, 10).until(
                                        EC.element_to_be_clickable(
                                            (By.XPATH, view_click_element_xpath)
                                        )
                                    )
                                    view_click_element.click()  # click the view-details link
                                    # Wait for the popup to finish loading
                                    popup_element = WebDriverWait(driver, 10).until(
                                        EC.presence_of_element_located(
                                            (
                                                By.XPATH,
                                                "//*[@id='info']",
                                            )
                                        )
                                    )
                                    # NOTE(review): an XPath that starts with "//" is
                                    # evaluated against the whole document even when
                                    # it is called on popup_element.
                                    title2_elements = popup_element.find_elements(
                                        By.XPATH, "//*[@class='info']/div/span[2]"
                                    )
                                    # NOTE(review): `element` is never used inside
                                    # this loop. Every pass re-reads the same eight
                                    # absolute XPaths and appends one more dict, so
                                    # each popup contributes len(title2_elements)
                                    # rows. That is presumably 8 for the page
                                    # structure shown in the post, matching the
                                    # reported 8-fold repetition.
                                    for element in title2_elements:
                                        row_data = {}
                                        for col_num in range(1, 9):  # assumes the first 8 columns are wanted
                                            xpath = (
                                                f"//*[@id='info']/div[{col_num}]/span[2]"
                                            )
                                            print("col_num:", col_num)
                                            # Use an explicit wait until the element is visible
                                            try:
                                                ch_element = WebDriverWait(
                                                    driver, 10
                                                ).until(
                                                    EC.visibility_of_element_located(
                                                        (By.XPATH, xpath)
                                                    )
                                                )
                                                row_data[f"Column{col_num}"] = (
                                                    ch_element.text
                                                )
                                                print("ch_element.text:", ch_element.text)
                                            except TimeoutException:
                                                # On timeout the column is stored as
                                                # an empty string.
                                                row_data[f"Column{col_num}"] = ""
                                                print(
                                                    f"Timed out waiting for element {xpath}"
                                                )
                                        all_data.append(row_data)
                                        print("row_data:", row_data)
                                    # Click the element with class 'aui_buttons' to
                                    # dismiss the popup
                                    close_button = WebDriverWait(driver, 10).until(
                                        EC.element_to_be_clickable(
                                            (By.XPATH, "//*[@class='aui_buttons']")
                                        )
                                    )
                                    close_button.click()
                                    print("已关闭弹窗")
                                    print("处理弹窗内的数据all_data:", all_data)
                                    # Rebuild the DataFrame from everything collected so far
                                    df = pd.DataFrame(all_data)
                                    print(df)
    
    

    还有html的结构

    <div class="info xh-highlight" id="info" style="margin-top: 20px">
                        <div class="">
                            <span class="title">xxxx:</span>
                            <span class="title2 hphm">xxxxxx</span>
                        </div>
                        <div class="">
                            <span class="title">xxxxxx</span>
                            <span class="title2 hpzlStr">xxxxxx</span>
                        </div>
                        <div class="">
                            <span class="title">xxxxxx</span>
                            <span class="title2 wfsj">xxxxxx</span>
                        </div>
                        <div class="">
                            <span class="title">xxxxxx</span>
                            <span class="title2 wfdz">xxxxxx</span>
                        </div>
                        <div class="">
                            <span class="title">xxxxxx</span>
                            <span class="title2 wfms">xxxxxx</span>
                        </div>
                        <div class="">
                            <span class="title">xxxxxx</span>
                            <span class="title2 cjjgmc">xxxxxx</span>
                        </div>
                        <div class="">
                            <span class="title">xxxxxx</span>
                            <span class="title2 wfjfs">xxxxxx</span>
                        </div>
                        <div class="">
                            <span class="title">xxxxxx</span>
                            <span class="title2 fkje">xxxxxx</span>
                        </div>
                    </div>
    
    

    我只需要这个页面里所有 title2 的值,获取一次就可以,不希望输出8次。

    评论

报告相同问题?

问题事件

  • 创建了问题 4月11日