最近在自学Python爬虫在获取一个页面里的弹窗详情时遇到了问题:
all_data = []
for page in range(1, 4 + 1): # 替换总页数为实际的页码数量
print(f"正在提取第{page}页数据...")
flip_page(driver, page) # 翻页到指定页码
try:
# 等待表格行加载完成
rows = WebDriverWait(driver, 10).until(
EC.visibility_of_all_elements_located(
(By.XPATH, '//*[@id="tscVeh"]/tbody/tr')
)
)
# 如果有行存在,则进行处理
if rows:
for index, row in enumerate(rows, start=1):
# 假设双击的元素位于每行的第一个td中,您可以根据实际情况调整XPath
double_click_element_xpath = (
f'//*[@id="tscVeh"]/tbody/tr[{index}]/td[1]'
)
double_click_element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable(
(By.XPATH, double_click_element_xpath)
)
)
# 执行双击操作
action = ActionChains(driver)
action.move_to_element(
double_click_element
).double_click().perform()
try:
# 假设行是通过特定的元素或类来定义的,例如<div>或<tr>,并且包含“未处理”字样
unhandled_rows = WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located(
(
By.XPATH,
"//span[contains(text(), '未处理') and contains(@style, 'color: red')]",
)
)
)
for row in unhandled_rows:
# 找到span所在的tr元素
tr_element = row.find_element(By.XPATH, "..").find_element(
By.XPATH, ".."
) # span -> td -> tr
view_details_button = tr_element.find_element(
By.XPATH, ".//td[last()]/a[@class='view']"
)
# 点击“查看详情”按钮
view_details_button.click()
wait = WebDriverWait(driver, 10)
right_div = wait.until(
EC.visibility_of_element_located(
(
By.XPATH,
"//*[@id='info']/div",
)
)
)
title2_elements = right_div.find_elements(
By.XPATH, "//*[@class='info']/div"
)
for element in title2_elements:
row2_data = {}
for col_num in range(1, 9): # 假设你需要前8列的数据
xpath = f"//*[@id='info']/div[{col_num}]/span[2]"
# 使用显式等待等待元素可见
try:
ch_element = wait.until(
EC.visibility_of_element_located(
(By.XPATH, xpath)
)
)
row2_data[f"Column{col_num}"] = ch_element.text
print(ch_element.text)
except TimeoutException:
print(f"Timed out waiting for element {xpath}")
continue # 如果超时,则跳过当前循环的剩余部分,继续下一个循环
row2_data[f"Column{col_num}"] = ch_element.text
print(ch_element.text)
all_data.append(row2_data)
close_button = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable(
(By.CSS_SELECTOR, ".aui_close")
)
)
driver.execute_script("arguments[0].click();", close_button)
print("已关闭弹窗")
df = pd.DataFrame(all_data)
print(df)
# 等待返回按钮变得可点击
back_button = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable(
(
By.XPATH,
"//*[@id='mem-content']/div[1]/div/h3/a",
)
)
)
# 点击返回按钮
back_button.click()
print("已点击返回按钮,回到A页面处理下一条数据")
# print("all_data", all_data)
except TimeoutException:
# 等待返回按钮变得可点击
back_button = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable(
(By.XPATH, "//*[@id='mem-content']/div[1]/div/h3/a")
)
)
# 点击返回按钮
back_button.click()
print("已点击返回按钮,回到A页面处理下一条数据")
print("未找到包含'未处理'字样的行返回A页面")
except TimeoutException:
print("未找到返回按钮,无法回到Aaa页面")
sleep(3)
print(f"第{page}页数据提取完成。")
df.to_excel("SJSD违章明细2.xlsx", index=False, engine="openpyxl")
print("所有数据提取完成并已保存到Excel文件")
driver.quit()
整个代码运行过程中以下代码块执行过程中输出了重复信息
# 点击“查看详情”按钮
view_details_button.click()
wait = WebDriverWait(driver, 10)
right_div = wait.until(
EC.visibility_of_element_located(
(
By.XPATH,
"//*[@id='info']/div",
)
)
)
title2_elements = right_div.find_elements(
By.XPATH, "//*[@class='info']/div"
)
for element in title2_elements:
row2_data = {}
for col_num in range(1, 9): # 假设你需要前8列的数据
xpath = f"//*[@id='info']/div[{col_num}]/span[2]"
# 使用显式等待等待元素可见
try:
ch_element = wait.until(
EC.visibility_of_element_located(
(By.XPATH, xpath)
)
)
row2_data[f"Column{col_num}"] = ch_element.text
print(ch_element.text)
except TimeoutException:
print(f"Timed out waiting for element {xpath}")
continue # 如果超时,则跳过当前循环的剩余部分,继续下一个循环
row2_data[f"Column{col_num}"] = ch_element.text
print(ch_element.text)
all_data.append(row2_data)
close_button = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable(
(By.CSS_SELECTOR, ".aui_close")
)
)
driver.execute_script("arguments[0].click();", close_button)
print("已关闭弹窗")
df = pd.DataFrame(all_data)
print(df)
print(df)输出以下内容 单条内容重复了15次 但我只希望成功获取其中一条内容就可以了
Column1 Column2 Column3 ... Column6 Column7 Column80 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
1 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
2 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
3 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
4 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
5 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
6 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
7 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
8 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
9 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
10 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
11 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
12 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
13 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
14 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
15 获取值1 小型 2024-03-23 21:51 ... 获取值6 3 50
以下是要获取内容页面的结构
<div class="right" style="float: left;width: 50%">
<div style="font-size: 14px;font-weight: 600">信息详情</div>
<div class="info" id="info" style="margin-top: 20px">
<div>
<span class="title"></span>
<span class="title2 hphm">获取值1:</span>
</div>
<div>
<span class="title"></span>
<span class="title2 hpzlStr">小型</span>
</div>
<div>
<span class="title"></span>
<span class="title2 wfsj">2024-03-23 21:51</span>
</div>
<div>
<span class="title"></span>
<span class="title2 wfdz">获取值4</span>
</div>
<div>
<span class="title"></span>
<span class="title2 wfms">获取值5</span>
</div>
<div>
<span class="title"></span>
<span class="title2 cjjgmc">获取值6</span>
</div>
<div>
<span class="title"></span>
<span class="title2 wfjfs">3</span>
</div>
<div>
<span class="title"></span>
<span class="title2 fkje">50</span>
</div>
</div>
</div>
</div>
</div>
请各位python专家们帮忙看看 总感觉自己找的这几个Xpath有问题 但又不知道怎么解决
By.XPATH, "//*[@id='info']/div",
By.XPATH, "//*[@class='info']/div"
xpath = f"//*[@id='info']/div[{col_num}]/span[2]"
```是不是有更好的定位方式去解决!!
最后跑完整个代码将数据保存到XLSX表中的内容也是不全的 总共有40多组信息 但表格里只有8组*15次的重复 也不知道到底是哪里出现了问题 求救!!!!