代码:
import numpy as np
import pandas as pd
import time
from urllib import request
import os
# 导入模块
from selenium import webdriver
# 指定网址 华润万家 https://tousu.sina.com.cn/company/view/?couid=2015149711
# Page to scrape. The pasted literal ended in "%27" (a URL-encoded apostrophe)
# where the closing quote belonged, which left the string unterminated.
url = 'https://tousu.sina.com.cn/company/view/?couid=2015149711'
# Start Chrome through the local chromedriver binary and load the page.
# NOTE(review): passing the driver path positionally is the Selenium 3 calling
# convention; confirm it matches the installed Selenium version.
driver = webdriver.Chrome(r"C:\Users\CchengFengF\Desktop\大四上期末\chromedriver.exe")
driver.get(url)
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
# 设置高度变量
# Target number of PAGE_DOWN key presses, and the count sent so far.
aims = 10
num = 0
# Keep paging until the counter reaches the target.
while num != aims:
    # Simulate the PgDn key to scroll the page, then wait one second
    # before the next press.
    ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
    time.sleep(1)
    num += 1
    print(num)
# Capture the HTML the browser currently holds for the page (read after the
# PAGE_DOWN loop above has finished).
pageSource = driver.page_source
# Bare expression with no effect in a plain script; presumably left over from
# a notebook cell where it echoed the HTML -- TODO confirm.
pageSource
from bs4 import BeautifulSoup
# Parse the HTML captured from the browser.
soup = BeautifulSoup(pageSource, 'html.parser')


def _collect_text(parsed, container_class, child_tag):
    """Return the text of the first *child_tag* in each matching <div>.

    Looks up every ``<div class=container_class>`` in *parsed*, takes the
    first *child_tag* element inside it, prints its text and collects it.
    Containers that have no such child are skipped; ``find`` returns
    ``None`` for them, and reading ``.text`` on that would raise
    ``AttributeError``.
    """
    texts = []
    for container in parsed.find_all('div', class_=container_class):
        child = container.find(child_tag)
        if child is None:
            continue
        print(child.text)
        texts.append(child.text)
    return texts


titles = _collect_text(soup, 'm-i-tit', 'h2')
contents = _collect_text(soup, 'm-i-cont', 'p')
labels = _collect_text(soup, 'm-i-list', 'li')
# NOTE(review): if all three counts are 0, find_all matched no <div> with
# these class names in pageSource. Check the class names against the captured
# HTML, and that the content had been rendered when page_source was read.
print(len(titles), len(contents), len(labels))
最后 print 出来的三个长度都是 0