工作中需要获取各招标网站的信息,尝试用python进行抓取。
过程中,遇到一个招标网站(https://ec.chng.com.cn/ecmall/more.do?type=103)抓取不了:分别试了用 requests 直接请求和用 Selenium 模拟浏览器操作,都没有解决。求帮忙,谢谢!
requests请求:
import requests
from lxml import etree
# Crawl function
def get_zb_info(url, header, pama, timeout=10):
    """Fetch a page with ``requests`` and print the decoded body.

    Args:
        url: Address to request.
        header: Dict of HTTP request headers (passed as ``headers=``).
        pama: Dict of query-string parameters (passed as ``params=``).
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` would wait indefinitely on a stalled
            connection.

    Returns:
        The response body decoded as UTF-8 text, so the caller can parse it.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    response = requests.get(
        url=url, headers=header, params=pama, timeout=timeout
    )
    # Force UTF-8 instead of relying on the charset requests guesses.
    response.encoding = 'utf-8'
    wb_data = response.text
    print('抓取到以下内容:', wb_data)
    # Data parsing starts here ...
    return wb_data
if __name__ == '__main__':
    # Script entry point: request the tender listing page (type=103).
    url = 'https://ec.chng.com.cn/ecmall/more.do'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        # Only advertise encodings requests can always decode. 'br' and
        # 'zstd' need optional packages; without them a body compressed
        # that way comes back undecoded.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': 'S6J51OuUjLieO=5AjWEiOna9Nm3YKgPv7rcuukAiPK8d8Xh1p..nidvUaptoBbPw.whSm2JGJQq.xgMMJH_Ysv9oJnwB.YKOuKiLA; S6J51OuUjLieP=cpu.dPoowrPlOqm3LaFqcgf4QPxNoMp0OqfGvHpahI1KoiImS7cEs3EP6tZpIppclExhsnhNKyZ51Vg3aRbpw1295vGOn0UIOCnz1Ok9tpX2VIDar9byqvoAeU56pyD1pSZufzyTThSS6Mr7IHSs4b2ab_CTkDB.cmsVLWJE0TqRrWOOsxHdQ.a6Pjs6NjlOmST.99_GdzxLZW1nM2GlnkrzZQVvl6yryzK1GS43r67',
        # The Cookie value was copied from the request headers (F12 dev
        # tools) of a real browser visit to
        # "https://ec.chng.com.cn/ecmall/more.do?type=103".
        # With it pasted here the tender listings are fetched correctly,
        # but it only stays valid for a short time.
    }
    pama = {
        'type': 103,
    }
    get_zb_info(url, header, pama)
Selenium模拟浏览器操作:
import time,os
#引入selenium库中的webdriver模块,实现对网页的操作
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys # 模拟键盘输入
# Target page. The original also kept https://bot.sannysoft.com/ as a
# commented-out alternative URL.
url = 'https://ec.chng.com.cn/ecmall/more.do?type=103'

# Create the browser object. The options class must match the driver that
# is launched (Edge); the original passed ChromeOptions to webdriver.Edge.
options = webdriver.EdgeOptions()
driver = webdriver.Edge(options=options)

try:
    # Hide the browser fingerprint: register stealth.min.js so it is
    # evaluated on every new document.
    # Read it explicitly as UTF-8 so the platform default encoding cannot
    # cause a decode error.
    with open('stealth.min.js', encoding='utf-8') as f:
        js = f.read()
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": js
    })

    # Keep the site from detecting Selenium through navigator.webdriver
    # by making the property report false.
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => false
            })
        """
    })

    driver.implicitly_wait(3)

    # Open the page.
    print('准备打开网址')
    driver.get(url)
    print(f'已打开网址:{url}')

    # Fixed pause before reading the page (presumably to let scripts
    # finish rendering -- the original gives no reason).
    time.sleep(2)
    print(f'当前页面title:{driver.title}')
    print(f'当前页内容:{str(driver.page_source)}')
finally:
    # Always end the session, even if a step above raised, so no browser
    # or driver process is left behind. quit() also closes the window, so
    # a separate close() call is unnecessary.
    driver.quit()
Selenium模拟浏览器操作,一直显示空白网页:
