# Python 代码
import time
import re
import codecs
from selenium import webdriver
from pony.orm import *
# Pony ORM database object; it is bound to a concrete provider further below.
db = Database()


class Star(db.Entity):
    """One scraped celebrity record stored through Pony ORM.

    Every attribute except the primary key is ``Optional``, so a row can be
    saved even when some fields could not be extracted from the page.
    """

    # Auto-incrementing surrogate primary key.
    id = PrimaryKey(int, column='id', auto=True)
    # Display name of the celebrity.
    name = Optional(str, column='name')
    # Gender label (the scraper stores the literal text it filtered on).
    gender = Optional(str, column='gender')
    # Link of the celebrity entry as found on the search-result page.
    href = Optional(str, column='href')
    # Date components, stored as three separate integers.
    # NOTE(review): presumably the birth date -- confirm against the scraper.
    year = Optional(int, column='year')
    month = Optional(int, column='month')
    day = Optional(int, column='day')
    # NOTE(review): "xz" presumably abbreviates "xingzuo" (zodiac sign).
    xz = Optional(str, column='xz')
    # Raw HTML of the page the record was extracted from.
    html = Optional(str, column='html')
    # NOTE(review): presumably the birthplace text -- confirm.
    address = Optional(str, column='address')
    # Height as an integer; the unit is not established in this file.
    height = Optional(int, column='height')
# Database bootstrap: pause briefly, bind the ORM to the local SQLite file,
# build the entity mapping (creating any missing tables), then turn on
# Pony's SQL debug output.
time.sleep(1)
sqlite_settings = {'provider': 'sqlite', 'filename': 'e:/python/star.sqlite'}
db.bind(**sqlite_settings)
db.generate_mapping(create_tables=True)
set_sql_debug(True)
# Open the Baidu search-result page for the query "明星" (celebrities).
driver = webdriver.Chrome()
url = 'https://www.baidu.com/s?wd=%E6%98%8E%E6%98%9F&rsv_spt=1&rsv_iqid=0xbe4b76860031fb66&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&rqlang=&tn=baiduhome_pg&ch=&rsv_enter=1&rsv_dl=ib&inputT=2978'
driver.get(url)
# The find_element_by_* helpers were removed in Selenium 4.3; the generic
# find_element(by, value) form works on both Selenium 3 and 4.  The plain
# string "xpath" is the value of By.XPATH, so no extra import is needed.
# Click the "女" (female) filter.
driver.find_element("xpath", "//div[@id='1']/div/div/div/div[2]/p/span[4]").click()
# Click the "内地" (mainland) filter.
driver.find_element("xpath", "//div[@id='1']/div/div/div/div[2]/p[2]/span[3]").click()
# Give the page a moment to refresh the result list after filtering.
time.sleep(1)
def _first_int(text):
    """Return the first run of digits in *text* as an int, or None if absent."""
    found = re.search(r'\d+', text or '')
    return int(found.group()) if found else None


# All patterns are loop-invariant, so they are compiled once up front.

# One celebrity card on the result page: group 1 = href, group 2 = name.
star_link_re = re.compile(r'<p class="c-gap-top-small"><a href="([\d\D]*?)" title="(.{1,20})" target="_blank">[\d\D]*?</a></p>')

# Result entry on the celebrity's own search page:
# group 1 = target URL, group 2 = four-character site label after the "_".
baike_link_re = re.compile(r"""<div class="result-op c-container xpath-log" srcid="1547"[\d\D]*?<h3 class="t c-gap-bottom-small">
<a href="([\d\D]*?)" target="_blank"><em>.{1,30}</em>_(.{4})</a>
</h3>""")

# Basic-info box of the Baike page.  The pattern has exactly six groups:
#   1 = value of the 星座 (zodiac sign) entry
#   2 = value of the entry that directly follows 血型 (blood type)
#   3 = value of the 出生地 (birthplace) entry
#   4, 5, 6 = year, month, day of the 出生日期 (birth date) entry
# Note that the blood-type value is matched literally as "A型".
baike_info_re = re.compile(r"""<dt class="basicInfo-item name">星 座</dt>
<dd class="basicInfo-item value">
([\d\D]*?)
</dd>
<dt class="basicInfo-item name">血 型</dt>
<dd class="basicInfo-item value">
A型
</dd>
<dt class="basicInfo-item name">[\d\D]*?
<dd class="basicInfo-item value">
([\d\D]*?)
</dd>
[\d\D]*?ss="basicInfo-item name">出生地</dt>
<dd class="basicInfo-item value">
([\d\D]*?)
</dd>
<dt class="basicInfo-item name">出生日期</dt>
<dd class="basicInfo-item value">
([\d\D]*?)年([\d\D]*?)月([\d\D]*?)日[\d\D]*?</dd>""")

# "下一页" (next page) button carrying an inline "display:" style.
next_styled_re = re.compile(r'<span class="opui-page-next OP_LOG_BTN" style="display: ([\d\D]*?)">下一页</span>')

try:
    while True:
        html = driver.page_source
        # Visit every celebrity listed on the current result page.
        for match in star_link_re.finditer(html):
            hrefs = match.group(1)
            names = match.group(2)

            # Load the celebrity's search page in a throw-away browser and
            # always close it, even when no Baike link turns up later.
            driver2 = webdriver.Chrome()
            try:
                driver2.get('https://www.baidu.com' + hrefs)
                html2 = driver2.page_source
            finally:
                driver2.quit()

            for match4 in baike_link_re.finditer(html2):
                # Only follow results labelled "百度百科" (Baidu Baike).
                if match4.group(2) != "百度百科":
                    continue

                driver3 = webdriver.Chrome()
                try:
                    driver3.get(match4.group(1))
                    time.sleep(1)
                    html3 = driver3.page_source
                finally:
                    driver3.quit()

                for match3 in baike_info_re.finditer(html3):
                    print("2")
                    # Pony requires a db_session outside interactive mode;
                    # leaving the "with" block commits the new row.
                    with db_session:
                        Star(
                            name=names,
                            gender="女",
                            href=hrefs,
                            year=_first_int(match3.group(4)),
                            month=_first_int(match3.group(5)),
                            day=_first_int(match3.group(6)),
                            xz=match3.group(1),
                            html=html3,
                            address=match3.group(3),
                            # NOTE(review): group 2 is the entry right after
                            # blood type; presumably height -- confirm.
                            height=_first_int(match3.group(2)),
                        )

        # Stop when the next-page button carries an inline "display:" style
        # (presumably how the page hides it on the last page -- verify);
        # otherwise click it and continue with the following page.
        if next_styled_re.search(html):
            break
        driver.find_element("xpath", "//div[@id='1']/div/div/div[2]/div[2]/p/span[6]").click()
        time.sleep(1)
finally:
    # Close the main browser; no file handle is opened in this script, so
    # there is nothing else to release.
    driver.quit()