import requests
import urllib.request
from bs4 import BeautifulSoup
import re
import os
def collect_category(url):
    """Collect the category page URLs linked from the product menu of *url*.

    Fetches *url*, selects every anchor matching
    ``div.pro_menu > dl > dd > a`` and resolves its ``href`` against the
    site root.

    Args:
        url: Address of the page that contains the product menu.

    Returns:
        A list of absolute URL strings, in document order. Anchors without
        an ``href`` and links that do not resolve to http(s) are left out.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
    """
    from urllib.parse import urljoin

    site_root = "http://www.delixi-electric.com/"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
    # Without a timeout a stalled server would block the script forever.
    r = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")

    category_urls = []
    for anchor in soup.select("div.pro_menu > dl > dd > a"):
        href = anchor.get("href")
        if not href:
            # An anchor without href used to raise TypeError on concatenation.
            continue
        # urljoin copes with relative, leading-slash and absolute hrefs alike.
        absolute = urljoin(site_root, href)
        if absolute.startswith(("http://", "https://")):
            category_urls.append(absolute)
    print (category_urls)
    return category_urls
def collect_items(url):
    """Collect the product page URLs listed on one category page.

    Fetches *url*, selects every anchor matching ``#pro_list > li > a`` and
    resolves its ``href`` against the site root.

    Args:
        url: Address of a category page.

    Returns:
        A list of absolute URL strings, in document order. Anchors without
        an ``href`` and links that do not resolve to http(s) are left out.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
    """
    from urllib.parse import urljoin

    site_root = "http://www.delixi-electric.com/"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
    # Without a timeout a stalled server would block the script forever.
    r = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")

    items_urls = []
    for anchor in soup.select("#pro_list > li > a"):
        href = anchor.get("href")
        if not href:
            # An anchor without href used to raise TypeError on concatenation.
            continue
        # urljoin copes with relative, leading-slash and absolute hrefs alike.
        absolute = urljoin(site_root, href)
        if absolute.startswith(("http://", "https://")):
            items_urls.append(absolute)
    return items_urls
def download_pdf(url):
    """Download every PDF linked from one product page into ``D:/delixi``.

    Fetches *url*, selects the download anchors, and saves each target as
    ``D:/delixi/<link text>.pdf``. Files that already exist are skipped.
    Progress is reported with ``print``; the "downloaded" message reads the
    module-level counter ``n``, which the calling script must define before
    this function is called.

    Args:
        url: Address of a product page.

    Raises:
        requests.RequestException: If the product page cannot be fetched.
        urllib.error.URLError: If a PDF cannot be fetched.
        OSError: If a file cannot be written.
    """
    from urllib.parse import urljoin

    site_root = "http://www.delixi-electric.com/"
    save_dir = 'D:/delixi'
    block_sz = 8192
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
    # Without a timeout a stalled server would block the script forever.
    r = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")
    urls = soup.select("body > div:nth-child(10) > div > div:nth-child(3) > ul > li > a")
    if urls:
        # Make sure the target folder exists before the first open().
        os.makedirs(save_dir, exist_ok=True)
    for i in urls:
        # Drop every character Windows forbids in file names (the original
        # removed only "/"), plus control characters such as newlines.
        name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "", i.get_text())
        print (name)
        target = '%s/%s.pdf' % (save_dir, name)
        if os.path.exists(target):
            print("文件已存在")
            continue
        href = i.get("href")
        if not href:
            # An anchor without href used to raise TypeError on concatenation.
            continue
        pdf_url = urljoin(site_root, href)
        print (pdf_url)
        # Send the same User-Agent as the page requests do.
        request = urllib.request.Request(pdf_url, headers=headers)
        # Write to a temporary name first: an interrupted download must not
        # leave a truncated .pdf that later runs would skip as "existing".
        partial = target + '.part'
        # The context manager closes the HTTP response even on errors.
        with urllib.request.urlopen(request, timeout=30) as u:
            print ("进入成功,正在下载......")
            with open(partial, 'wb') as f:
                for chunk in iter(lambda: u.read(block_sz), b""):
                    f.write(chunk)
        os.replace(partial, target)
        print('第%d个文件已下载' % n)
        print ("=====================")
# Driver: walk the category menu, then every product page of each category,
# and download the PDFs linked from each product page.
url = "http://www.delixi-electric.com/dcyb/index.htm"
category_urls = collect_category(url)
print ("目录链接收集完毕")
# n counts the product pages visited so far. It must stay a module-level
# name because download_pdf() reads it for its progress message.
n = 0
failed = 0
for category_url in category_urls:
    try:
        items_urls = collect_items(category_url)
    except OSError as exc:
        # requests and urllib network errors both derive from OSError;
        # skip this category instead of aborting the whole batch.
        failed += 1
        print ("目录读取失败,已跳过: %s (%s)" % (category_url, exc))
        continue
    print ("准备开始下载PDF")
    for item_url in items_urls:
        n += 1
        try:
            download_pdf(item_url)
        except OSError as exc:
            # One broken link or write error should not stop the other downloads.
            failed += 1
            print ("下载失败,已跳过: %s (%s)" % (item_url, exc))
print ("全部文件下载完毕")
if failed:
    print ("共有%d个页面处理失败" % failed)
# Author's note: this is the code I wrote; please take a look at it.