Bernie121 2019-10-16 15:25 采纳率: 0%
浏览 236

怎样获取http://www.delixi-electric.com/cpzx/index.htm这个网站所有的PDF电器说明书

import os
import re
import shutil
import urllib.request

import requests
from bs4 import BeautifulSoup

def collect_category(url):
    """Fetch a catalogue index page and return absolute category URLs.

    Parameters
    ----------
    url : str
        Address of the catalogue index page to scrape.

    Returns
    -------
    list[str]
        Absolute links built from every ``<a>`` tag inside the
        left-hand product menu (``div.pro_menu > dl > dd > a``).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    # hrefs on this site appear to be relative to the site root, so prefix
    # the domain to make them absolute — TODO confirm against live pages.
    category_urls = [
        "http://www.delixi-electric.com/" + a.get("href")
        for a in soup.select("div.pro_menu > dl > dd > a")
    ]
    print(category_urls)
    return category_urls

def collect_items(url):
    """Fetch one category page and return absolute product-detail URLs.

    Parameters
    ----------
    url : str
        Address of a single product-category page.

    Returns
    -------
    list[str]
        Absolute links built from every ``<a>`` tag inside the product
        listing (``#pro_list > li > a``).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    # Same root-relative href convention as the category menu — prefix the
    # domain to form absolute URLs.
    return [
        "http://www.delixi-electric.com/" + a.get("href")
        for a in soup.select("#pro_list > li > a")
    ]

def download_pdf(url):
    """Download every PDF manual linked from one product-detail page.

    Each PDF link found under the page's download list is saved to
    ``D:/delixi/<link text>.pdf``; files that already exist are skipped.

    Parameters
    ----------
    url : str
        Address of a product-detail page containing PDF download links.

    Side effects
    ------------
    Creates ``D:/delixi`` if missing, writes ``.pdf`` files into it, and
    prints progress messages. Reads the module-level counter ``n``
    (set by the driver script below) for the progress message.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    # Brittle positional selector copied from the browser dev tools —
    # presumably matches the "download" list on detail pages; verify if
    # the site layout changes.
    urls = soup.select("body > div:nth-child(10) > div > div:nth-child(3) > ul > li > a")

    # Original code assumed D:/delixi already existed and crashed otherwise.
    os.makedirs('D:/delixi', exist_ok=True)

    for i in urls:
        # '/' is illegal in Windows filenames, so strip it from the title.
        name = re.sub("/", "", i.get_text())
        print(name)

        path = 'D:/delixi/%s.pdf' % name
        if os.path.exists(path):
            print("文件已存在")
            continue

        pdf_url = "http://www.delixi-electric.com/" + i.get("href")
        print(pdf_url)
        # The original called urlopen() without ever closing the response;
        # the with-statement closes it even if the copy fails mid-way.
        with urllib.request.urlopen(pdf_url) as u:
            print("进入成功,正在下载......")
            with open(path, 'wb') as f:
                # Stream in 8 KiB chunks instead of the hand-rolled
                # read/write loop.
                shutil.copyfileobj(u, f, 8192)
        # NOTE(review): 'n' is a module-level counter maintained by the
        # driver loop; it counts detail pages, not individual PDFs.
        print('第%d个文件已下载' % n)
        print("=====================")


url = "http://www.delixi-electric.com/dcyb/index.htm"
category_urls = collect_category(url)
print ("目录链接收集完毕")
n = 0
for i in category_urls:
    items_urls = collect_items(i)
    print ("准备开始下载PDF")
    for a in items_urls:
        n+=1
        download_pdf(a)
print ("全部文件下载完毕")

这是我写的代码,请帮我看一下

  • 写回答

2条回答 默认 最新

  • 关注
    评论

报告相同问题?

悬赏问题

  • ¥20 关于#硬件工程#的问题,请各位专家解答!
  • ¥15 关于#matlab#的问题:期望的系统闭环传递函数为 $G(s)=\dfrac{\omega_n^2}{s^2+2\zeta\omega_n s+\omega_n^2}$,阻尼系数 $\zeta=0.707$,使系统具有较小的超调量
  • ¥15 FLUENT如何实现在堆积颗粒的上表面加载高斯热源
  • ¥30 截图中的mathematics程序转换成matlab
  • ¥15 动力学代码报错,维度不匹配
  • ¥15 Power query添加列问题
  • ¥50 Kubernetes&Fission&Eleasticsearch
  • ¥15 報錯:Person is not mapped,如何解決?
  • ¥15 c++头文件不能识别CDialog
  • ¥15 Excel发现不可读取的内容