Bernie121 2019-10-16 15:25 采纳率: 0%
浏览 236

怎样获取http://www.delixi-electric.com/cpzx/index.htm这个网站所有的PDF电器说明书

import os
import re
import urllib.request
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def collect_category(url):
    """Fetch the catalogue index page and return absolute URLs of all product categories.

    Args:
        url: catalogue index page, e.g. http://www.delixi-electric.com/cpzx/index.htm

    Returns:
        List of absolute category-page URLs found in the "pro_menu" navigation block.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
    r = requests.get(url, headers=headers)
    # Fail loudly instead of silently parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    # urljoin against the site root handles both relative hrefs (same result as the
    # old string concatenation) and root-absolute hrefs like "/abc/index.htm",
    # which the old "+" join turned into a broken "…com//abc/index.htm".
    category_urls = [
        urljoin("http://www.delixi-electric.com/", a.get("href"))
        for a in soup.select("div.pro_menu > dl > dd > a")
    ]
    print(category_urls)
    return category_urls

def collect_items(url):
    """Fetch one category page and return absolute URLs of all product detail pages.

    Args:
        url: a category page URL (as returned by collect_category).

    Returns:
        List of absolute product-page URLs taken from the "#pro_list" listing.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
    r = requests.get(url, headers=headers)
    # Fail loudly instead of silently parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    # urljoin avoids the double-slash URLs the old "+" concatenation produced
    # for root-absolute hrefs, while giving identical results for relative ones.
    return [
        urljoin("http://www.delixi-electric.com/", a.get("href"))
        for a in soup.select("#pro_list > li > a")
    ]

def download_pdf(url):
    """Download every PDF manual linked from one product detail page into D:/delixi.

    Already-downloaded files (matched by name) are skipped. Progress messages use
    the module-level counter ``n`` maintained by the driver loop.

    Args:
        url: a product detail page URL (as returned by collect_items).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    # NOTE(review): positional nth-child selector is brittle — any markup change
    # on the site silently yields an empty list. Verify against the live page.
    urls = soup.select("body > div:nth-child(10) > div > div:nth-child(3) > ul > li > a")

    # BUG FIX: open() below failed with FileNotFoundError when D:/delixi did not exist.
    os.makedirs('D:/delixi', exist_ok=True)

    for i in urls:

        name = i.get_text()
        # BUG FIX: strip every character Windows forbids in file names,
        # not just "/" — names containing e.g. "?" or ":" crashed the save.
        name = re.sub(r'[\\/:*?"<>|]', "", name)
        print(name)

        path = 'D:/delixi/%s.pdf' % name
        if os.path.exists(path):
            print("文件已存在")
            continue

        # urljoin avoids double slashes for root-absolute hrefs.
        pdf_url = urljoin("http://www.delixi-electric.com/", i.get("href"))
        print(pdf_url)
        # BUG FIX: the old urllib.request.urlopen handle was never closed.
        # Streaming requests inside `with` closes both connection and file
        # deterministically, and downloads in 8 KiB chunks as before.
        with requests.get(pdf_url, headers=headers, stream=True) as resp, \
                open(path, 'wb') as f:
            print("进入成功,正在下载......")
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
        # ``n`` is the global counter incremented by the driver loop before each call.
        print('第%d个文件已下载' % n)
        print("=====================")


# Driver: crawl the Delixi instrument catalogue — categories -> product pages -> PDFs.
url = "http://www.delixi-electric.com/dcyb/index.htm"
category_urls = collect_category(url)
print("目录链接收集完毕")

# Global running count of downloads; download_pdf reads it for its progress message.
n = 0
for category_url in category_urls:
    items_urls = collect_items(category_url)
    print("准备开始下载PDF")
    for item_url in items_urls:
        n += 1
        download_pdf(item_url)
print("全部文件下载完毕")

这是我写的代码,请帮我看一下

  • 写回答

2条回答 默认 最新

  • 关注
    评论

报告相同问题?

悬赏问题

  • ¥15 运筹学排序问题的应用
  • ¥15 ubuntu子系统密码忘记
  • ¥15 信号傅里叶变换在matlab上遇到的小问题请求帮助
  • ¥15 保护模式-系统加载-段寄存器
  • ¥15 电脑桌面设定一个区域禁止鼠标操作
  • ¥15 求NPF226060磁芯的详细资料
  • ¥15 使用R语言marginaleffects包进行边际效应图绘制
  • ¥20 usb设备兼容性问题
  • ¥15 错误(10048): “调用exui内部功能”库命令的参数“参数4”不能接受空数据。怎么解决啊
  • ¥15 安装svn网络有问题怎么办