qq_43178462 2023-01-13 20:49 采纳率: 30%
浏览 29
已结题

python获取页面内指定内容并下载

根据指定网页页面爬取内容
遇到的现象和发生背景,请写出第一个错误信息
用代码块功能插入代码,请勿粘贴截图。 不用代码块回答率下降 50%
运行结果及详细报错内容
我的解答思路和尝试过的方法,不写自己思路的,回答率下降 60%
我想要达到的结果,如果你需要快速回答,请尝试 “付费悬赏”
  • 写回答

2条回答 默认 最新

  • 素影·流年 2023-01-13 20:53
    关注

    参考一下,望采纳

    import requests
    import re
    import os
    import wget
    import threading
    import time
    import random
    
    # Serialises appends to the shared ``download`` list made by the
    # checkStatus worker threads.
    Lock = threading.Lock()

    # Form fields POSTed to http://mmt.favor2.info/satellites by
    # LoadDownSatelites; SetDict fills in ``catalogue_id`` before each search.
    post_dict = {
        'catalogue_id': '',
        'name': '',
        'comments': '',
        'var_nonvariable': 'on',
        'var_variable': 'on',
        'var_periodic': 'on',
        'var_pmin': '',
        'var_pmax': '',
        'action': 'search',
        # type_0 .. type_7 are all switched on.
        **{'type_%d' % index: 'on' for index in range(8)},
        'orb_incl_min': '',
        'orb_incl_max': '',
        'orb_period_min': '',
        'orb_period_max': '',
    }

    # Browser-like request headers.
    # NOTE(review): no function visible in this file passes ``header`` to
    # requests; the GET helpers build their own ``headers`` dict instead.
    header = {
        'Accept': (
            'text/html,application/xhtml+xml,application/xml;q=0.9,'
            'image/webp,image/apng,*/*;q=0.8,'
            'application/signed-exchange;v=b3;q=0.9'
        ),
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': 'csrftoken=9jhoS62afWsXqv1DisuneTFkmWySczcN',
        'Host': 'mmt.favor2.info',
        'Referer': 'http://mmt.favor2.info/satellites',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76'
        ),
    }
    def SetDict(id):
        """Store *id*, converted with ``str()``, as the ``catalogue_id`` form field.

        Mutates the module-level ``post_dict`` in place and returns ``None``.
        """
        # The dict is mutated rather than rebound, so no ``global`` is required.
        post_dict.update(catalogue_id=str(id))
    
    def Download(url,savedir):
        """Download *url* with ``wget`` and report progress on stdout.

        Args:
            url: Address to fetch. Surrounding whitespace (such as a trailing
                newline) is removed before the request is made, so the URL
                that is printed is exactly the URL that is fetched.
            savedir: Passed to ``wget.download`` as its ``out`` argument.
        """
        # Previously only the printed copy was stripped, so a URL carrying a
        # trailing newline was sent to wget unchanged.
        url = url.strip()
        # The newline is part of the message and ``end`` is empty so each
        # message goes out as one string -- presumably to keep output from
        # concurrent threads from interleaving mid-line.
        print('%s 正在下载 将保存至 %s\n' % (url,savedir),end = '')
        wget.download(url,out = savedir)
        print('%s 已下载完毕 已保存至 %s\n' % (url,savedir),end = '')
    
    def checkStatus(track_id):
        """Fetch a track page and record *track_id* if the page marks it periodic.

        The page at ``http://mmt.favor2.info/satellites/track/<track_id>`` is
        downloaded and searched for a fixed HTML fragment containing
        ``Periodic`` followed by ``Lightcurve period``. When the fragment is
        present, *track_id* is appended to the module-level ``download`` list
        while holding ``Lock``; otherwise nothing is recorded.

        Args:
            track_id: Track identifier; formatted into the URL via ``str()``.
        """
        print('正在检查: %s\n' % (track_id),end = '')
        headers= {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; Tablet PC 2.0; wbx 1.0.0; wbxapp 1.0.0; Zoom 3.6.0)"
        }
        response = requests.get('http://mmt.favor2.info/satellites/track/%s' % (str(track_id)),headers = headers)
        # Decode the body once with the detected encoding. The previous
        # encode()/decode() round trip raised TypeError when either encoding
        # was None and could raise UnicodeError on a wrong guess.
        response.encoding = response.apparent_encoding
        text = response.text
        tag = "<span class=\"text-default\">Periodic</span></td></tr><tr><td>Lightcurve period"
        if tag in text:
            print('%s 已通过检查\n' % (track_id),end = '')
            # ``with`` releases the lock even if the append raises.
            with Lock:
                download.append(track_id)
        else:
            print('%s 未通过检查\n' % (track_id),end = '')
    
    def _run_throttled(target,arg_tuples):
        """Start one thread per argument tuple, then wait for all of them.

        Sleeps a random 0.1-1.0 s before each start so requests are spread out.
        """
        workers = []
        for args in arg_tuples:
            time.sleep(random.randint(1,10) / 10)
            worker = threading.Thread(target = target,args = args)
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()

    def LoadPage(url,savedir):
        """Download every periodic track linked from one listing page.

        Steps:
          1. Fetch *url* and collect the ids found in its track download links.
          2. Run ``checkStatus`` on each id in its own thread; ids that pass
             end up in the module-level ``download`` list.
          3. Run ``Download`` on each id that passed, saving it as
             ``track_<id>.txt`` inside *savedir*.

        Args:
            url: Listing page to scan.
            savedir: Directory that receives the downloaded track files.
        """
        # ``download`` is rebound below, so it has to be declared global.
        global download
        headers= {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; Tablet PC 2.0; wbx 1.0.0; wbxapp 1.0.0; Zoom 3.6.0)"
        }
        response = requests.get(url,headers = headers)
        # Decode once with the detected encoding instead of an
        # encode()/decode() round trip that fails when an encoding is None.
        response.encoding = response.apparent_encoding
        text = response.text
        # The id capture stops at the next '/' or '"', so two links on the
        # same line can no longer be merged into one bogus id by a greedy
        # match. "Downoad" is spelled as the pattern has always matched it.
        result = re.findall(r'<a href="/satellites/track/([^/"]+)/download" title="Downoad track">T</a>',text)
        print('%s 中的下载地址已获取未筛选id(共%d项)' % (url,len(result)))

        # Reset the shared result list before the checker threads fill it.
        download = []
        _run_throttled(checkStatus,[(sid,) for sid in result])
        print('%s 中的下载地址已获取以筛选的id(共%d项)' % (url,len(download)))

        # The URL is built without the trailing newline it used to carry.
        _run_throttled(Download,[
            (
                'http://mmt.favor2.info/satellites/track/%s/download' % (sid),
                os.path.join(savedir,'track_%s.txt' % (sid)),
            )
            for sid in download
        ])
    
    def LoadDownSatelites(id,savedir):
        """Search for catalogue id *id* and download its tracks page by page.

        The search form is POSTed with ``catalogue_id`` set to *id*. The
        satellite's page id is read from the "Log in" link of the response,
        the number of pages is derived from the ``?page=`` links, and
        ``LoadPage`` is called for every page with ``<savedir>/<id>`` as the
        target directory (created if missing).

        Args:
            id: Catalogue id to search for; converted with ``str()``.
            savedir: Base directory; files go into ``<savedir>/<id>``.

        Raises:
            IndexError: If the response contains no recognisable page id.
        """
        SetDict(id)
        response = requests.post("http://mmt.favor2.info/satellites",post_dict)
        # Decode once with the detected encoding instead of an
        # encode()/decode() round trip that fails when an encoding is None.
        response.encoding = response.apparent_encoding
        text = response.text

        # Raw strings: '\?' in a normal literal is an invalid escape sequence.
        matches = re.findall(r'<a href="/accounts/login/\?next=/satellites/([^"]+)">Log in</a></li>',text)
        if not matches:
            # Same exception type as the former bare ``[0]``, with a message.
            raise IndexError('no satellite page id found in the search response for catalogue id %s' % (id,))
        sid = matches[0]
        # sid is escaped so characters in it are not read as regex syntax.
        # NOTE(review): the page count is "number of ?page= links + 1", as
        # before -- confirm this matches the site's pagination markup.
        page = len(re.findall(r'/satellites/%s\?page=.' % (re.escape(sid)),text)) + 1

        # %s instead of %d so a string id is accepted here as it is by SetDict.
        print('查找到%s对应的编号%s,共%d页准备下载' % (id,sid,page))

        target_dir = os.path.join(savedir,str(id))
        if not os.path.exists(target_dir):
            print('%s不存在,程序已自动创建' % (target_dir))
            # exist_ok covers the directory appearing between check and create.
            os.makedirs(target_dir,exist_ok = True)

        for pg in range(1,page + 1):
            LoadPage('http://mmt.favor2.info/satellites/%s?page=%d' % (sid,pg),target_dir)
    
    # Script entry point: search for catalogue id 163 and save its tracks under ./Data/163.
    # NOTE(review): this runs at import time; consider an ``if __name__ == '__main__':`` guard.
    LoadDownSatelites(163,'./Data')
    
    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论
查看更多回答(1条)

报告相同问题?

问题事件

  • 系统已结题 1月21日
  • 已采纳回答 1月13日
  • 创建了问题 1月13日