Matoi_R 2017-04-29 13:36 采纳率: 100%
浏览 965
已采纳

求帮下新手。。有关PYTHON3的基础爬虫类问题

import urllib.request
import os
import re

def into(url):
url="http://www.piaofang168.com/"
response=urllib.request.urlopen(url)
html=response.read().decode('utf-8')

print(html)
return html

def find(url):
findit=into(url).html
findit=re.compile('

(.*?)',re.S)
items=re.findall(find,html)
for item in items:
print (item)
f=open("a.txt","a")
f.write(item)
f.close()

就是它为什么连html都打印不出来,一开始是可以的,就是我用了DEF将它们包装后就运行不了了。。。。

  • 写回答

3条回答 默认 最新

  • N4A 2017-04-29 15:57
    关注

    第一次用,改一下排版
    1

    import urllib.request
    import re
    
    
    def into(url):
        #url = "http://www.piaofang168.com/"
        response = urllib.request.urlopen(url)
        html = response.read().decode('utf-8')
    
        print(html)
        return html
    
    
    def find(url):
        #findit = into(url).html
        html = into(url)
        findit = re.compile('(.*?)', re.S)
        # items = re.findall(find, html)
        items = re.findall(findit, html)
        for item in items:
            print(item)
            f = open("a.txt", "a")
            f.write(item)
            f.close()
    
    url = "http://www.piaofang168.com/"
    if __name__ == '__main__':
        find(url)
    
    

    2

    #!/usr/bin/env python
    #coding:utf-8
    import urllib.request
    from bs4 import BeautifulSoup
    
    
    def parse_list(url):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
        req = urllib.request.Request(url, headers=headers)
        page = urllib.request.urlopen(req, timeout=60)
        contents = page.read()
        soup = BeautifulSoup(contents, "lxml")
        for tag in soup.find_all('div', class_='content-list'):
            try:
                data_url = tag.h3.a.attrs['href']
            except AttributeError:
                print("error at:", tag.get_text())
            else:
                if verbose:
                    print(data_url)
                parse_data(data_base_url+data_url)
    
    
    def parse_data(url):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
        req = urllib.request.Request(url, headers=headers)
        page = urllib.request.urlopen(req, timeout=60)
        try:
            contents = page.read().decode('UTF-8')
        except UnicodeDecodeError:
            print("UnicodeDecodeError: " + url)
        else:
            soup = BeautifulSoup(contents, "lxml")
            try:
                tag = soup.find('div', id='homepost')
                # if verbose:
                #     print(tag)
                title = tag.find('div', class_='toptit').h2.get_text()
                if verbose:
                    print(title)
                trs_left = tag.find('table', class_="infotable").find_all('tr')
                if verbose:
                    print(trs_left)
                read_num = trs_left[1].td.span.get_text()
                download_num = trs_left[2].td.span.get_text()
                download_points = trs_left[3].td.span.get_text()
    
            except AttributeError:
                print("error at:", url)
            else:
                write_data(title, read_num, download_num, download_points, url)
    
    
    def write_data(title, read_num, download_num, download_points, url):
        f.write(title + "," + read_num + "," + download_num + "," + download_points + "," + url + "\n")
    
    
    base_url = 'http://www.codeforge.cn/l/0/c/0/t/0/v/0/p/'
    data_base_url = 'http://www.codeforge.cn'
    f = open('data.csv', 'w')
    verbose = False
    if __name__ == '__main__':
        f.write("title, read_num, download_num, download_points, url \n")
        for i in range(1000):
            parse_list(base_url + str(i))
            f.flush()
            print("has finish %s" % str((i+1)*10))
    
    
    
    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论
查看更多回答(2条)

报告相同问题?

悬赏问题

  • ¥20 有关区间dp的问题求解
  • ¥15 多电路系统共用电源的串扰问题
  • ¥15 slam rangenet++配置
  • ¥15 有没有研究水声通信方面的帮我改俩matlab代码
  • ¥15 对于相关问题的求解与代码
  • ¥15 ubuntu子系统密码忘记
  • ¥15 信号傅里叶变换在matlab上遇到的小问题请求帮助
  • ¥15 保护模式-系统加载-段寄存器
  • ¥15 电脑桌面设定一个区域禁止鼠标操作
  • ¥15 求NPF226060磁芯的详细资料