sinat_41720061
sinat_41720061
2018-12-10 13:41

通过构造ajax请求爬取头条街拍图片时,生成的文件夹全是空的, 求大神指导

  • python爬虫
  • ajax
from multiprocessing.pool import Pool
import requests
from urllib.parse import urlencode
# Browser-like User-Agent dict passed to every requests.get call in this script.
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
def get_page(offset):
    """Fetch one page of toutiao '街拍' search results as parsed JSON.

    Parameters:
        offset: pagination offset (the caller passes multiples of 20).

    Returns:
        The decoded JSON dict, or None on a non-200 status, timeout,
        or connection error.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    # format the url, add necessary params
    url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # timeout keeps a stalled request from hanging a pool worker forever;
        # RequestException also covers the Timeout that the timeout can raise
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print('Error: ', e.args)
        return None
    if response.status_code == 200:
        return response.json()  # format the response in json
    return None  # explicit: non-200 responses yield no data
def parse_page(json):
    """Yield image-page URL / title pairs from a toutiao search-API response.

    Parameters:
        json: decoded response dict from get_page(); may be None (fetch
            failed) or lack the 'data' key — both cases yield nothing.

    Yields:
        dict with keys 'image' (absolute gallery-page URL) and 'title'.
    """
    if not json:
        return  # get_page() returns None on failure; don't crash on it
    for item in json.get('data') or []:
        # 'item_source_url' holds the relative path of the gallery page
        image_url = item.get('item_source_url')
        if image_url:
            yield {
                'image': 'https://www.toutiao.com/' + image_url,
                'title': item.get('title'),
            }
import re
def further_get(source_url):
    """Fetch the gallery page and extract the JSON-escaped origin image URLs.

    Parameters:
        source_url: dict produced by parse_page() with an 'image' key.

    Returns:
        List of matched URL strings, still containing the page's JSON
        '\\/' escapes; save_image() strips the backslashes before use.
    """
    response = requests.get(source_url['image'], headers=headers, timeout=10)
    # Raw-string pattern (the old non-raw version relied on invalid escape
    # sequences like '\:'). It matches URLs as they appear JSON-escaped in
    # the page source, e.g. http:\/\/p99.pstatp.com\/origin\/pgc-image\/<hex>.
    pattern = re.compile(r'http:\\/\\/p99\.pstatp\.com\\/origin\\/pgc-image\\/[a-f0-9]+')
    return pattern.findall(response.text)  # return the list of urls
import os
from hashlib import md5
def save_image(item, lst):
    """Download every image URL in lst into a folder named after the title.

    Parameters:
        item: dict from parse_page() with a 'title' key (used as the
            folder name; falls back to 'untitled' when missing).
        lst: list of JSON-escaped image URLs from further_get().
    """
    # Guard against a missing/empty title: os.mkdir(None) would raise.
    folder = item.get('title') or 'untitled'
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race of the
    # original and tolerates the folder already being there.
    os.makedirs(folder, exist_ok=True)
    try:
        for url_deep in lst:
            # strip the JSON '\/' escaping to recover the real URL
            response = requests.get(url_deep.replace('\\', ''), headers=headers, timeout=10)
            if response.status_code == 200:
                # md5 of the content gives a stable, de-duplicating file name
                file_path = "{}/{}.{}".format(folder, md5(response.content).hexdigest(), 'png')
                if not os.path.exists(file_path):
                    with open(file_path, 'wb') as f:
                        f.write(response.content)
                else:
                    print("Already Download", file_path)
    except requests.ConnectionError:
        print('Failed to save image')
def main(offset):
    """Crawl one result page: fetch, parse, resolve deep links, save images.

    Parameters:
        offset: pagination offset handed out by the process pool.
    """
    data = get_page(offset)  # renamed from 'json' to avoid shadowing
    if not data:
        return  # get_page() returns None on errors; skip this page
    for item in parse_page(data):
        save_image(item, further_get(item))
# Inclusive page range to crawl; each page advances the offset by 20 results.
start = 1
end = 20
if __name__ == '__main__':
    # Materialize the offsets up front and fan the pages out over a
    # process pool (one main() call per offset).
    offsets = [page * 20 for page in range(start, end + 1)]
    pool = Pool()
    pool.map(main, offsets)
    pool.close()
    pool.join()

代码原来是崔庆才的书里的,但是他那个代码过时了,头条把图片的链接藏到更深一步的链接里了,所以我加了一个further_get 函数

  • 点赞
  • 回答
  • 收藏
  • 复制链接分享

3条回答

为你推荐

换一换