通过构造ajax请求爬取头条街拍图片时,生成的文件夹全是空的, 求大神指导
from multiprocessing.pool import Pool
import requests
from urllib.parse import urlencode
# Browser-like User-Agent header attached to every request in this script.
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
def get_page(offset):
    """Fetch one page of Toutiao search results as decoded JSON.

    Args:
        offset: pagination offset passed to the search API (multiples of 20).

    Returns:
        The decoded JSON response on HTTP 200, otherwise None
        (also None when the connection fails).
    """
    query = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis'
    }
    # Assemble the search endpoint with the encoded query string.
    url = 'http://www.toutiao.com/search_content/?' + urlencode(query)
    try:
        resp = requests.get(url, headers=headers)
    except requests.ConnectionError as err:
        print('Error: ', err.args)
        return None
    if resp.status_code == 200:
        # Decode the body as JSON; non-200 responses fall through to None.
        return resp.json()
def parse_page(json):
    """Yield one dict per gallery entry found in a search-result page.

    Args:
        json: decoded response from get_page(); may be None when the
            request failed.

    Yields:
        dict with 'image' (absolute URL of the gallery page, built from
        'item_source_url') and 'title'. Entries without a gallery link
        are skipped.
    """
    # get_page() returns None on failure; guard so callers iterating this
    # generator get nothing instead of an AttributeError crash.
    if not json:
        return
    for item in json.get('data') or []:
        title = item.get('title')
        image_url = item.get('item_source_url')  # relative gallery-page URL
        if not image_url:
            continue  # skip entries with no gallery link (e.g. videos)
        yield {
            'image': 'https://www.toutiao.com/' + image_url,
            'title': title
        }
import re
def further_get(source_url):
    """Fetch a gallery page and extract the JSON-escaped image URLs in it.

    The gallery HTML embeds its image links JSON-escaped, i.e. slashes
    appear as '\\/' in the page source; save_image() strips the
    backslashes before downloading.

    Args:
        source_url: dict with an 'image' key (gallery page URL), as
            produced by parse_page().

    Returns:
        List of JSON-escaped image URL strings (possibly empty).
    """
    response = requests.get(source_url['image'], headers=headers)
    # Raw string keeps the escaping readable. p\d+ matches any of the
    # pstatp.com image CDN hosts (p1/p3/p9/p99/...), not only p99 as the
    # original pattern did — that hard-coded host is why most galleries
    # yielded no matches.
    pattern = re.compile(r'http:\\/\\/p\d+\.pstatp\.com\\/origin\\/pgc-image\\/[a-f0-9]+', re.S)
    return re.findall(pattern, response.text)
import os
from hashlib import md5
def save_image(item, lst):
    """Download every image URL in lst into a folder named after the title.

    Args:
        item: dict with a 'title' key (may be None), used as directory name.
        lst: list of JSON-escaped image URLs from further_get().
    """
    # Fall back to a fixed name when the entry has no usable title;
    # os.mkdir(None) would raise a TypeError otherwise.
    folder = item.get('title') or 'untitled'
    # exist_ok avoids the check-then-create race between pool processes
    # saving into the same folder.
    os.makedirs(folder, exist_ok=True)
    try:
        for url_deep in lst:
            # Undo the JSON escaping ('\/' -> '/') before requesting.
            response = requests.get(url_deep.replace('\\', ''), headers=headers)
            if response.status_code != 200:
                continue
            # MD5 of the content gives a stable, duplicate-free file name.
            file_path = "{}/{}.{}".format(folder, md5(response.content).hexdigest(), 'png')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print("Already Download", file_path)
    except requests.ConnectionError:
        print('Failed to save image')
def main(offset):
    """Crawl one result page: fetch, parse, resolve and save all images.

    Args:
        offset: pagination offset handed to get_page().
    """
    page = get_page(offset)
    if page is None:
        # Request failed or returned non-200; without this guard
        # parse_page(None) would crash with an AttributeError.
        return
    for item in parse_page(page):
        image_urls = further_get(item)
        save_image(item, image_urls)
start = 1
end = 20
if __name__ == '__main__':
    # One search page per task: offsets 20, 40, ..., end * 20.
    offsets = [page_index * 20 for page_index in range(start, end + 1)]
    pool = Pool()
    pool.map(main, offsets)
    # Drain the pool gracefully before exiting.
    pool.close()
    pool.join()

代码原来是崔庆才的书里的,但是他那个代码过时了,头条把图片的链接藏到更深一步的链接里了,所以我加了一个further_get 函数

3个回答

在headers里添加cookie试试

sinat_41720061
sinat_41720061 应该不是cookie的问题,我不加cookie的时候也能爬到一部分,只是我爬第二层链接的时候出问题了,见下面评论
大约一年之前 回复

我看了你的代码里爬的链接都是视频链接,所以都没有图片啊

sinat_41720061
sinat_41720061 回答见我下面那个评论
大约一年之前 回复

图片说明
图片说明
我爬的是今日头条街拍下的图集,如图所示,我想爬取item_source_url这个链接后的图片,因为上面那个image_list列表不全,
改了之后还是只能爬到很少一部分图片,不过内容的确是图片

Csdn user default icon
上传中...
上传图片
插入图片
抄袭、复制答案,以达到刷声望分或其他目的的行为,在CSDN问答是严格禁止的,一经发现立刻封号。是时候展现真正的技术了!
立即提问