from multiprocessing.pool import Pool
import requests
from urllib.parse import urlencode
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
def get_page(offset):
    """Fetch one page of Toutiao search results as parsed JSON.

    Args:
        offset: paging offset passed to the search API (20 results per page).

    Returns:
        The decoded JSON object on success, or None on a non-200 status,
        a network/timeout failure, or a body that is not valid JSON.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis'
    }
    # Format the URL: append the necessary params as a query string.
    url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Timeout keeps a stalled connection from hanging the worker forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()  # decode the JSON payload
    except (requests.RequestException, ValueError) as e:
        # RequestException covers ConnectionError, timeouts and every other
        # request-level failure; ValueError covers a non-JSON response body.
        print('Error: ', e.args)
    return None
def parse_page(json_data):
    """Yield one record per search result that carries a gallery URL.

    Args:
        json_data: parsed JSON from the search API, or None when the
            request failed (get_page returns None on error).

    Yields:
        dict with 'image' (absolute gallery-page URL) and 'title' keys.
        Results lacking an 'item_source_url' are skipped.
    """
    # Guard: upstream returns None on failure; yield nothing instead of
    # raising AttributeError on None.get(...).
    if not json_data:
        return
    for item in json_data.get('data') or []:
        title = item.get('title')
        # 'item_source_url' is the real (relative) source URL of the pictures.
        image_url = item.get('item_source_url')
        if image_url:
            yield {
                'image': 'https://www.toutiao.com/' + image_url,
                'title': title
            }
import re
def further_get(source_url):
    """Fetch the gallery page and extract the hidden origin image URLs.

    Toutiao embeds the real image links as JSON-escaped URLs
    (http:\/\/p99.pstatp.com\/origin\/pgc-image\/<hex>) inside the page
    source, so the pattern matches backslash-escaped slashes.

    Args:
        source_url: dict with an 'image' key holding the gallery-page URL.

    Returns:
        List of JSON-escaped image URL strings; empty on request failure.
    """
    # Raw string replaces the original escape soup; the compiled regex is
    # identical: regex '\\' matches one literal backslash before each '/'.
    pattern = re.compile(
        r'http:\\/\\/p99\.pstatp\.com\\/origin\\/pgc-image\\/[a-f0-9]+')
    try:
        response = requests.get(source_url['image'], headers=headers, timeout=10)
    except requests.RequestException:
        # One dead gallery page should not crash the whole worker process.
        return []
    return pattern.findall(response.text)  # list of escaped image URLs
import os
from hashlib import md5
def save_image(item, lst):
    """Download each image URL in lst into a folder named after the title.

    Files are named by the MD5 of their content, so an already-downloaded
    image is detected and skipped.

    Args:
        item: dict with a 'title' key, used as the target directory name.
        lst: list of JSON-escaped image URLs (backslashes are stripped
            before the request).
    """
    # Fall back to a fixed folder when the post has no usable title; the
    # original os.mkdir(None) would raise TypeError.
    directory = item.get('title') or 'untitled'
    # makedirs(exist_ok=True) avoids the exists()-then-mkdir race.
    os.makedirs(directory, exist_ok=True)
    try:
        for escaped_url in lst:
            # Undo the JSON escaping: http:\/\/... -> http://...
            response = requests.get(escaped_url.replace('\\', ''), headers=headers)
            if response.status_code == 200:
                # Content hash as the file name deduplicates identical images.
                file_path = "{}/{}.{}".format(
                    directory, md5(response.content).hexdigest(), 'png')
                if not os.path.exists(file_path):
                    with open(file_path, 'wb') as f:
                        f.write(response.content)
                else:
                    print("Already Download", file_path)
    except requests.ConnectionError:
        print('Failed to save image')
def main(offset):
    """Crawl one result page: fetch, parse, then save every image found.

    Args:
        offset: paging offset passed to the search API (multiple of 20).
    """
    result = get_page(offset)
    # get_page returns None on failure; skip the page instead of letting
    # parse_page blow up on None in the worker process.
    if result is None:
        return
    for item in parse_page(result):
        lst = further_get(item)
        save_image(item, lst)
# Paging range: offsets 20, 40, ..., 400 (20 results per page).
# NOTE(review): offset 0 (the first page) is never fetched because the
# range starts at 1 — confirm that is intended.
start = 1
end = 20

if __name__ == '__main__':
    worker_pool = Pool()
    offsets = (page * 20 for page in range(start, end + 1))
    # map blocks until every page has been processed by the workers.
    worker_pool.map(main, offsets)
    worker_pool.close()
    worker_pool.join()
# Note: this code originally comes from Cui Qingcai's book, but that version
# is outdated — Toutiao now hides the image links one level deeper, so I
# added the further_get function to follow the extra indirection.