import requests
import re
import csv
def _parse_cookies(raw):
    """Convert a ``k1=v1; k2=v2`` cookie string into a ``{name: value}`` dict.

    Blank segments (for example the one produced by a trailing ``;``) and
    segments without an ``=`` are skipped instead of raising ``ValueError``.
    Only the first ``=`` splits a segment, so values may contain ``=``.
    """
    cookies = {}
    for segment in raw.split(';'):
        key, sep, value = segment.strip().partition('=')
        if not sep or not key:
            continue
        cookies[key] = value
    return cookies


def getHTML():
    """Prompt for a keyword and page count, then download the search pages.

    The keyword and the number of pages are read from standard input, and
    the cookies sent with every request are read from a text file on drive
    F:.  One GET request is made per page; the response bodies are
    concatenated in page order.

    Returns:
        str: the text of all fetched pages joined together.

    Raises:
        OSError: if the cookie file cannot be opened.
        ValueError: if the page count entered is not an integer.
        requests.RequestException: if a request fails or times out.
    """
    keyword = input('请输入爬取商品的名字:')
    search_url = 'https://gkxy.gyao511.com/goods/search'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }
    # Raw string: same path as before, without the invalid "\m" escape.
    path = r'F:\mycookie.txt'
    with open(path, 'r') as f:
        cookies = _parse_cookies(f.read())
    pages = input('请输入爬取的商品页数:')
    chunks = []
    for i in range(int(pages)):
        # Passing the query through ``params`` lets requests escape the
        # keyword, so characters such as '&' or '#' cannot break the URL.
        # The "s" offset advances by 20 per page (presumably the site's
        # page size -- confirm against the site).
        query = {'keyword': keyword, 's': i * 20}
        r = requests.get(search_url, params=query, headers=header,
                         cookies=cookies, timeout=60)
        r.encoding = r.apparent_encoding
        chunks.append(r.text)
    return ''.join(chunks)
def findMS(html):
    """Extract product rows from the downloaded search-result text.

    Four fields are pulled out of *html* with regular expressions
    (``goodsName``, ``goodsPrice``, ``brandName``, ``gcName``) and combined
    by position into rows of ``[name, price, brand, category]``.

    Progress and outcome messages are printed to standard output:
    a success banner when rows were found, a "no product info" banner when
    nothing matched, and an error line when the four fields did not occur
    the same number of times (the rows that could still be paired up are
    returned in that case).

    Args:
        html: text to search, typically several pages concatenated.

    Returns:
        list[list[str]]: one ``[name, price, brand, category]`` list per
        product; empty when nothing matched.
    """
    print('=' * 20, '正在爬取商品信息', '=' * 20, '\n')
    names = re.findall('"goodsName":"(.*?)"', html)
    prices = re.findall('"goodsPrice":"(.*?)"', html)
    brands = re.findall('"brandName":"(.*?)"', html)
    categories = re.findall('"gcName":"(.*?)"', html)
    columns = (names, prices, brands, categories)

    # zip() stops at the shortest column, so a missing field can never
    # raise IndexError; rows are paired purely by order of appearance.
    data = [list(row) for row in zip(*columns)]

    if len({len(column) for column in columns}) > 1:
        # Field counts disagree: the pairing is incomplete, report it and
        # hand back only the rows that could be formed.
        print('异常,爬取中断')
        return data
    if not data:
        print('=' * 20, '暂无此商品信息', '=' * 20, '\n')
        return data
    print('=' * 20, '爬取成功', '=' * 20, '\n')
    return data
def download(data, path=r'F:\goods.csv'):
    """Write the product rows to a CSV file with a fixed header row.

    The file is overwritten.  It is written as UTF-8 with a byte-order
    mark so that the Chinese header and values do not depend on the
    machine's locale encoding and spreadsheet programs can detect the
    encoding.  Failures are reported on standard output rather than
    raised, keeping the best-effort behaviour of the script.

    Args:
        data: iterable of rows, each an iterable of four values in the
            order name, price, manufacturer, category.
        path: destination file; defaults to the location on drive F: that
            the script has always used.
    """
    print('=' * 20, '正在保存商品信息', '=' * 20, '\n')
    try:
        # The context manager closes the file on every path; previously a
        # failed open() left ``f`` unbound and the later close() crashed.
        with open(path, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow(['商品名', '商品价格', '生产厂家', '分类'])
            writer.writerows(data)
    except (OSError, csv.Error, UnicodeError) as exc:
        print('保存失败', exc)
    else:
        print('=' * 20, '保存成功', '=' * 20, '\n')
def main():
    """Run the whole pipeline: fetch the pages, extract the rows, save them."""
    # Each stage consumes the previous stage's return value.
    download(findMS(getHTML()))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()