Traceback (most recent call last):
  File "C:\Users\asus\PycharmProjects\pythonProject2\完整代码.py", line 132, in <module>
    newsary=parseListLinks(newsurl)
  File "C:\Users\asus\PycharmProjects\pythonProject2\完整代码.py", line 62, in parseListLinks
    jd = json.loads('{' + res.text.lstrip('try{feedCardJsonpCallback(').rstrip(');}catch(e){};') + '}}',encoding='utf-8')
  File "C:\Users\asus\AppData\Local\Programs\Python\Python39\lib\json\__init__.py", line 359, in loads
    return cls(**kw).decode(s)
TypeError: __init__() got an unexpected keyword argument 'encoding'

The above is the error output. It comes from Python code I wrote to scrape Sina's China news pages.
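What the traceback points at: json.loads() lost its encoding parameter in Python 3.9 (it had been accepted but ignored since 3.1), so under a Python 3.9 interpreter the keyword simply has to be removed. There is a second, quieter problem on the same line: str.lstrip() and str.rstrip() strip character sets, not literal prefixes and suffixes, so they can also eat braces and letters that belong to the JSON payload. A minimal sketch of a safer parse, assuming the JSONP wrapper is exactly try{feedCardJsonpCallback(...);}catch(e){}; as the original lstrip/rstrip arguments suggest (parse_jsonp is just an illustrative name):

import json
import re

def parse_jsonp(text):
    # Capture everything between the callback's opening '(' and the ');}catch'
    # tail, then decode it as plain JSON. No encoding= keyword: it was removed
    # from json.loads in Python 3.9.
    m = re.search(r'feedCardJsonpCallback\((.*)\);}catch', text, re.S)
    return json.loads(m.group(1))

In the full listing below, this corresponds to the json.loads call inside parseListLinks (around line 62 of 完整代码.py).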
The full code is as follows:
# -*- coding: utf-8 -*-
import json
import requests
import re
import pymysql
import jieba
import jieba.analyse
import numpy as np
import wordcloud
import PIL.Image as image
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from collections import Counter

res = requests.get('http://news.sina.com.cn/china')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
url = "https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page={}&encode=utf-8&callback=feedCardJsonpCallback"

def getcomments(newsurl):
    # Pull the comment count for one article from Sina's comment API.
    try:
        m = re.search('doc-i(.*).shtml', newsurl)
        newsid = m.group(1)
        comment_url = "https://comment.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page_size=3&t_size=3&h_size=3"
        comments = requests.get(comment_url.format(newsid))
        jd = json.loads(comments.text)
        return jd['result']['count']['total']
    except KeyError:
        return 0

def getNewsDetail(newsurl):
    # Fetch one article page and extract title, date, body, editor and source.
    headers = {
        'Referer': 'https://news.sina.com.cn/china/',
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    }
    result = {}
    res = requests.get(newsurl, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('.main-title')[0].text
    timesource = soup.select('.date')[0].contents[0].strip()
    result['dt'] = timesource
    result['article'] = ' '.join([p.text.strip() for p in soup.select('.article p')[:-1]])
    result['editor'] = soup.select('.show_author')[0].text.lstrip('责任编辑:')
    try:
        result['source'] = soup.select('.date-source a')[0].text
    except IndexError:
        result['source'] = soup.select('.source')[0].text
    result['comments'] = getcomments(newsurl)
    return result

def parseListLinks(url):
    # Fetch one page of the rolling-news feed and collect details for each article.
    headers = {
        'Referer': 'https://news.sina.com.cn/china/',
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    }
    newsdetails = []
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    # The feed comes back as JSONP: try{feedCardJsonpCallback({...});}catch(e){};
    # Extract the payload with a regex; json.loads takes no encoding= keyword
    # in Python 3.9, and lstrip/rstrip strip character sets, not prefixes.
    m = re.search(r'feedCardJsonpCallback\((.*)\);}catch', res.text, re.S)
    jd = json.loads(m.group(1))
    for ent in jd['result']['data']:
        newsdetails.append(getNewsDetail(ent['url']))
    return newsdetails

class yfsql():
    # Thin pymysql wrapper: one shared connection plus create/query/insert helpers.
    db = None
    config = {
        'host': "localhost",
        'port': 3306,
        'username': "root",
        'password': '228742',
        'database': 'data',
        'charset': "utf8"   # pymysql expects the MySQL name 'utf8'/'utf8mb4', not 'utf-8'
    }

    def connect(self):
        if self.db is None:
            self.db = pymysql.connect(
                host=self.config['host'],
                port=self.config['port'],
                user=self.config['username'],
                passwd=self.config['password'],
                db=self.config['database'],
                charset=self.config['charset']
            )
        return self.db

    def __init__(self):
        self.connect()

    def delete(self):
        if self.db is not None:
            self.db.close()

    def create(self, table1):
        cursor = self.connect().cursor()
        cursor.execute("DROP TABLE IF EXISTS " + table1)
        sql = ('create table ' + table1 + '(id int(11) not null auto_increment primary key,'
               'title varchar(50) not null,dt varchar(20) not null,editor varchar(50) not null,'
               'article text not null,commentscount int(5),source varchar(50))')
        try:
            cursor.execute(sql)
            self.connect().commit()
            print('Table created')
            return True
        except:
            self.connect().rollback()
            print('Failed to create table')
            return False

    def query(self, sql1):
        cursor = self.connect().cursor()
        try:
            cursor.execute(sql1)
            data = cursor.fetchall()
            self.connect().commit()
        except:
            self.connect().rollback()
            return False
        return data

    def insert(self, value, table1):
        cursor = self.connect().cursor()
        sql2 = ("INSERT INTO " + table1 + "(title,dt,editor,article,commentscount,source)"
                " VALUES(%s,%s,%s,%s,%s,%s)")
        try:
            cursor.execute(sql2, value)
            self.connect().commit()
            print('Row inserted')
            return True
        except:
            self.connect().rollback()
            print('Insert failed')
            return False
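As a sanity check, the DB helper can be exercised on its own before running the full crawl; a hypothetical smoke test (the table name and row values here are invented), assuming the MySQL credentials in config are correct:

db = yfsql()
db.create('ceshi_test')   # hypothetical scratch table
row = ('测试标题', '2020年04月01日 10:00', '张三', '正文内容', 0, '新华社')
db.insert(row, 'ceshi_test')
print(db.query('SELECT COUNT(*) FROM ceshi_test'))
db.delete()   # close the connection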
news_total = []
for i in range(1, 140):
    newsurl = url.format(i)
    newsary = parseListLinks(newsurl)
    news_total.extend(newsary)

mysql1 = yfsql()
mysql1.create('ceshi')
for new_total in news_total:
    title = new_total['title']
    dt = new_total['dt']
    editor = new_total['editor']
    article = new_total['article']
    commentscount = new_total['comments']
    source = new_total['source']
    value = (title, dt, editor, article, commentscount, source)
    mysql1.insert(value, 'ceshi')   # insert() also needs the table name

results = []
for x in mysql1.query('SELECT article FROM ceshi'):
    results.append(x)
file = open('ceshi.txt', 'a', encoding='utf-8')
file.write('\n'.join(['%s' % i for i in results]))
file.close()

def generate_image():
    # Build a word cloud from the 150 highest TF-IDF keywords in the article dump.
    jieba.analyse.set_stop_words('stopword.txt')
    content = open('ceshi.txt', encoding='utf-8').read()
    tags = jieba.analyse.extract_tags(content, topK=150, withWeight=False, allowPOS=())
    data = " ".join(tags)
    mask = np.array(image.open(r"C:\Users\asus\词云.jpg"))
    w = wordcloud.WordCloud(font_path='msyh.ttc', background_color='white', mask=mask)
    w.generate(data)
    w.to_file('ciyun.jpg')
    print('Word cloud saved')

generate_image()

def tfidf_list():
    # Print the 20 top keywords with their TF-IDF weights.
    jieba.analyse.set_stop_words('stopword.txt')
    content = open('ceshi.txt', encoding='utf-8').read()
    content = re.sub("[A-Za-z0-9\: \·\——\,\。\“\”\\(\)\,\‘\.\%]", "", content)
    tags = jieba.analyse.extract_tags(content, topK=20, withWeight=True, allowPOS=())
    for x, w in tags:
        print(x + '\t' + str(w))

tfidf_list()

def generate_bar():
    # Horizontal bar chart of the 30 top keywords by TF-IDF weight.
    jieba.analyse.set_stop_words('stopword.txt')
    content = open('ceshi.txt', encoding='utf-8').read()
    content = re.sub('[A-Za-z0-9\: \·\——\,\。\”\“\\(\)\,\‘\、\?\;\.\%]', "", content)
    content = str(content)
    tags = jieba.analyse.extract_tags(content, topK=30, withWeight=True, allowPOS=())
    y = []   # keyword list
    x = []   # weight list
    for (k, v) in tags:
        y.append(k)
        x.append(v)
    x1 = np.array(x)
    fig, ax = plt.subplots(figsize=(30, 12))
    plt.rcParams['font.size'] = 20
    plt.rcParams['font.sans-serif'] = ['SimHei']
    y_pos = np.arange(len(y))
    rects = ax.barh(y=y_pos, align='center', width=x1)   # Axes.barh draws horizontal bars
    ax.set_yticks(y_pos)          # tick positions
    ax.set_yticklabels(y)         # keyword labels on the y axis
    ax.set_xlabel('Importance')   # x-axis caption
    ax.set_title('TF-IDF')        # chart title
    for rect, y, num in zip(rects, y_pos, x1):
        x = rect.get_width()
        plt.text(x, y, "%f" % num)
    plt.savefig("barchart.png")
    bar = plt.show()
    return bar

generate_bar()

def generate_graph():
    # Bar chart: top keyword of each of the 15 most-commented articles vs. its comment count.
    a_title = []
    a_count = []
    a_source = []
    for z, x, y in mysql1.query('SELECT article,title,commentscount FROM ceshi order by commentscount desc limit 15'):
        a_source.append(z)
        a_title.append(x)
        a_count.append(y)
    #print(a_title)
    #print(a_count)
    #print(a_source)
    keylist = []
    for i in a_source:
        i = re.sub("[:\·\——\,\。\“\”\\(\)\,\'\、\?\;]", "", i)
        i = str(i)
        jieba.analyse.set_stop_words("stopword.txt")
        tags = jieba.analyse.extract_tags(i, topK=1, withWeight=False, allowPOS=())
        for i in tags:
            keylist.append(i)
    fig, ax = plt.subplots(figsize=(30, 12))
    plt.rcParams['font.size'] = 20
    plt.rcParams['font.sans-serif'] = ['SimHei']
    colors = ['lightcoral']
    plt.title('评论数-关键词对应图')   # comment count per article keyword
    plt.bar(keylist, a_count, color=colors)
    for x, y in zip(range(len(a_count)), a_count):
        plt.text(x + 0.1, y + 1, '%i' % y, ha='center', va='bottom')
    plt.savefig("graph.png")
    graph = plt.show()
    return graph

generate_graph()
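The last chart, generate_pie below, leans on collections.Counter to turn the raw source strings into pie-slice fractions. A tiny standalone illustration of that step (the sample sources are invented):

from collections import Counter

sources = ['新华社', '央视新闻', '新华社', '人民日报', '新华社']
counter = Counter(sources)
total = len(sources)
for source, hits in counter.most_common(3):
    print(source, hits / total)   # 新华社 0.6, then the two single-hit sources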
def generate_pie():
    # Pie chart of the 15 most frequent news sources, with a hollow centre.
    source_results = []
    for x in mysql1.query('SELECT source FROM ceshi'):
        x = str(x)
        x = re.sub("[A-Za-z0-9\:\·\——\,\。\“\”\\(\)\,\']", "", x)
        source_results.append(x)
    unique_sources = set(str(i) for i in source_results)
    num = len(unique_sources)   # number of distinct sources (unused below)
    counter = Counter()
    count = 0
    for word in source_results:
        count = count + 1
        counter[word] += 1
    pro_list = []   # share of each news source, stored as a fraction
    k_list = []     # news source labels
    for (k, v) in counter.most_common(15):
        k_list.append(k)
        pro = v / count   # this source's share of all articles
        pro_list.append(pro)
    plt.rcParams['font.size'] = 20                 # font size
    plt.rcParams['font.sans-serif'] = ['SimHei']   # a font that can render Chinese
    fig, ax = plt.subplots(figsize=(30, 12))       # canvas size
    colors = ['lightcoral', 'orange', 'plum', 'c', 'pink']
    plt.pie(x=pro_list, radius=1.0, pctdistance=0.8, labels=k_list, colors=colors, startangle=90, autopct='%1.1f%%')
    x_0 = [1, 0, 0, 0]                     # a single full wedge...
    plt.pie(x_0, radius=0.6, colors='w')   # ...drawn in white to hollow out the centre
    plt.title('新闻来源占比图')              # share of each news source
    plt.savefig("piechart.png")
    show = plt.show()
    return show

generate_pie()
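One more note on the crawl itself: the main loop fetches 139 feed pages, and a single malformed response aborts the entire run, which is exactly how the traceback at the top surfaced. A hedged sketch of a gentler loop (the one-second delay and the exact exceptions to swallow are assumptions):

import time

news_total = []
for page in range(1, 140):
    try:
        news_total.extend(parseListLinks(url.format(page)))
    except (requests.RequestException, json.JSONDecodeError, AttributeError) as e:
        # AttributeError covers the regex finding no JSONP wrapper in the response
        print('page %d skipped: %s' % (page, e))
    time.sleep(1)   # small delay between requests to be polite to the server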