"""Build a Chinese word cloud from a proverbs text file.

Pipeline: read the raw text, segment it with jieba (plus a custom user
dictionary), drop stopwords (a stopword file plus an inline classical-
Chinese list), count term frequencies with pandas, then render and save
a masked word cloud.
"""
import jieba
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Read the source text; the context manager guarantees the file is closed.
with open("E:/箴言资料.txt", mode='r', encoding='utf-8', newline=None) as file:
    content = file.read()

# Custom dictionary so jieba keeps domain-specific terms intact.
jieba.load_userdict('E:/箴言词库.txt')

# Keep only multi-character segments; single characters are mostly noise.
segments = [seg for seg in jieba.cut(content) if len(seg) > 1]
segmentDF = pd.DataFrame({'segment': segments})

# General-purpose Chinese stopword list loaded from file.
# quoting=3 (QUOTE_NONE) because the file contains unbalanced quote chars.
stopwords = pd.read_csv(
    "D:/StopwordsCN1.txt",
    encoding='utf8',
    index_col=False,
    quoting=3,
    sep="\t"
)
segmentDF = segmentDF[~segmentDF.segment.isin(stopwords.stopword)]

# Extra classical-Chinese particles / high-frequency words that the
# stopword file does not cover.
wyStopWords = pd.Series([
    '之', '其', '或', '亦', '方', '于', '即', '皆', '因', '仍', '故',
    '尚', '呢', '了', '的', '着', '一', '不', '乃', '呀', '吗', '咧',
    '啊', '把', '让', '向', '往', '是', '在', '越', '再', '更', '比',
    '很', '偏', '别', '好', '可', '便', '就', '但', '儿',  # 高频副词
    '又', '也', '都', '要',
    '这', '那', '你', '我', '他',
    '来', '去', '道', '笑', '说',
])
segmentDF = segmentDF[~segmentDF.segment.isin(wyStopWords)]

# Frequency table.  groupby().size() replaces the dict-renaming agg()
# that was removed in pandas 0.25, and sort_values replaces the removed
# sort_index(by=...) form.
segcount = (
    segmentDF.groupby('segment')
    .size()
    .reset_index(name='频数')
    .sort_values(by='频数', ascending=False)
)
words = segcount.set_index('segment').to_dict()

wc = WordCloud(
    font_path=r'D:/simhei.ttf', width=500, height=400,
    background_color='white', font_step=3,
    # plt.imread replaces scipy.misc.imread, which was removed in SciPy 1.2.
    mask=plt.imread("E:/书本.jpg"),
    random_state=False, prefer_horizontal=0.9
)
# fit_words takes a {word: frequency} mapping.
wc.fit_words(words['频数'])

plt.figure()
plt.imshow(wc)
plt.axis('off')
plt.show()
wc.to_file('E:/result.jpg')