The code is as follows:
from collections import Counter

import pandas as pd
from nltk.cluster import KMeansClusterer, cosine_distance
from sklearn.feature_extraction.text import TfidfVectorizer


def get_tfidf():
    try:
        with open('./clean.txt', "r", encoding='ANSI') as fr:
            lines = fr.readlines()
    except FileNotFoundError:
        print("no file like this")
        return None
    transformer = TfidfVectorizer()
    tfidf = transformer.fit_transform(lines)
    # Convert the sparse matrix to a dense array
    tfidf_arr = tfidf.toarray()
    return tfidf_arr
def get_cluster(tfidf_arr, k):
    # Split into k clusters, using cosine distance as the similarity measure
    kmeans = KMeansClusterer(num_means=k, distance=cosine_distance)
    kmeans.cluster(tfidf_arr)
    # Assign each document vector to a cluster
    kinds = pd.Series([kmeans.classify(i) for i in tfidf_arr])
    with open('./cluster.txt', 'a+', encoding='ANSI') as fw:
        for i, v in kinds.items():
            fw.write(str(i) + '\t' + str(v) + '\n')
def cluster_text():
    index_cluser = []
    try:
        with open('./cluster.txt', "r", encoding='ANSI') as fr:
            lines = fr.readlines()
    except FileNotFoundError:
        print("no file like this")
        return
    for line in lines:
        line = line.strip('\n')
        line = line.split('\t')
        index_cluser.append(line)
    # index_cluser[i][j] is the value in row i, column j
    try:
        with open('./clean.txt', "r", encoding='ANSI') as fr:
            lines = fr.readlines()
    except FileNotFoundError:
        print("no file like this")
        return
    for index, line in enumerate(lines):
        # iterate over all cluster assignments (hard-coded as range(28) in the original)
        for i in range(len(index_cluser)):
            if str(index) == index_cluser[i][0]:
                with open('Cluster' + index_cluser[i][1] + '.txt', 'a+', encoding='ANSI') as fw:
                    fw.write(line)
def get_title(cluster):
    for i in range(cluster):
        try:
            with open('Cluster' + str(i) + '.txt', "r", encoding='ANSI') as fr:
                lines = fr.readlines()
        except FileNotFoundError:
            print("no file like this")
            continue
        all_words = []
        for line in lines:
            line = line.strip('\n')
            line = line.split('\t')
            for word in line:
                all_words.append(word)
        c = Counter()
        for x in all_words:
            if len(x) > 1 and x != '\r\n':
                c[x] += 1
        print('Topic ' + str(i + 1) + '\nWord frequency statistics:')
        # Print the single most frequent word; raise the argument to show more top words
        for (k, v) in c.most_common(1):
            print(k, ':', v, '\n')
if __name__ == '__main__':
    # Number of clusters
    cluster = 10
    # Build the tf-idf matrix
    tfidf_arr = get_tfidf()
    print(tfidf_arr)
    print(tfidf_arr.shape)
    # K-means clustering
    get_cluster(tfidf_arr, cluster)
    # Write each document into its cluster file
    cluster_text()
    # Print the top word for each topic
    get_title(cluster)
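
For comparison, here is a minimal sketch that exercises the same TfidfVectorizer / KMeansClusterer calls on a few made-up toy documents instead of clean.txt, so the clustering step can be tested on its own. The documents are invented for illustration, and the repeats and avoid_empty_clusters arguments are optional extras added here (not in the code above) to make very small runs more stable.

from collections import Counter

from nltk.cluster import KMeansClusterer, cosine_distance
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus standing in for clean.txt (one "document" per string)
docs = [
    "machine learning model training data",
    "deep learning neural network training",
    "stock market price index trading",
    "market trading price daily index",
]

tfidf_arr = TfidfVectorizer().fit_transform(docs).toarray()
print(tfidf_arr.shape)          # (4, vocabulary size)

# repeats and avoid_empty_clusters make tiny runs less likely to fail
kmeans = KMeansClusterer(num_means=2, distance=cosine_distance,
                         repeats=5, avoid_empty_clusters=True)
labels = kmeans.cluster(tfidf_arr, assign_clusters=True)
print(labels)                   # e.g. [0, 0, 1, 1]

# Top word per cluster, mirroring what get_title() does per file
for cid in set(labels):
    words = " ".join(d for d, lab in zip(docs, labels) if lab == cid).split()
    print('Topic', cid, Counter(words).most_common(1))

If this toy version runs cleanly, the problem is more likely in the file handling (encoding, the cluster.txt contents, or the per-cluster files) than in the clustering calls themselves.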
Running results:
The tfidf matrix can be printed,
but