Extracting Keywords from Text with Python and TF-IDF
https://www.jb51.net/article/245943.htm
from sklearn.feature_extraction.text import TfidfVectorizer

# A tiny sample corpus so this first snippet runs on its own
corpus = ["this is the first document",
          "and this is the second one"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(X.toarray())  # one row per document, one column per vocabulary term
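The raw matrix is hard to read on its own. A quick way to interpret it (not part of the original listing) is to map each row's largest scores back to vocabulary terms with get_feature_names_out(), which is the same move the keyword-extraction step relies on later:

import numpy as np

# Show the 3 highest-scoring vocabulary terms for each document
terms = vectorizer.get_feature_names_out()
for row in X.toarray():
    top = np.argsort(row)[::-1][:3]
    print([(terms[i], round(row[i], 3)) for i in top])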
import os
import re
import string

import numpy as np
import pandas as pd
import spacy

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, sent_tokenize, pos_tag

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
path = "./data/theses100/"
all_files = os.listdir(path + "docsutf8")
all_keys = os.listdir(path + "keys")
print(len(all_files), " files\n", all_files,
      "\n", all_keys)  # listing order is not guaranteed
all_documents = []
all_keys = []
all_files_names = []
for i, fname in enumerate(all_files):
    with open(path + 'docsutf8/' + fname) as f:
        lines = f.readlines()
    key_name = fname[:-4]  # strip the ".txt" extension
    with open(path + 'keys/' + key_name + '.key') as f:
        k = f.readlines()
    all_text = ' '.join(lines)
    keyss = ' '.join(k)
    all_documents.append(all_text)
    all_keys.append(keyss.split("\n"))  # gold keywords, one per line
    all_files_names.append(key_name)
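A quick sanity check (not in the original listing) confirms the documents, gold keys, and file names stayed aligned:

assert len(all_documents) == len(all_keys) == len(all_files_names)
print(all_files_names[0], all_keys[0][:3])  # first file and its first few gold keys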
dtf = pd.DataFrame({'goldkeys': all_keys,
                    'text': all_documents})
dtf.head()
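The next line calls a preprocess_text helper that this excerpt never defines. Below is a minimal sketch consistent with the imports above (tokenize, lowercase, drop punctuation and stopwords, lemmatize); the original article's version may differ in detail:

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize and lowercase, keep alphabetic non-stopword tokens, lemmatize
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    return [lemmatizer.lemmatize(t) for t in tokens]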
dtf['cleaned_text'] = dtf.text.apply(lambda x: ' '.join(preprocess_text(x)))
dtf.head()
# Clean the gold keywords: apply the same preprocessing and drop empty entries
def clean_orginal_kw(orginal_kw):
    orginal_kw_clean = []
    for doc_kw in orginal_kw:
        temp = []
        for t in doc_kw:
            tt = ' '.join(preprocess_text(t))
            if len(tt.split()) > 0:
                temp.append(tt)
        orginal_kw_clean.append(temp)
    return orginal_kw_clean

orginal_kw = clean_orginal_kw(dtf['goldkeys'])
orginal_kw[0:1]
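With the texts cleaned and the gold keywords normalized, the extraction step this walkthrough builds toward is to fit TfidfVectorizer on the cleaned texts and take each document's top-scoring terms as its keywords. A sketch under assumed settings (top_n=10 and the unigram/bigram range are illustrative choices, not necessarily the article's exact configuration):

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(dtf['cleaned_text'])
terms = vectorizer.get_feature_names_out()

def top_tfidf_keywords(row, top_n=10):
    # Return the top_n terms of one TF-IDF row, highest score first
    scores = row.toarray().ravel()
    idx = np.argsort(scores)[::-1][:top_n]
    return [terms[i] for i in idx if scores[i] > 0]

extracted = [top_tfidf_keywords(X[i]) for i in range(X.shape[0])]
# Overlap with the cleaned gold keywords for the first document:
print(set(extracted[0]) & set(orginal_kw[0]))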