在 TF-IDF 特征提取的基础上, 分别采用逻辑回归、 朴素贝叶斯两种模型对文本数据的分类效果进行比较(准确率、召回率、F1);逻辑回归对分类标签种类比较(准确率、召回率、F1)
数据来源:https://github.com/aceimnorstuvwxz/toutiao-text-classfication-dataset
import numpy as np
import pandas as pd
import time
import jieba
import re
import string
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
# # Pandas设置
# pd.set_option("display.max_columns", None) # 设置显示完整的列
# pd.set_option("display.max_rows", None) # 设置显示完整的行
# pd.set_option("display.expand_frame_repr", False) # 设置不折叠数据
# pd.set_option("display.max_colwidth", 100) # 设置列的最大宽度
# 加载数据集
def data():
a=[]
with open(r'D:\桌面\文本分类\data.txt',encoding='utf-8') as f:
a.append(f.readlines())
df = pd.DataFrame(a[0])
df['label']=df[0].apply(lambda x:x.split('_!_')[1]) #对第二个!前的数据名称设置为label
df['label_desc']=df[0].apply(lambda x:x.split('_!_')[2]) #对第三个!前的数据名称设置为label_desc
df['sentence']=df[0].apply(lambda x:x.split('_!_')[3]) #对第四个!前的数据名称设置为sentence
return df.iloc[0:,1:]
df=data()
#去重仅保留第一次出现的评论
df=df.drop_duplicates(keep='first',inplace=False)
#去除空白行
df= df.dropna(axis=0)
STOPWORDS = r'D:\桌面\文本分类\停用词\stopwords-master\哈工大停用词表.txt'
stoplist = []
content_list = []
for word in open(STOPWORDS,encoding='utf-8'):
stoplist.append(word.strip()) #添加停用词表
#去除无效符号
def qu_dian(text):
punc = r'[~`!#$%^&*()_+-=|\';":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》《{}]'
ff=re.sub(punc, "",text)
f1=jieba.lcut(ff,cut_all=False) #中文分词
def rn(x):
return x.replace(' ','')
ben=list(filter(rn,f1))
return ben
df['quchu']=df['sentence'].apply(qu_dian)
#去除停用词
def qu_word(x):
real=[]
for i in x:
if i not in stoplist:
real.append(i)
return real
df['quchu']=df['quchu'].apply(qu_word)
# 实例化CountVectorizer
vectorizer = CountVectorizer()
# 将文本数据转化为特征矩阵
X = vectorizer.fit_transform(df['sentence'].tolist())
# 实例化TfidfTransformer
transformer = TfidfTransformer()
# 对特征矩阵进行tf-idf转换
X = transformer.fit_transform(X)
# 实例化StratifiedKFold
skf = StratifiedKFold(n_splits=5)
# 对数据进行划分
for train_index, test_index in skf.split(X, df['label']):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = df['label'][train_index], df['label'][test_index]