import pandas as pd
# Load the SMS corpus: tab-separated, read with header=None, so the two
# columns are named explicitly. Only the first 10,000 rows are read.
data = pd.read_csv(
    'D:/机器学习课程设计/noteData.txt',
    sep='\t',
    header=None,
    names=["标签", "短信内容"],
    nrows=10000,
)
# Preview of the first rows; the result is discarded unless this is the last
# expression of an interactive/notebook cell.
data.head()
import jieba
# Raise jieba's logger threshold to INFO (presumably to silence its DEBUG
# start-up messages).
jieba.setLogLevel(jieba.logging.INFO)


def _segment(text):
    """Return *text* segmented by jieba, with tokens joined by single spaces.

    Missing cells (NaN/None) become an empty string and any other non-string
    cell is converted with ``str`` first, because ``jieba.cut`` only accepts
    text and would otherwise raise on such values.
    """
    if pd.isna(text):
        return ''
    return ' '.join(jieba.cut(str(text)))


# Space-separated tokens are the input format the TF-IDF vectorizer expects.
data['分词后数据'] = data["短信内容"].apply(_segment)
# Preview only; has no effect outside an interactive/notebook session.
data.head()

# Features (segmented text) and target (label column) for the classifier.
X = data['分词后数据']
y = data['标签']
# Load the custom stop-word list, one word per line.
# The context manager closes the file even if reading fails.
# NOTE(review): the file is assumed to be UTF-8 (with or without BOM); without
# an explicit encoding, open() uses the platform default (e.g. GBK on Chinese
# Windows) and can raise UnicodeDecodeError. Switch to encoding='gbk' if the
# file was saved in that encoding.
with open('D:/机器学习课程设计/my_stop_words.txt', 'r', encoding='utf-8-sig') as f:
    my_stop_words_data = f.readlines()

# Strip all surrounding whitespace (including '\r' from Windows line endings)
# and drop blank lines, which could never match a token anyway.
my_stop_words_list = [
    word
    for word in (line.strip() for line in my_stop_words_data)
    if word
]
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# 10-fold stratified cross-validation of a TF-IDF + multinomial naive Bayes
# pipeline, followed by labelling of every message in `data`.
# NOTE(review): the original indentation was lost; the loop body is assumed to
# run from the train/test split through print(score) — confirm.
skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so select rows with .iloc; plain
    # X[train_index] is a label lookup and only works by coincidence when the
    # index is the default 0..n-1 range.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh pipeline per fold so that no fitted state carries over.
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words=my_stop_words_list)),
        ('clf', MultinomialNB(alpha=1.0)),
    ])
    pipeline.fit(X_train, y_train)

    # Predict on the held-out fold and report the pipeline's score for it.
    predict = pipeline.predict(X_test)
    score = pipeline.score(X_test, y_test)
    print(score)

# Label every message using the pipeline fitted in the last fold. Note that
# this includes rows that pipeline was trained on.
data["数据类型"] = pipeline.predict(X)
# NOTE(review): assumes the label column holds the integer 1 for spam; any
# other predicted value is reported as a normal message — confirm.
data['数据类型'] = data["数据类型"].apply(
    lambda x: "垃圾短信" if x == 1 else "正常短信"
)
# Preview only; has no effect outside an interactive/notebook session.
data.head()
# 可以帮我看一下这段代码吗？总是出现这样的报错，有什么办法可以解决吗？