基于数据库 problem 表中的问题描述(problem desc)和问题类型(problem type,共十种)建立模型,实时地对数据库新增的问题描述给出对应问题类型的相似度排序。请提供 Python 代码,谢谢!
目前用的是随机森林算法,但是结果不太好(Accuracy: 0.41),如何优化算法,或者有没有更好的算法模型?还有个问题请教:如何把它变成一个服务,通过接口的方式接收业务系统数据库新增的问题描述数据(发现项),经算法得到结果后,再把OPL类型(problem type)通过接口的形式传递并存储到业务系统的数据库?
Jupyter代码
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
# Load the records from worksheet "Sheet0" of the Excel workbook.
data = pd.read_excel(
    "C:/Users/xiao/Desktop/Desk/dream/audit/oplall.xls",
    sheet_name="Sheet0",
)
# Split into training and test data: 20% of the rows are held out, and the
# fixed seed makes the split reproducible across runs.
feature_frame = data[["发现项", "条款编号"]]
target_series = data["OPL类型"]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.2,
    random_state=42,
)
# Convert the free-text finding column into numeric TF-IDF features.
#
# The default analyzer is word-based and tokenizes with the regex
# (?u)\b\w\w+\b.  Text written without spaces between words (as Chinese is)
# is then cut only at punctuation, so every clause becomes one long, almost
# unique "word" that rarely reappears in other documents and carries little
# signal.  Character 1- and 2-grams need no word segmentation and no extra
# dependency.
# NOTE(review): assumes "发现项" holds unsegmented Chinese text -- if it is
# already space-separated, the word analyzer may be preferable; confirm.
tfidf_vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 2),
    max_features=5000,
)
# A missing cell is not a valid document for the vectorizer; treat it as an
# empty text and make sure every entry is a string.
train_text = X_train["发现项"].fillna("").astype(str)
test_text = X_test["发现项"].fillna("").astype(str)
# The vocabulary and IDF weights are learned from the training split only and
# then applied unchanged to the test split.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_text)
X_test_tfidf = tfidf_vectorizer.transform(test_text)
# Convert the clause-number column to integer codes.
#
# train_test_split hands back slices of `data`; take explicit copies so the
# column assignments below modify independent frames and do not trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()
clause_encoder = LabelEncoder()
X_train["条款编号"] = clause_encoder.fit_transform(X_train["条款编号"])
# LabelEncoder.transform raises ValueError for any value it did not see during
# fit, which happens as soon as a clause number occurs only in the test split
# (or in newly arriving data).  Look the values up in the fitted classes
# instead: known values get the same code the encoder assigned (their position
# in classes_), unknown values get -1.
known_clauses = pd.Index(clause_encoder.classes_)
X_test["条款编号"] = known_clauses.get_indexer(X_test["条款编号"])
import numpy as np
from scipy.sparse import hstack
# Append the integer-coded clause column to the TF-IDF matrix as one extra
# feature column.  Selecting with a one-element list yields an (n_rows, 1)
# array, the shape hstack needs for a single column.
train_clause_column = X_train[["条款编号"]].to_numpy()
test_clause_column = X_test[["条款编号"]].to_numpy()
X_train_combined = hstack([X_train_tfidf, train_clause_column])
X_test_combined = hstack([X_test_tfidf, test_clause_column])
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
# Hyper-parameter candidates: every (n_estimators, max_depth) combination is
# evaluated.  max_depth=None lets the trees grow without a depth limit.
param_grid = dict(
    n_estimators=[20, 30, 50, 80, 100, 200, 300],
    max_depth=[None, 1, 2, 3, 5, 10, 15],
)
# Random forest classifier; the fixed seed keeps runs reproducible.
rf_classifier = RandomForestClassifier(random_state=42)
# Exhaustive grid search scored by accuracy with 5-fold cross-validation.
grid_search = GridSearchCV(
    rf_classifier,
    param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
)
# Fit the grid search on the training data.
grid_search.fit(X_train_combined, y_train)
# Keep the estimator that the grid search selected.
best_rf_classifier = grid_search.best_estimator_
# Report the winning hyper-parameter combination.
print("Best parameters set found on development set:")
print()
print(grid_search.best_params_)
# Predict the held-out test set.
y_pred = best_rf_classifier.predict(X_test_combined)
# Evaluate the model: each metric is computed and printed in turn, in the
# order accuracy, per-class report, confusion matrix.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))
# Build a review table: the original (un-encoded) columns of the held-out rows
# plus the model's prediction.  y_pred is in the same row order as y_test, so
# it lines up positionally with the rows selected by y_test's index.
test_indices = y_test.index
review_columns = ['条款编号', '发现项', 'OPL类型']
test_df = data.loc[test_indices, review_columns].assign(**{'预测结果': y_pred})
# Last expression of the cell: displays the first 30 rows in Jupyter.
test_df.head(30)