在执行模型预测时发生代码错误:ValueError: Error when checking input: expected embedding_1_input to have shape (100,) but got array with shape (3,)
测试代码:
from keras.models import load_model
import pandas as pd
import numpy as np
import jieba
import jieba.posseg as pseg
import re
#import csv
import string
#from keras import models
from keras import layers
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
#from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import word2vec
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
#from keras.models import Model
#from sklearn import metrics
from keras.models import load_model
from keras.models import Sequential
import matplotlib.pyplot as plt
# Load the trained TextCNN classifier from disk.
# NOTE(review): the training script saves its model as "TextCNN2"; confirm that
# "TextCNN" is the checkpoint you actually want to run predictions with.
model = load_model('TextCNN')
train_data = pd.read_csv('酒店分类.csv', encoding='GB18030', lineterminator='\n')
test_data = pd.read_csv('酒店分类.csv', encoding='GB18030', lineterminator='\n')

# Why the original call failed:
# the network starts with an Embedding layer, so it expects integer word ids of
# shape (n_samples, 100). `model.predict(test_data)` handed Keras the raw
# DataFrame, i.e. one row of 3 CSV columns per sample, hence
# "expected embedding_1_input to have shape (100,) but got array with shape (3,)".
# The review text has to go through exactly the same preprocessing that was
# used for training before it can be fed to the model.

# Compiled once; strips ASCII punctuation exactly like the training script.
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
# Part-of-speech tags that the training script removed from every review.
_DROPPED_POS = frozenset(["nr", "ns", "nt", "nz", "m", "f", "ul", "l", "r", "t"])


def _load_stopwords(path='中文停用词表.txt'):
    """Return the stop words stored one per line in *path* as a set."""
    with open(path, encoding='UTF-8') as fh:
        return {line.strip() for line in fh}


def _cut_reviews(reviews, stopwords):
    """Tokenise each review the same way the training script's wordCut does.

    Steps per review: strip ASCII punctuation, segment with jieba, drop stop
    words, glue the survivors back together, re-segment with POS tagging and
    drop the tags listed in ``_DROPPED_POS``.
    """
    cut = []
    for text in reviews:
        text = _PUNCT_RE.sub('', text)
        kept = "".join(
            tok for tok in jieba.lcut(text)
            if tok not in stopwords and tok != "\n"
        )
        cut.append(
            [word for word, flag in pseg.cut(kept) if flag not in _DROPPED_POS]
        )
    return cut


def _load_training_tokenizer(path='wordCut.txt'):
    """Rebuild the word -> id mapping from the corpus dumped during training.

    The training script wrote every tokenised review to *path* and fitted its
    Tokenizer on that file, so fitting on the same file here reproduces the
    same ids.
    """
    with open(path, encoding='UTF-8') as fh:
        corpus = [line.strip().split(" ") for line in fh]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    return tokenizer


# Sequence length the model was built with (100 for this network); reading it
# from the model keeps this script in sync if the model is retrained.
max_len = model.input_shape[1]

test_cut = _cut_reviews(test_data['评论内容'], _load_stopwords())
test_ids = _load_training_tokenizer().texts_to_sequences(test_cut)
test_seq = pad_sequences(test_ids, maxlen=max_len)

# One row of class probabilities per review.
predictions = model.predict(test_seq)
print(predictions)
模型代码:
import re
import string

import jieba
import jieba.posseg as pseg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import word2vec
from keras import layers
from keras.layers import *
from keras.models import Sequential
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Load the data set. The CSV is expected next to this script/notebook.
# NOTE(review): train and test are read from the same file, so the evaluation
# further down is measured on data the model was trained on.
train_data = pd.read_csv('酒店分类.csv', encoding='GB18030', lineterminator='\n')
test_data = pd.read_csv('酒店分类.csv', encoding='GB18030', lineterminator='\n')


# ---------------------------------------------------------------------------
# Pre-processing helpers
# ---------------------------------------------------------------------------

def encodeLabel(data):
    """Return the '分类' column of *data* encoded as integer class ids.

    A fresh LabelEncoder is fitted on every call, so the id assigned to a
    label depends on the set of labels present in *data*.
    """
    return LabelEncoder().fit_transform(list(data['分类']))


def getReview(data):
    """Return the '评论内容' column of *data* as a plain list."""
    return list(data['评论内容'])


def stopwordslist():
    """Load the stop-word list (one word per line) from '中文停用词表.txt'.

    The path is relative, so the file must sit in the working directory.
    """
    with open('中文停用词表.txt', encoding='UTF-8') as fh:
        return [line.strip() for line in fh]


def deleteStop(sentence, stopwords=None):
    """Concatenate the tokens of *sentence* that are not stop words.

    Args:
        sentence: iterable of tokens (e.g. the output of ``jieba.lcut``).
        stopwords: optional container of stop words. When omitted the list is
            loaded from disk, which is what the original one-argument call did.

    Returns:
        The surviving tokens joined into one string with no separator.
    """
    if stopwords is None:
        stopwords = stopwordslist()
    return "".join(tok for tok in sentence if tok not in stopwords and tok != "\n")


# Compiled once instead of once per review.
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
# POS tags to discard (person/place/organisation names, numerals, etc.).
_DROPPED_POS = frozenset(["nr", "ns", "nt", "nz", "m", "f", "ul", "l", "r", "t"])


def wordCut(Review):
    """Tokenise every review in *Review* and return a list of token lists.

    Per review: strip ASCII punctuation, segment with jieba (accurate mode),
    remove stop words, then re-segment with POS tagging and drop the tags in
    ``_DROPPED_POS``.
    """
    # Load the stop words once (as a set) rather than re-reading the file for
    # every single review.
    stopwords = set(stopwordslist())
    Mat = []
    for rec in Review:
        rec = _PUNCT_RE.sub('', rec)
        stc = deleteStop(jieba.lcut(rec), stopwords)
        Mat.append(
            [word for word, flag in pseg.cut(stc) if flag not in _DROPPED_POS]
        )
    return Mat


# ---------------------------------------------------------------------------
# Build the corpus
# ---------------------------------------------------------------------------

trainLable = encodeLabel(train_data)
testLable = encodeLabel(test_data)
print(testLable)

trainReview = getReview(train_data)
testReview = getReview(test_data)
print(testReview)

trainCut = wordCut(trainReview)
testCut = wordCut(testReview)
print(testCut)

# NOTE: from here on the name `wordCut` refers to the tokenised corpus and no
# longer to the function above; the name is kept because later code relies on it.
wordCut = trainCut + testCut

# Dump the tokenised corpus so that a deployed service (e.g. Flask) can rebuild
# the same Tokenizer and apply identical preprocessing to incoming text.
with open('wordCut.txt', 'w', encoding='UTF-8') as fileDic:
    for tokens in wordCut:
        fileDic.write(" ".join(tokens))
        fileDic.write('\n')

with open('wordCut.txt', encoding='UTF-8') as fh:
    words = [line.strip().split(" ") for line in fh]
print(words)

maxLen = 100  # every review is padded/truncated to this many tokens

# ---------------------------------------------------------------------------
# word2vec training
# ---------------------------------------------------------------------------

num_featrues = 100   # dimensionality of the word vectors
min_word_count = 3   # ignore words rarer than this
num_workers = 4      # CPU worker threads used for training
context = 4          # context window size

# The original script trained word2vec twice and threw the first (normalised)
# model away; only the second, un-normalised model was ever used, so a single
# training run is kept here.
w2v_model = word2vec.Word2Vec(
    wordCut,
    workers=num_workers,
    size=num_featrues,
    min_count=min_word_count,
    window=context,
)
print(w2v_model)

# ---------------------------------------------------------------------------
# Text -> padded id sequences
# ---------------------------------------------------------------------------

# fit_on_texts numbers every word by frequency (more frequent -> smaller id).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(words)
vocab = tokenizer.word_index

trainID = tokenizer.texts_to_sequences(trainCut)
testID = tokenizer.texts_to_sequences(testCut)
# Short sequences are left-padded with zeros.
trainSeq = pad_sequences(trainID, maxlen=maxLen)
testSeq = pad_sequences(testID, maxlen=maxLen)

# One-hot encode the labels.
trainCate = to_categorical(trainLable, num_classes=5)
testCate = to_categorical(testLable, num_classes=5)

# Embedding weight matrix: row i holds the word2vec vector of the word whose
# tokenizer id is i. Words missing from word2vec (below min_count, or altered
# by the Tokenizer) keep an all-zero row.
embedding_matrix = np.zeros((len(vocab) + 1, num_featrues))
for word, i in vocab.items():
    try:
        embedding_matrix[i] = w2v_model.wv[word]
    except KeyError:
        continue

# ---------------------------------------------------------------------------
# TextCNN model
# ---------------------------------------------------------------------------

# Frozen embedding initialised from the pre-trained word2vec vectors.
embedder = Embedding(
    len(vocab) + 1,
    num_featrues,
    input_length=maxLen,
    weights=[embedding_matrix],
    trainable=False,
)

model = Sequential()
model.add(embedder)
model.add(Conv1D(256, 3, padding='same', activation='relu'))
model.add(MaxPool1D(maxLen - 5, 3, padding='same'))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(Flatten())
model.add(Dropout(0.3))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=5, activation='softmax'))
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

history = model.fit(
    trainSeq,
    trainCate,
    batch_size=256,
    epochs=66,
    validation_split=0.2,
)
model.save("TextCNN2")

# ---------------------------------------------------------------------------
# Prediction and evaluation
# ---------------------------------------------------------------------------

mainModel = load_model('TextCNN2')
result = mainModel.predict(testSeq)  # class probabilities per sample
print(result)
print(np.argmax(result, axis=1))
score = mainModel.evaluate(testSeq, testCate, batch_size=64)
print(score)


def _plot_history(metric, title, ylabel):
    """Plot the training and validation curves of *metric* from `history`."""
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Valid'], loc='upper left')
    plt.show()


_plot_history('accuracy', 'Model accuracy', 'Accuracy')
_plot_history('loss', 'Model loss', 'Loss')

# Imported late, as in the original script, so that a missing tensorflow or
# pydotplus installation cannot prevent training from running.
from tensorflow.keras.utils import plot_model
import pydotplus

# Arguments: model, output image path, whether to show tensor shapes.
plot_model(model, to_file='textCNN_model1.png', show_shapes=True)