weixin_45486045
2021-05-05 00:34

Error when running model prediction: expected embedding_1_input to have shape (100,)

A code error occurs when running the model prediction: ValueError: Error when checking input: expected embedding_1_input to have shape (100,) but got array with shape (3,)

Test code:

from keras.models import load_model
import pandas as pd
import numpy as np
import jieba
import jieba.posseg as pseg
import re
#import csv
import string
#from keras import models
from keras import layers
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
#from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import word2vec
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
#from keras.models import Model
#from sklearn import metrics
from keras.models import Sequential
import matplotlib.pyplot as plt


# Load the saved model (note: the training script below saves to 'TextCNN2', while 'TextCNN' is loaded here)
model = load_model('TextCNN')
train_data = pd.read_csv('酒店分类.csv', encoding='GB18030',lineterminator='\n')
test_data=pd.read_csv('酒店分类.csv',encoding='GB18030', lineterminator='\n')
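# NOTE: predict is called on the raw DataFrame below -- this is the line that raises the ValueError above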
predictions = model.predict(test_data)
print(predictions)
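The shape (3,) in the error very likely corresponds to the three columns of the raw DataFrame: predict receives one 3-element row per sample, while the Embedding layer was built with input_length=100. In the training script below, every review is segmented, tokenized, and padded to maxLen=100 before it reaches the network, so a prediction input would presumably need the same treatment. A minimal sketch, assuming the tokenizer fit from the training script is available and that the reviews sit in the '评论内容' column (an illustration of the shape requirement, not a verified fix):

from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxLen = 100                                  # must match the Embedding layer's input_length
reviews = test_data['评论内容'].astype(str).tolist()
# NOTE: the training script also segments with jieba and strips stop words;
# the same cleanup would be needed here so the token ids match.
seqs = tokenizer.texts_to_sequences(reviews)  # words -> integer indices
padded = pad_sequences(seqs, maxlen=maxLen)   # shape: (n_samples, 100)
predictions = model.predict(padded)
print(np.argmax(predictions, axis=1))         # predicted class per review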


Model code:

import pandas as pd
import numpy as np
import jieba
import jieba.posseg as pseg
import re
#import csv
import string
#from keras import models
from keras import layers
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
#from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import word2vec
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
#from keras.models import Model
#from sklearn import metrics
from keras.models import load_model
from keras.models import Sequential
import matplotlib.pyplot as plt
# Read in the datasets; lineterminator is the line separator. By default the notebook saves files under the C: drive user directory.
# newTrain.csv and newTest.csv sit in the same directory as this notebook file (Untitled5.ipynb).
train_data = pd.read_csv('酒店分类.csv', encoding='GB18030',lineterminator='\n')
test_data=pd.read_csv('酒店分类.csv',encoding='GB18030', lineterminator='\n')

# Data preprocessing:
# Use LabelEncoder to normalize the data labels
def encodeLabel(data):
    listLable=[]
    # Note: I misspelled 'label' as 'lable' throughout; if you rename it, remember to also rename the 'lable' header in the CSV loaded above
    for lable in data['分类']:
        listLable.append(lable)
    # Up to here the labels are just collected; below they are encoded
    le = LabelEncoder()
    resultLable=le.fit_transform(listLable)
    return resultLable

trainLable=encodeLabel(train_data)
testLable=encodeLabel(test_data)
# Print testLable to see what it looks like
print(testLable)
# This returns the collection of all reviews:
def getReview(data):
    listReview=[]
    for review in data['评论内容']:
        listReview.append(review)
    return listReview

trainReview=getReview(train_data)
testReview=getReview(test_data)
# Print testReview to see what it looks like
print(testReview)
# Word segmentation:
def stopwordslist():  # load the stop-word list; 中文停用词表.txt must also sit in the same directory, since relative paths are used
    stopwords = [line.strip() for line in open('中文停用词表.txt',encoding='UTF-8').readlines()]
    return stopwords

def deleteStop(sentence):     # remove stop words
    stopwords=stopwordslist()
    outstr=""
    for i in sentence:
        if i not in stopwords and i!="\n":
            outstr+=i
    return outstr
def wordCut(Review):
    Mat=[]
    for rec in Review:
        seten=[]
        rec = re.sub('[%s]' % re.escape(string.punctuation), '',rec)
        fenci=jieba.lcut(rec)    # segment with jieba in precise mode
        stc=deleteStop(fenci)     # remove stop words
#         sentence = list(map(lambda x: x.strip().lower() if len(x.strip().lower()) > 0 else None, jieba.cut(stc)))  # pull out the words of each sentence
        seg_list=pseg.cut(stc)    # tag parts of speech
        for word,flag in seg_list:
            if flag not in ["nr","ns","nt","nz","m","f","ul","l","r","t"]:  # drop words with these POS tags (person names, place names, etc.)
                seten.append(word)
        Mat.append(seten)
    return Mat
trainCut=wordCut(trainReview)
testCut=wordCut(testReview)
# See what testCut looks like; comment out the print below if you don't want the output
print(testCut)
wordCut=trainCut+testCut   # note: this rebinds the name wordCut, shadowing the function defined above
# The lines below save all the tokens to wordCut.txt so that the same preprocessing can be applied to incoming data when the model is deployed with Flask (also explained in my video)
fileDic=open('wordCut.txt','w',encoding='UTF-8')
for i in wordCut:
    fileDic.write(" ".join(i))
    fileDic.write('\n')
fileDic.close()
# We can take a look at what was read back, but there is far too much to display
words = [line.strip().split(" ") for line in open('wordCut.txt',encoding='UTF-8').readlines()]
print(words)

maxLen=100
# Train word2vec:
# dimensionality of the word vectors
num_features = 100
# minimum frequency for a word to be considered
min_word_count = 3
# number of CPU cores used for parallel training
num_workers = 4
# context window size
context = 4
model = word2vec.Word2Vec(wordCut, workers=num_workers, size=num_features, min_count=min_word_count, window=context)
# force unit-normalization of the word vectors
model.init_sims(replace=True)
# to save the trained model, call model.save() with a path; the target directory (e.g. ./data/model) must already exist
print(model)
# to reuse a previously trained word2vec model, load it with word2vec.Word2Vec.load() instead of retraining

# fit_on_texts assigns every word in the input texts an index based on frequency: the higher the frequency, the smaller the index
tokenizer=Tokenizer()
tokenizer.fit_on_texts(words)
vocab = tokenizer.word_index  # index of every word; stop words have already been removed from vocab
#print(vocab)
# encode the features as integer sequences; shorter ones are padded with 0 at the front
trainID = tokenizer.texts_to_sequences(trainCut)
# print(trainID)
testID = tokenizer.texts_to_sequences(testCut)
trainSeq=pad_sequences(trainID,maxlen=maxLen)
#print(trainSeq)
testSeq=pad_sequences(testID,maxlen=maxLen)

# One-hot encode the labels
trainCate = to_categorical(trainLable, num_classes=5)  # convert labels to one-hot encoding
#print(trainCate)
testCate= to_categorical(testLable, num_classes=5)  # convert labels to one-hot encoding
# Build a custom Embedding weight matrix from the trained word2vec model; each row is one word's vector (think one-hot vector times matrix)
embedding_matrix = np.zeros((len(vocab) + 1, 100))
for word, i in vocab.items():
    try:
        embedding_vector = model[str(word)]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        continue

# Build the model
# Word embedding layer (pre-trained word2vec vectors as a custom weight matrix; 100 is the output word-vector dimension)
embedder = Embedding(len(vocab) + 1, 100, input_length=maxLen, weights=[embedding_matrix], trainable=False)
model=Sequential()
model.add(embedder)
model.add(Conv1D(256,3,padding='same',activation='relu'))
model.add(MaxPool1D(maxLen-5,3,padding='same'))
model.add(Conv1D(32,3,padding='same',activation='relu'))
model.add(Flatten())
model.add(Dropout(0.3))
model.add(Dense(256,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=5,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history=model.fit(trainSeq, trainCate, batch_size=256, epochs=66,validation_split=0.2)
model.save("TextCNN2")
# Prediction and evaluation
mainModel = load_model('TextCNN2')
result = mainModel.predict(testSeq)  # per-class probability for each sample
print(result)
print(np.argmax(result,axis=1))
score = mainModel.evaluate(testSeq,
                           testCate,
                           batch_size=64)
print(score)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train','Valid'],loc='upper left')
plt.show()

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train','Valid'],loc='upper left')
plt.show()
from tensorflow.keras.utils import plot_model
import pydotplus
# arguments: the model, where to save the structure diagram, whether to show layer shapes
plot_model(model,to_file='textCNN_model1.png',show_shapes=True)
