# Question: this is a CNN text-classification model. On my Mac it reaches
# ~90% accuracy, but on a Windows server it only gets ~25% -- what could be
# the cause?  (Original note was bare Chinese text, which is a SyntaxError
# in a .py file; converted to a comment.)
# BUGFIX: the original read "from future import print_function", which
# imports from the third-party "future" backport package (ImportError when
# it is not installed).  The intended module is the builtin __future__.
from __future__ import print_function

import os

import numpy as np
import pandas as pd
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
print('Loading Dict')

# Build a dict mapping each GloVe token to its 100-d float32 vector.
#
# BUGFIX: the original called open() with no explicit encoding, so the
# platform-default codec was used (UTF-8 on macOS, cp1252/cp936 on Windows).
# The GloVe file is UTF-8; decoding it with a Windows codepage silently
# corrupts tokens, which is a likely cause of the mac-vs-Windows accuracy
# gap reported at the top of this file.  Pin encoding='utf-8' and use a
# context manager so the file is closed even on error.
embeddings_index = {}
with open(os.path.join('glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        # Remaining whitespace-separated fields are the coefficients.
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loading dataset')
# Load the training split.
# NOTE(review): read_csv is called without header=None -- if the CSV has no
# header row (AG-News-style dumps usually do not), pandas consumes the first
# data row as column names, silently dropping one sample.  No explicit
# encoding is given either, so the platform-default codec is used, which
# differs between macOS and Windows.  Both are worth confirming as causes of
# the cross-platform accuracy gap described at the top of the file.
tmp=pd.read_csv('train.csv')
# Column 2 is assumed to hold the document text -- TODO confirm schema.
train_X=np.array(tmp.iloc[:,2]).astype('str')
# Column 0 is assumed to hold the integer class label.
train_y=np.array(tmp.iloc[:,0]).astype('int16')
# One-hot encode labels; to_categorical sizes the matrix to max(label)+1,
# which is why the final Dense layer below has 5 units for 1..4 labels.
train_y_ohe = np_utils.to_categorical(train_y)
del tmp
# Load the test split the same way (same caveats as above).
tmp=pd.read_csv('test.csv')
test_X=np.array(tmp.iloc[:,2]).astype('str')
test_y=np.array(tmp.iloc[:,0]).astype('int16')
test_y_ohe = np_utils.to_categorical(test_y)
del tmp
# Cast one-hot targets to float32 for the Keras loss computation.
train_y_ohe=train_y_ohe.astype('float32')
test_y_ohe=test_y_ohe.astype('float32')
# Concatenate train + test text so the tokenizer sees the full vocabulary.
X=np.append(train_X,test_X)
print('Tokening')

# Fit a word index over the combined train+test text so both splits share
# one vocabulary.
t = Tokenizer()
t.fit_on_texts(X)
vocab_size = len(t.word_index) + 1  # +1: index 0 is reserved for padding

# Integer-encode the documents.
# BUGFIX: in the original file this line and the "pad documents" line below
# were bare text without a leading '#', which is a SyntaxError in Python.
encoded_X = t.texts_to_sequences(X)

# Pad/truncate every document to a fixed length of max_length tokens.
max_length = 50
padded_X = pad_sequences(encoded_X, maxlen=max_length, padding='post')

# Embedding matrix: row i holds the GloVe vector for word index i; rows for
# out-of-vocabulary words remain all-zero.
embedding_matrix = np.zeros((vocab_size, 100)).astype('float32')
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Split back into train/test.
# BUGFIX: the original used hard-coded boundaries 0:119999 / 119999:127598.
# If the CSVs do not contain exactly those row counts (e.g. a header row was
# consumed, or the files differ between machines), the features silently
# misalign with train_y_ohe/test_y_ohe -- the labels shift by one or more
# rows, which alone can drive accuracy to chance level.  Deriving the split
# point from train_X guarantees alignment.  Slicing padded_X also avoids
# redundantly re-running pad_sequences on the same data.
n_train = train_X.shape[0]
padded_X_train = padded_X[:n_train].astype('float32')
padded_X_test = padded_X[n_train:].astype('float32')
print('Estabilish model')
from keras.models import Model
# BUGFIX: dropped the unused `Merge` import -- Merge was removed in Keras 2,
# so importing it raises ImportError on any recent install (another way the
# script can behave differently across machines).
from keras.layers import Dense, Embedding, Convolution1D, concatenate, Flatten, Input, MaxPooling1D, Dropout
from keras.callbacks import TensorBoard

K.clear_session()

# Multi-kernel-size 1-D CNN over pretrained GloVe embeddings: three parallel
# conv branches (kernel sizes 9/6/3) are max-pooled, concatenated, flattened
# and fed to a softmax classifier.
x = Input(shape=(max_length,), dtype='float32')
embed = Embedding(input_dim=vocab_size, output_dim=100,
                  weights=[embedding_matrix], input_length=max_length)(x)

cnn1 = Convolution1D(128, 9, activation='relu', padding='same', strides=1)(embed)
cnn1 = MaxPooling1D(5)(cnn1)
cnn2 = Convolution1D(128, 6, activation='relu', padding='same', strides=1)(embed)
cnn2 = MaxPooling1D(5)(cnn2)
cnn3 = Convolution1D(128, 3, activation='relu', padding='same', strides=1)(embed)
cnn3 = MaxPooling1D(5)(cnn3)

cnn = concatenate([cnn1, cnn2, cnn3])
flat = Flatten()(cnn)
drop = Dropout(0.1)(flat)
# 5 output units because to_categorical on 1..4 labels yields 5 columns
# (column 0 is unused) -- presumably a 4-class dataset; TODO confirm.
y = Dense(5, activation='softmax')(drop)

model = Model(inputs=x, outputs=y)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# BUGFIX: histogram_freq expects an int epoch interval, not a bool.
tensorboard = TensorBoard(log_dir='./logs', write_graph=True,
                          write_grads=True, histogram_freq=1)

# NOTE(review): batch_size=10000 yields only ~12 weight updates per epoch on
# a ~120k-row training set; with 5 epochs the model can stay near its random
# initialization (25% = 1/4 classes) depending on library versions/hardware.
# Consider a much smaller batch size (e.g. 128) -- TODO confirm.
# BUGFIX: validation_data must be a tuple, not a list, on newer Keras.
model.fit(padded_X_train,
          train_y_ohe,
          epochs=5,
          batch_size=10000,
          verbose=1,
          callbacks=[tensorboard],
          validation_data=(padded_X_test, test_y_ohe))
# Dead code, converted from a bare triple-quoted string (a no-op expression)
# into real comments.  NOTE(review): the functional `Model` API has no
# `predict_classes` method (that is Sequential-only), so this snippet would
# raise AttributeError if re-enabled as written.  The equivalent working
# form would be:
#   pred0 = np.argmax(model.predict(padded_X, verbose=0), axis=1)
#   acc_train = np.sum(train_y == pred0, axis=0) / train_X.shape[0]