不使用 sklearn 实现的 KNN 算法,用于识别 MNIST 数据集,准确率只有百分之六十,如何进一步提高识别的准确率?
已经尝试过使用不同的 k 值,并对图片进行了归一化处理。
def load_mnist(data_dir='mnist_data'):
    """Load the MNIST training and test sets from raw IDX files.

    The four uncompressed files ``train-images-idx3-ubyte``,
    ``train-labels-idx1-ubyte``, ``t10k-images-idx3-ubyte`` and
    ``t10k-labels-idx1-ubyte`` are read from ``data_dir``.

    Args:
        data_dir: Directory containing the IDX files. Defaults to
            ``'mnist_data'``, the location that used to be hard-coded.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. The image arrays
        are ``uint8`` with shape ``(n_samples, 28, 28)``; the label arrays
        are ``uint8`` with shape ``(n_samples,)``.

    Raises:
        ValueError: If an image file does not contain a whole number of
            28x28 images, or if the number of images and labels differ.
    """
    import os

    def read_images(filename):
        # offset=16 skips the image-file header; the rest is raw pixels.
        pixels = np.fromfile(
            os.path.join(data_dir, filename), dtype=np.uint8, offset=16
        )
        # Infer the sample count from the file size instead of hard-coding
        # 60000 / 10000, so subsets of the data load as well.
        return pixels.reshape(-1, 28, 28)

    def read_labels(filename):
        # offset=8 skips the label-file header; one byte per label follows.
        return np.fromfile(
            os.path.join(data_dir, filename), dtype=np.uint8, offset=8
        )

    X_train = read_images('train-images-idx3-ubyte')
    y_train = read_labels('train-labels-idx1-ubyte')
    X_test = read_images('t10k-images-idx3-ubyte')
    y_test = read_labels('t10k-labels-idx1-ubyte')

    for split, images, labels in (
        ('train', X_train, y_train),
        ('test', X_test, y_test),
    ):
        if len(images) != len(labels):
            raise ValueError(
                f'{split} set has {len(images)} images '
                f'but {len(labels)} labels'
            )

    # The arrays used to be discarded; hand them back to the caller.
    return X_train, y_train, X_test, y_test
class Knn(object):
    """Brute-force k-nearest-neighbours classifier.

    Samples are flattened to vectors, every feature is divided by its
    largest absolute value in the training data, and each query is
    assigned the majority label among its ``k`` closest training samples
    under Euclidean distance.
    """

    def __init__(self, k=3):
        """Create a classifier that votes among ``k`` neighbours.

        Args:
            k: Number of nearest neighbours that vote. Must be at least 1.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError('k must be at least 1')
        self.k = k

    def fit(self, X, y):
        """Store the training data; all computation happens in ``predict``.

        Args:
            X: Training samples, one per entry along the first axis.
            y: Training labels, aligned with ``X``.
        """
        self.X = X
        self.y = y

    def predict(self, X):
        """Predict a label for every sample in ``X``.

        Args:
            X: Query samples with the same per-sample shape as the
                training data passed to ``fit``.

        Returns:
            A NumPy array with one predicted label per query sample. When
            several labels tie for the most votes, the tied label that
            occurs first among the neighbours (closest first) wins.
        """
        labels = np.asarray(self.y)
        k = self.k

        # Flatten each sample and work on float64 copies: the stored
        # training data stays untouched, and unsigned pixel values cannot
        # wrap around when they are subtracted.
        train = np.asarray(self.X)
        train = train.reshape(train.shape[0], -1).astype(np.float64)
        queries = np.asarray(X)
        queries = queries.reshape(queries.shape[0], -1).astype(np.float64)

        # Max-abs scaling fitted on the training data only. Features that
        # are zero everywhere keep a scale of 1 to avoid dividing by zero.
        scale = np.abs(train).max(axis=0)
        scale[scale == 0] = 1.0
        train /= scale
        queries /= scale

        # The progress bar is a convenience; fall back to plain iteration
        # when tqdm is not installed.
        try:
            from tqdm import tqdm as progress
        except ImportError:
            progress = iter

        predict_labels = []
        for query in progress(queries):
            diff = train - query  # broadcasting; no np.tile copy needed
            # Squared distances rank the same as true distances, so the
            # square root is skipped.
            sq_distances = (diff * diff).sum(axis=1)
            nearest = np.argsort(sq_distances, kind='stable')[:k]

            # Vote over the k nearest neighbours. The previous code indexed
            # the sorted neighbours with the query index instead of the
            # neighbour rank, so all k votes went to one wrong sample.
            votes = {}
            for label in labels[nearest]:
                votes[label] = votes.get(label, 0) + 1
            # max() returns the first maximal key in insertion order, so
            # ties go to the label seen closest to the query.
            predict_labels.append(max(votes, key=votes.get))

        return np.array(predict_labels)