请问各路,谁能帮忙看一下我的SVM复现代码?
不知道是SMO的理解有问题,还是循环多了,或者是哪里参数没调整好,用乳腺癌的数据跑出来只有40%左右的准确率;训练速度也非常慢,差不多1分钟迭代30次左右。
不调SVM包,想知道对SVM的底层哪里有理解偏差
import numpy as np
from sklearn import datasets
from sklearn.metrics import accuracy_score, precision_score, recall_score, cohen_kappa_score, f1_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
class SupportVectorMachine:
def __init__(self, x_train, y_train):
self.x_train = x_train
self.y_train = y_train
def kernel(self, name, sample1, sample2, **kwargs):
if name == 'Gauss':
sigma = kwargs.get('sigma', 1)
delta = sample1 - sample2
norm2 = np.dot(delta.T, delta)
gauss_kernel = np.exp(-norm2 / (2 * sigma ** 2))
return gauss_kernel
elif name == 'polynomial':
p = kwargs.get('p', 2)
dot = np.dot(sample1.T, sample2) + 1
polynomial_kernel = dot ** p
return polynomial_kernel
elif name == 'linear':
linear_kernal = np.dot(sample1, sample2)
return linear_kernal
else:
return 'Please input Gauss or polynomial or linear'
def find_hyperplane(self, alpha):
label = self.y_train
train_set = self.x_train
m, n = train_set.shape
weight_sample = train_set * alpha
labeled_weight = weight_sample * label
w = np.sum(labeled_weight, axis=0).reshape(n, 1)
w_mult_x = np.dot(train_set, w)
pos = []; neg = []
for i in range(len(w_mult_x)):
if label[i] == 1:
pos.append(w_mult_x[i][0])
else:
neg.append(w_mult_x[i][0])
max_neg = np.min(neg)
min_pos = np.max(pos)
b = -(max_neg + min_pos) / 2
return w, b
def predict(self, sample, alpha):
train_set = self.x_train
label = self.y_train
m, n = train_set.shape
w, b = self.find_hyperplane(alpha)
predict = 0
for i in range(m):
emp = alpha[i] * label[i] * self.kernel('linear', sample, train_set[i], sigma=1)
predict += emp
predict += b
return predict
def find_alpha1(self, alpha, toler=0.01, C=1):
train_set = self.x_train
label = self.y_train
w, b = self.find_hyperplane(alpha)
find_target = False
for alpha1_index in range(len(alpha)):
if 0 < alpha[alpha1_index] < C:
emp = label[alpha1_index] * (np.dot(w.T, train_set[alpha1_index]) + b)
if not (1 - toler < emp < 1 + toler):
alpha1 = alpha[alpha1_index]
find_target = True
break
if find_target:
return alpha1, alpha1_index, find_target
else:
for alpha1_index in range(len(alpha)):
emp = label[alpha1_index] * (np.dot(w.T, train_set[alpha1_index]) + b)
if (alpha[alpha1_index] == 0 and emp < 1 + toler) or \
(alpha[alpha1_index] == C and emp > 1 - toler):
find_target = True
break
alpha1 = alpha[alpha1_index]
return alpha1, alpha1_index, find_target
def update_alpha(self, alpha, alpha1_index, alpha2_index, error, C):
error = np.array(error)
alpha1 = alpha[alpha1_index]
alpha2 = alpha[alpha2_index]
train_set = self.x_train
label = self.y_train
kernal_x1_x1 = self.kernel('linear', train_set[alpha1_index], train_set[alpha1_index], sigma=1)
kernal_x1_x2 = self.kernel('linear', train_set[alpha1_index], train_set[alpha2_index], sigma=1)
kernal_x2_x2 = self.kernel('linear', train_set[alpha2_index], train_set[alpha2_index], sigma=1)
alpha2_new_unc = alpha2 + (label[alpha2_index] * (error[alpha1_index] - error[alpha2_index]) / (kernal_x1_x1 - 2 * kernal_x1_x2 + kernal_x2_x2))
y1 = label[alpha1_index]; y2 = label[alpha2_index]
if y1 == y2:
L = max(0, alpha2 - alpha1)
H = min(C, C + alpha2 - alpha1)
else:
L = max(0, alpha2 + alpha1 - C)
H = min(C, alpha2 + alpha1)
if alpha2_new_unc > H:
alpha2_new = H
elif alpha2_new_unc < L:
alpha2_new = L
else:
alpha2_new = alpha2_new_unc
alpha1_new = alpha1 + label[alpha1_index] * label[alpha2_index] * (alpha2 - alpha2_new)
alpha[alpha1_index] = alpha1_new
alpha[alpha2_index] = alpha2_new
return alpha
def SMO_algorithm(self, iteration=100, C=1, sigma=1):
train_set = self.x_train
label = self.y_train
m, n = train_set.shape
alpha = np.zeros((m, 1)) # Initialize alpha to 0
for k in range(iteration):
print('迭代{}次'.format(k))
error = []
for i in range(m):
predict = self.predict(train_set[i], alpha)
error.append(predict - label[i])
alpha1, alpha1_index, find_target = self.find_alpha1(alpha, C=C)
if not find_target:
break
else:
error1 = error[alpha1_index]
if error1 >= 0:
min_error = np.min(error)
alpha2_index = error.index(min_error)
else:
max_error = np.max(error)
alpha2_index = error.index(max_error)
alpha = self.update_alpha(alpha, alpha1_index, alpha2_index, error, C)
return alpha
# Cross-validation to find the best C
def cross_validate_C(C_values, x_train, y_train, x_test, y_test, sigma=1):
best_C = None
best_score = 0
for C in C_values:
svm = SupportVectorMachine(x_train, y_train)
alpha = svm.SMO_algorithm(C=C, sigma=sigma)
w, b = svm.find_hyperplane(alpha)
y_predict = []
for i in range(x_test.shape[0]):
pred = svm.predict(x_test[i], alpha)
if pred > 0:
y_predict.append(1)
else:
y_predict.append(-1)
score = accuracy_score(y_predict, y_test)
if score > best_score:
best_score = score
best_C = C
return best_C
# Load data and preprocess
cancer = datasets.load_breast_cancer()
data = cancer["data"]
label = cancer.target.reshape(data.shape[0], 1)
for i in range(len(label)):
if label[i] == 0:
label[i] = -1
m, n = data.shape
scaler = MinMaxScaler()
data = scaler.fit_transform(data)
C_values = [0.001,0.1,1,10,100] # Possible C values to test
sigma = 1 # Sigma for the Gaussian kernel
folder = KFold(n_splits=10, shuffle=True, random_state=50)
best_C = 0
for train_index, test_index in folder.split(data):
x_train = data[train_index, :].astype('float64')
x_test = data[test_index, :].astype('float64')
y_train = label[train_index].astype('float64')
y_test = label[test_index].astype('float64')
best_C = cross_validate_C(C_values, x_train, y_train, x_test, y_test, sigma)
break # We only need to find the best C once
# Now use the best C to train the final model
svm = SupportVectorMachine(data, label)
alpha = svm.SMO_algorithm(C=best_C, sigma=sigma)
w, b = svm.find_hyperplane(alpha)
# Evaluate the final model
y_predict = []
for i in range(data.shape[0]):
pred = svm.predict(data[i], alpha)
if pred > 0:
y_predict.append(1)
else:
y_predict.append(-1)
accuracy = accuracy_score(y_predict, label)
precision = precision_score(y_predict, label)
recall = recall_score(y_predict, label)
f1 = f1_score(y_predict, label)
kappa = cohen_kappa_score(y_predict, label)
print('Best C is {}; 准确率为{};查全率为{};召回率为{};f1系数为{};kappa系数为{}'.format(best_C, accuracy, precision, recall, f1, kappa))