I'm doing binary classification with a softmax output layer and the prediction accuracy comes out at around 0.2. If I swap the 0s and 1s in the predictions, the result looks like what I would expect. Where might the problem be?
The network (about 5 layers, ReLU hidden layers) behaves normally when the output layer uses sigmoid. All I changed was the output layer to softmax, and the cost still decreases during gradient descent.
I also tried swapping the 0/1 labels in the raw data; it made no difference, the accuracy is still around 0.2.
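As a sanity check on the math (a standalone sketch, not my actual network code): for two classes, a softmax over logits [z0, z1] makes the same decision as a sigmoid applied to z1 - z0, so switching the output layer from sigmoid to softmax should not, by itself, flip the predictions.

import numpy as np

def softmax_cols(Z):                            # column-wise softmax, same formula as my softmax()
    Z = Z - Z.max(axis=0)
    return np.exp(Z) / np.exp(Z).sum(axis=0)

Z = np.random.randn(2, 5)                       # made-up logits: 2 classes, 5 samples
p_softmax = softmax_cols(Z)[1]                  # P(class 1) from the two-unit softmax
p_sigmoid = 1 / (1 + np.exp(-(Z[1] - Z[0])))    # sigmoid of the logit difference
print(np.allclose(p_softmax, p_sigmoid))        # True
print(np.all(np.argmax(Z, axis=0) == (p_sigmoid >= 0.5)))  # True: same predicted classes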
The code is a bit long. Below is the class code:
import numpy as np
from function_set.activate_functions import sigmoid, reLU, tanh, softmax  # used via eval(), do not remove
from function_set.activate_differential import sigmoid_d, reLU_d, tanh_d  # used via eval(), do not remove
class neural_network:
    def __init__(self,
                 layer_num=3,
                 dim_num_list=None,
                 funct_list=None,
                 a_functs=("tanh", "sigmoid"),
                 reg="L2",
                 lambda_reg=0.0,
                 beta_momentum=0.9,
                 batch_size=-1,
                 keep_prob=1.0
                 ):
        self.funct_list = funct_list        # activation function name for each layer
        self.layer_num = layer_num          # number of layers
        self.dim_num_list = dim_num_list    # number of units in each layer
        self.a_functs = a_functs            # (hidden, output) activation function names
        self.paras = dict()                 # parameters W, b
        self.cache = dict()                 # forward-pass values cached for backpropagation
        self.reg = reg.upper()
        self.epsilon = 1e-8                 # small constant to avoid division by zero / log(0)
        self.lambda_reg = lambda_reg        # lambda for L2 regularization
        self.beta_momentum = beta_momentum  # momentum hyperparameter
        self.iter_cnt = 0                   # iteration counter, used for momentum bias correction
        self.batch_size = batch_size        # mini-batch size
        self.keep_prob = keep_prob          # keep probability for dropout
    def init(self, X, Y):
        if not self.dim_num_list:
            self.dim_num_list = np.random.randint(30, 31, self.layer_num + 1)  # default: 30 units per layer
        else:
            self.layer_num = len(self.dim_num_list) - 1
        if not self.funct_list:
            self.funct_list = list()
            for i in range(0, self.layer_num):
                self.funct_list.append(self.a_functs[0])
            self.funct_list.append(self.a_functs[1])
        self.dim_num_list[0] = X.shape[0]
        self.dim_num_list[self.layer_num] = Y.shape[0]
        for i in range(1, self.layer_num + 1):
            self.cache["v_dW" + str(i)] = np.zeros((self.dim_num_list[i], self.dim_num_list[i - 1]))
            self.cache["v_db" + str(i)] = np.zeros((self.dim_num_list[i], 1))
        self.init_paras()
    def init_paras(self):
        for i in range(1, self.layer_num + 1):
            self.paras["W" + str(i)] = np.random.randn(self.dim_num_list[i], self.dim_num_list[i - 1])
            self.paras["b" + str(i)] = np.zeros(self.dim_num_list[i], float).reshape(-1, 1)
    def forward_propagate(self, X):
        self.cache["A0"] = X
        for i in range(1, self.layer_num + 1):
            self.cache["Z" + str(i)] = np.dot(self.paras["W" + str(i)], self.cache["A" + str(i - 1)])
            self.cache["A" + str(i)] = eval(self.funct_list[i])(self.cache["Z" + str(i)])
            if self.keep_prob < 1:  # inverted dropout
                D = np.random.rand(self.cache["A" + str(i)].shape[0], self.cache["A" + str(i)].shape[1])
                D = (D < self.keep_prob) * 1
                self.cache["D" + str(i)] = D
                self.cache["A" + str(i)] = self.cache["A" + str(i)] * D / self.keep_prob
    def get_cost(self, Y):
        m = self.cache["A0"].shape[1]
        A = self.cache["A" + str(self.layer_num)]
        Z = self.cache["A" + str(self.layer_num)]
        cost1 = 0  # cost1 is the cross-entropy term
        loss_matrix = None
        if self.funct_list[self.layer_num] == "sigmoid":
            # loss_matrix = -Y * (np.maximum(Z, 0) - np.log(1 + np.exp(-np.abs(Z))))
            loss_matrix = np.maximum(Z, 0) - Z * Y + np.log(1 + np.exp(-np.abs(Z)))
        elif self.funct_list[self.layer_num] == "softmax":
            max_Z = np.max(Z, axis=0)
            loss_matrix = Y * (max_Z - Z + np.log(np.sum(np.exp(Z - max_Z), axis=0)))
        cost1 += (1 / m) * np.sum(np.sum(loss_matrix, axis=0), axis=0)
        cost2 = 0  # cost2 is the regularization term
        if self.reg == "L2":
            for i in range(1, self.layer_num + 1):
                cost2 += self.lambda_reg * np.sum(np.sum(self.paras["W" + str(i)] * self.paras["W" + str(i)], axis=0),
                                                  axis=0)
        else:
            pass
        cost = cost1 + cost2
        print("cost1 = ", cost1, "\tcost2=", cost2)
        return cost
    def backward_propagate(self, Y):
        A = self.cache["A" + str(self.layer_num)]
        m = A.shape[1]
        dA = (-1 / m) * Y * (1 / (A + self.epsilon))
        i = self.layer_num
        while i > 0:
            function_name = self.funct_list[i]
            if (i == self.layer_num) and (function_name == "softmax" or function_name == "sigmoid"):
                dZ = (1 / m) * (A - Y)  # output layer: combined cross-entropy + softmax/sigmoid shortcut
            else:
                if self.keep_prob < 1:
                    dA = dA * self.cache["D" + str(i)] / self.keep_prob
                dZ = dA * eval(function_name + "_d")(self.cache["A" + str(i)], self.cache["Z" + str(i)])
            dW_reg = 2 * self.lambda_reg * self.paras["W" + str(i)]
            db_reg = 2 * self.lambda_reg * self.paras["b" + str(i)]
            self.cache["dW" + str(i)] = np.dot(dZ, self.cache["A" + str(i - 1)].T) + dW_reg
            self.cache["db" + str(i)] = np.sum(dZ, axis=1).reshape(-1, 1) + db_reg
            dA = np.dot(self.paras["W" + str(i)].T, dZ)
            i -= 1
    def update_parameters(self, learning_rate):
        for i in range(1, self.layer_num + 1):
            # momentum update with bias correction
            self.cache["v_dW" + str(i)] = self.beta_momentum * self.cache["v_dW" + str(i)] + \
                (1 - self.beta_momentum) * self.cache["dW" + str(i)]
            self.cache["v_db" + str(i)] = self.beta_momentum * self.cache["v_db" + str(i)] + \
                (1 - self.beta_momentum) * self.cache["db" + str(i)]
            self.cache["v_c_dW" + str(i)] = self.cache["v_dW" + str(i)] / (1 - self.beta_momentum ** self.iter_cnt)
            self.cache["v_c_db" + str(i)] = self.cache["v_db" + str(i)] / (1 - self.beta_momentum ** self.iter_cnt)
            self.paras["W" + str(i)] = self.paras["W" + str(i)] - learning_rate * self.cache["v_c_dW" + str(i)]
            self.paras["b" + str(i)] = self.paras["b" + str(i)] - learning_rate * self.cache["v_c_db" + str(i)]
    def fit(self, X, Y, learning_rate=0.5, iter_num=1000):
        self.init(X, Y)
        batch_generator = self.get_batch(X, Y)
        for X, Y in batch_generator:
            self.iter_cnt = 0
            for epoch in range(iter_num):
                self.iter_cnt += 1
                self.forward_propagate(X)
                cost = self.get_cost(Y)
                self.backward_propagate(Y)
                self.update_parameters(learning_rate)
    def predict_probability(self, X):
        self.forward_propagate(X)
        return self.cache["A" + str(self.layer_num)]
    def predict(self, X):
        probability = self.predict_probability(X)
        return self.map_to_int(probability)
    def get_batch(self, X, Y):  # mini-batch generator
        m = X.shape[1]
        if self.batch_size == -1:
            self.batch_size = m
        batch_num = (m + self.batch_size - 1) // self.batch_size
        index_list = np.random.permutation(m)
        start = 0
        for i in range(batch_num):
            end = min(start + self.batch_size, m)
            yield X[:, index_list[start:end]], Y[:, index_list[start:end]]
            start = end
    def map_to_int(self, A):  # map output probabilities to class predictions
        result = None
        if self.funct_list[self.layer_num] == "softmax":
            result = np.argmax(A, axis=0)
        elif self.funct_list[self.layer_num] == "sigmoid":
            result = (A >= 0.5) * 1
            result = result[0]
        return result
    def get_one_hot(self, y, class_num):  # convert integer class labels to one-hot encoding
        n = class_num
        m = len(y)
        result = np.zeros((n, m))
        for i in range(m):
            result[y[i]][i] = 1
        return result
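For reference, this is the shape convention I'm relying on at the output (a standalone sketch with made-up numbers, not taken from my data):

import numpy as np

A_softmax = np.array([[0.9, 0.2, 0.4],     # row 0: P(class 0) for each sample
                      [0.1, 0.8, 0.6]])    # row 1: P(class 1) for each sample
print(np.argmax(A_softmax, axis=0))        # [0 1 1] -> class indices, shape (m,)

A_sigmoid = np.array([[0.1, 0.8, 0.6]])    # shape (1, m): P(class 1) per sample
print(((A_sigmoid >= 0.5) * 1)[0])         # [0 1 1] -> same convention after taking row 0

Y = np.array([[1, 0, 0],                   # row 0 of a one-hot matrix marks class 0
              [0, 1, 1]])                  # row 1 marks class 1
print(np.argmax(Y, axis=0))                # [0 1 1] -> integer labels recovered from one-hot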
Below is the test function:
def test(X_train, Y_train, X_test, Y_test):
    L = [2, 5, 5, 5, 5, 1]
    nn = neural_network(dim_num_list=L, a_functs=("reLU", "softmax"), batch_size=200)
    nn.fit(X_train, Y_train, learning_rate=0.1, iter_num=1000)
    result = nn.predict(X_test)
    print("result = ", result)
    y = Y_test[0]
    cnt = 0
    for e in range(len(y)):
        if result[e] == y[e]:
            cnt += 1
    print(cnt / len(y))
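(The accuracy count could equally be written in vectorized form; a tiny self-contained sketch with made-up arrays:)

import numpy as np

result = np.array([0, 1, 1, 0])    # hypothetical predictions
y = np.array([0, 1, 0, 0])         # hypothetical labels
print(np.mean(result == y))        # 0.75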
Below is the activation function code:
import numpy as np
def sigmoid(Z):
    positive_mask = (Z >= 0)
    negative_mask = Z < 0
    result_positive = 1 / (1 + np.exp(-Z * positive_mask))
    result_positive[~positive_mask] = 0
    result_negative = np.exp(Z * negative_mask) / (np.exp(Z * negative_mask) + 1)
    result_negative[~negative_mask] = 0
    result = result_negative + result_positive
    return result
def reLU(Z):
    return np.maximum(0, Z)
def tanh(Z):
    return np.tanh(Z)
def softmax(Z):
    max_Z = np.max(Z, axis=0)
    return np.exp(Z - max_Z) / np.sum(np.exp(Z - max_Z), axis=0)
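A quick standalone check of these activations (assuming the functions above are in scope, e.g. run in the same file):

import numpy as np

Z = np.array([[-3.0, 0.0, 2.5]])
print(np.allclose(sigmoid(Z), 1 / (1 + np.exp(-Z))))   # True: matches the naive formula for moderate inputs
print(sigmoid(np.array([[-1000.0]])))                  # [[0.]] with no overflow
Z = np.random.randn(3, 4)
print(np.allclose(softmax(Z).sum(axis=0), 1.0))        # True: each softmax column sums to 1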
Below is the derivative code for the activation functions. Since backpropagation never differentiates softmax on its own (the output layer uses the combined softmax-plus-cross-entropy gradient), softmax_d is left unimplemented; see the gradient check after this block.
import numpy as np
def sigmoid_d(A, Z):
    return A * (1 - A)
def reLU_d(A, Z):
    result = (Z > 0) * 1
    return result
def tanh_d(A, Z):
    return 1 - A * A
def softmax_d(A, Z):
    pass
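The gradient check mentioned above: for a softmax output trained with cross-entropy, the gradient with respect to the logits collapses to A - Y, which is why softmax_d is never called. A standalone finite-difference sketch (made-up logits and a one-hot label):

import numpy as np

def softmax_col(z):
    z = z - z.max()
    return np.exp(z) / np.exp(z).sum()

def cross_entropy(z, y):
    return -np.sum(y * np.log(softmax_col(z)))

z = np.array([1.2, -0.7, 0.3])     # made-up logits for a single sample
y = np.array([0.0, 1.0, 0.0])      # made-up one-hot label

analytic = softmax_col(z) - y      # the (A - Y) shortcut used in backward_propagate (per sample)
eps = 1e-6
numeric = np.array([(cross_entropy(z + eps * e, y) -
                     cross_entropy(z - eps * e, y)) / (2 * eps) for e in np.eye(3)])
print(np.allclose(analytic, numeric))   # True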
Below is the main script:
from nn import neural_network, test
from deep_learning_course.course_1_3.planar_utils import load_planar_dataset
X_train, Y_train = load_planar_dataset()
X_train /= 4
# plt.scatter(X_train[0, :], X_train[1, :], c=Y_train, s=40, cmap=plt.cm.Spectral)  # scatter plot of the data
# plt.show()
X_test, Y_test = load_planar_dataset()
X_test /= 4
Y_train = neural_network().get_one_hot(Y_train[0], 2)
Y_test = neural_network().get_one_hot(Y_test[0], 2)
test(X_train, Y_train, X_test, Y_test)
Below is the content of the planar_utils file:
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model
def plot_decision_boundary(model, X, y):
    # Set min and max values and give it some padding
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=y, cmap=plt.cm.Spectral)
def sigmoid(Z):
    positive_mask = (Z >= 0)
    negative_mask = Z < 0
    result_positive = 1 / (1 + np.exp(-Z * positive_mask))
    result_positive[~positive_mask] = 0
    result_negative = np.exp(Z * negative_mask) / (np.exp(Z * negative_mask) + 1)
    result_negative[~negative_mask] = 0
    result = result_negative + result_positive
    return result
def load_planar_dataset():
    # np.random.seed(1)
    m = 400  # number of examples
    N = int(m / 2)  # number of points per class
    D = 2  # dimensionality
    X = np.zeros((m, D))  # data matrix where each row is a single example
    Y = np.zeros((m, 1), dtype='uint8')  # labels vector (0 for red, 1 for blue)
    a = 4  # maximum ray of the flower
    for j in range(2):
        ix = range(N * j, N * (j + 1))
        t = np.linspace(j * 3.12, (j + 1) * 3.12, N) + np.random.randn(N) * 0.2  # theta
        r = a * np.sin(4 * t) + np.random.randn(N) * 0.2  # radius
        X[ix] = np.c_[r * np.sin(t), r * np.cos(t)]
        Y[ix] = j
    X = X.T
    Y = Y.T
    return X, Y
def load_extra_datasets():
    N = 200
    noisy_circles = sklearn.datasets.make_circles(n_samples=N, factor=.5, noise=.3)
    noisy_moons = sklearn.datasets.make_moons(n_samples=N, noise=.2)
    blobs = sklearn.datasets.make_blobs(n_samples=N, random_state=5, n_features=2, centers=6)
    gaussian_quantiles = sklearn.datasets.make_gaussian_quantiles(mean=None, cov=0.5, n_samples=N, n_features=2, n_classes=2, shuffle=True, random_state=None)
    no_structure = np.random.rand(N, 2), np.random.rand(N, 2)
    return noisy_circles, noisy_moons, blobs, gaussian_quantiles, no_structure
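For completeness, the decision boundary of the trained network could be drawn with plot_decision_boundary. A hypothetical usage sketch: it assumes test() is modified to return the trained nn, and that X_train / Y_train are the arrays prepared in the main script:

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical: `nn` is the trained network, Y_train is the one-hot matrix from the main script.
labels = np.argmax(Y_train, axis=0)                 # back from one-hot to integer labels for coloring
plot_decision_boundary(lambda x: nn.predict(x.T),   # helper passes (n_points, 2); predict expects (2, m)
                       X_train, labels)
plt.title("Decision boundary")
plt.show()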