用 softmax 做二分类时,预测正确率只有 0.2 左右;把预测结果中的 0 和 1 互换之后,才得到比较理想的结果。可能是哪里出了问题?

这个神经网络(5 层左右,隐藏层使用 ReLU)在输出层使用 sigmoid 时表现正常。我只是把输出层改成了 softmax,梯度下降时 cost 也在减小。



import numpy as np
from function_set.activate_functions import sigmoid, reLU, tanh, softmax  # eval()会用到,不删
from function_set.activate_differential import sigmoid_d, reLU_d, tanh_d  # eval()会用到,不删

class neural_network:
    # NOTE(review): the parameter list below appears truncated in this view
    # (only `a_functs` is visible and the closing parenthesis is missing).
    # The body reads funct_list, layer_num, dim_num_list, reg, lambda_reg,
    # beta_momentum, batch_size and keep_prob, so these are presumably
    # constructor parameters as well - confirm against the complete file.
    def __init__(self,
                 a_functs=("tanh", "sigmoid"),
        self.funct_list = funct_list  # list of activation-function names, one entry per layer
        self.layer_num = layer_num  # number of layers
        self.dim_num_list = dim_num_list  # number of nodes in each layer
        self.a_functs = a_functs  # activation function names
        self.paras = dict()  # stores the parameters W, b
        self.cache = dict()  # stores forward-propagation data for use in back-propagation
        self.reg = reg.upper()
        self.epsilon = 1e-8  # small constant guarding against division by zero or log(0)
        self.lambda_reg = lambda_reg  # lambda of the L2 regularization
        self.beta_momentum = beta_momentum  # hyper-parameter of momentum gradient descent
        self.iter_cnt = 0  # iteration counter, used by momentum and similar optimizations
        self.batch_size = batch_size  # mini-batch size
        self.keep_prob = keep_prob  # keep probability used by dropout

    def init(self, X, Y):
        if not self.dim_num_list:
            self.dim_num_list = np.random.randint(30, 31, self.layer_num + 1)  # 默认维度为30
            self.layer_num = len(self.dim_num_list) - 1
        if not self.funct_list:
            self.funct_list = list()
            for i in range(0, self.layer_num):
        self.dim_num_list[0] = X.shape[0]
        self.dim_num_list[self.layer_num] = Y.shape[0]
        for i in range(1, self.layer_num + 1):
            self.cache["v_dW" + str(i)] = np.zeros((self.dim_num_list[i], self.dim_num_list[i - 1]))
            self.cache["v_db" + str(i)] = np.zeros((self.dim_num_list[i], 1))

    def init_paras(self):
        for i in range(1, self.layer_num + 1):
            self.paras["W" + str(i)] = np.random.randn(self.dim_num_list[i], self.dim_num_list[i - 1])
            self.paras["b" + str(i)] = np.zeros(self.dim_num_list[i], float).reshape(-1, 1)

    def forward_propagate(self, X):
        self.cache["A0"] = X
        for i in range(1, self.layer_num + 1):
            self.cache["Z" + str(i)] = np.dot(self.paras["W" + str(i)], self.cache["A" + str(i - 1)])
            self.cache["A" + str(i)] = eval(self.funct_list[i])(self.cache["Z" + str(i)])
            if self.keep_prob < 1:  # dropout
                D = np.random.rand(self.cache["A" + str(i)].shape[0], self.cache["A" + str(i)].shape[1])
                D = (D < self.keep_prob) * 1
                self.cache["D" + str(i)] = D
                self.cache["A" + str(i)] = self.cache["A" + str(i)] * D / self.keep_prob

    def get_cost(self, Y):
        m = self.cache["A0"].shape[1]
        A = self.cache["A" + str(self.layer_num)]
        Z = self.cache["A" + str(self.layer_num)]
        cost1 = 0  # cost1为交叉熵
        loss_matrix = None
        if self.funct_list[self.layer_num] == "sigmoid":
            # loss_matrix = -Y * (np.maximum(Z, 0) - np.log(1 + np.exp(-np.abs(Z))))
            loss_matrix = np.maximum(Z, 0) - Z * Y + np.log(1 + np.exp(-np.abs(Z)))
        elif self.funct_list[self.layer_num] == "softmax":
            max_Z = np.max(Z, axis=0)
            loss_matrix = Y * (max_Z - Z + np.log(np.sum(np.exp(Z - max_Z), axis=0)))
        cost1 += (1 / m) * np.sum(np.sum(loss_matrix, axis=0), axis=0)
        cost2 = 0  # cost2 为正则项
        if self.reg == "L2":
            for i in range(1, self.layer_num + 1):
                cost2 += self.lambda_reg * np.sum(np.sum(self.paras["W" + str(i)] * self.paras["W" + str(i)], axis=0),
        cost = cost1 + cost2
        print("cost1 = ", cost1, "\tcost2=", cost2)
        return cost

    def backward_propagate(self, Y):
        A = self.cache["A" + str(self.layer_num)]
        m = A.shape[1]
        dA = (-1 / m) * Y * (1 / (A + self.epsilon))
        i = self.layer_num
        while (i > 0):
            function_name = self.funct_list[i]
            if (i == self.layer_num) and (function_name == "softmax" or function_name == "sigmoid"):
                dZ = (1 / m) * (A - Y)
                if self.keep_prob < 1:
                    dA = dA * self.cache["D" + str(i)] / self.keep_prob
                dZ = dA * eval(function_name + "_d")(self.cache["A" + str(i)], self.cache["Z" + str(i)])
            dW_reg = 2 * self.lambda_reg * self.paras["W" + str(i)]
            db_reg = 2 * self.lambda_reg * self.paras["b" + str(i)]
            self.cache["dW" + str(i)] = np.dot(dZ, self.cache["A" + str(i - 1)].T) + dW_reg
            self.cache["db" + str(i)] = np.sum(dZ, axis=1).reshape(-1, 1) + db_reg
            dA = np.dot(self.paras["W" + str(i)].T, dZ)
            i -= 1

    def update_parameters(self, learning_rate):
        for i in range(1, self.layer_num + 1):
            self.cache["v_dW" + str(i)] = self.beta_momentum * self.cache["v_dW" + str(i)] + (1 - self.beta_momentum) * \
                                          self.cache["dW" + str(i)]
            self.cache["v_db" + str(i)] = self.beta_momentum * self.cache["v_db" + str(i)] + (1 - self.beta_momentum) * \
                                          self.cache["db" + str(i)]
            self.cache["v_c_dW" + str(i)] = self.cache["v_dW" + str(i)] / (1 - self.beta_momentum ** self.iter_cnt)
            self.cache["v_c_db" + str(i)] = self.cache["v_db" + str(i)] / (1 - self.beta_momentum ** self.iter_cnt)
            self.paras["W" + str(i)] = self.paras["W" + str(i)] - learning_rate * self.cache["v_c_dW" + str(i)]
            self.paras["b" + str(i)] = self.paras["b" + str(i)] - learning_rate * self.cache["v_c_db" + str(i)]

    def fit(self, X, Y, learning_rate=0.5, iter_num=1000):
        self.init(X, Y)
        batch_generator = self.get_batch(X, Y)
        for X, Y in batch_generator:
            self.iter_cnt = 0
            for epoch in range(iter_num):
                self.iter_cnt += 1
                cost = self.get_cost(Y)

    def predict_probability(self, X):
        return self.cache["A" + str(self.layer_num)]

    def predict(self, X):
        probabilty = self.predict_probability(X)
        return self.map_to_int(probabilty)

    def get_batch(self, X, Y):  # mini-batch生成器
        m = X.shape[1]
        if self.batch_size == -1:
            self.batch_size = m
        batch_num = (m + self.batch_size - 1) // self.batch_size
        index_list = np.random.permutation(m)
        start = 0
        for i in range(batch_num):
            end = min(start + self.batch_size, m)
            yield X[:, index_list[start:end]], Y[:, index_list[start:end]]
            start = end

    def map_to_int(self, A):  # 概率转预测结果
        result = None  #
        if self.funct_list[self.layer_num] == "softmax":
            result = np.argmax(A, axis=0)
        elif self.funct_list[self.layer_num] == "sigmoid":
            result = (A >= 0.5) * 1
            result = result[0]
        return result

    def get_one_hot(self, y, class_num):  # convert integer classes to one-hot encoding
        """Return a ``(class_num, len(y))`` one-hot matrix for the labels ``y``.

        Column ``i`` contains a single 1 in row ``y[i]``; consequently row 0
        of the result marks the examples of class 0.
        """
        encoded = np.zeros((class_num, len(y)))
        for column, label in enumerate(y):
            encoded[label][column] = 1
        return encoded


def test(X_train, Y_train, X_test, Y_test):
    """Train a ReLU/softmax network and print its accuracy on the test set.

    Args:
        X_train: Training inputs, one column per example.
        Y_train: Training targets, one column per example (one-hot for softmax).
        X_test: Test inputs, one column per example.
        Y_test: Test targets; either one-hot columns or a single row of
            integer labels.
    """
    L = [2, 5, 5, 5, 5, 1]
    nn = neural_network(dim_num_list=L, a_functs=("reLU", "softmax"), batch_size=200)
    nn.fit(X_train, Y_train, learning_rate=0.1, iter_num=1000)
    result = nn.predict(X_test)
    print("result = ", result)
    Y_test = np.asarray(Y_test)
    if Y_test.shape[0] > 1:
        # One-hot targets: the true class is the row index of the 1. Row 0 on
        # its own is the indicator of class 0, i.e. the inverted label of a
        # two-class problem, which makes a good model look ~20% accurate.
        y = np.argmax(Y_test, axis=0)
    else:
        y = Y_test[0]
    cnt = sum(1 for predicted, actual in zip(result, y) if predicted == actual)
    print(cnt / len(y))


import numpy as np

def sigmoid(Z):
    """Numerically stable element-wise logistic function for an ndarray ``Z``.

    Non-negative entries use ``1 / (1 + exp(-z))`` and negative entries use
    ``exp(z) / (exp(z) + 1)``, so ``exp`` is never evaluated on a large
    positive argument.
    """
    non_negative = (Z >= 0)
    negative = Z < 0
    # Multiplying by the mask feeds exp(0) to the entries of the other branch.
    upper_branch = 1 / (1 + np.exp(-Z * non_negative))
    upper_branch[~non_negative] = 0
    exp_negative = np.exp(Z * negative)
    lower_branch = exp_negative / (exp_negative + 1)
    lower_branch[~negative] = 0
    return lower_branch + upper_branch

def reLU(Z):
    """Rectified linear unit: element-wise maximum of 0 and ``Z``."""
    floor = 0
    return np.maximum(floor, Z)

def tanh(Z):
    """Element-wise hyperbolic tangent of ``Z`` (thin wrapper around ``np.tanh``)."""
    activated = np.tanh(Z)
    return activated

def softmax(Z):
    """Softmax along axis 0, i.e. each column of a 2-D ``Z`` is normalised to sum to 1.

    The maximum along axis 0 is subtracted before exponentiating so that
    large inputs do not overflow.
    """
    shifted_exp = np.exp(Z - np.max(Z, axis=0))
    return shifted_exp / np.sum(shifted_exp, axis=0)


import numpy as np

def sigmoid_d(A, Z):
    """Derivative of the sigmoid expressed through its output ``A``; ``Z`` is unused."""
    complement = 1 - A
    return A * complement

def reLU_d(A, Z):
    """Derivative of ReLU: 1 where ``Z`` is strictly positive, otherwise 0; ``A`` is unused."""
    is_active = Z > 0
    return is_active * 1

def tanh_d(A, Z):
    """Derivative of tanh expressed through its output ``A``; ``Z`` is unused."""
    squared = A * A
    return 1 - squared

def softmax_d(A, Z):
    """Placeholder: softmax has no element-wise derivative.

    Each softmax output depends on every input in its column, so the
    derivative is a Jacobian matrix and cannot be applied as
    ``dA * softmax_d(A, Z)`` like the other ``*_d`` helpers in this module.
    With a cross-entropy loss the gradient with respect to Z is simply
    ``A - Y``; compute it in that fused form instead of calling this function.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError(
        "softmax has no element-wise derivative; with cross-entropy use dZ = A - Y"
    )


from nn import neural_network, test
from deep_learning_course.course_1_3.planar_utils import load_planar_dataset

# Generate the training set and scale the features by 1/4.
X_train, Y_train = load_planar_dataset()
X_train /= 4
# plt.scatter(X_train[0, :], X_train[1, :], c=Y_train, s=40, cmap=plt.cm.Spectral)  # draw the scatter plot
# plt.show()
# The test set comes from a second call to the generator and is scaled the
# same way as the training set.
X_test, Y_test = load_planar_dataset()
X_test /= 4
# Convert the integer labels (row 0) into one-hot columns with 2 classes.
# NOTE: after this conversion row 0 of Y is the indicator of class 0, not the
# integer label; the class index is recovered with argmax over axis 0.
Y_train = neural_network().get_one_hot(Y_train[0], 2)
Y_test = neural_network().get_one_hot(Y_test[0], 2)

# Train on the training set and print the accuracy on the test set.
test(X_train, Y_train, X_test, Y_test)


import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model

def plot_decision_boundary(model, X, y):
    """Draw the decision regions of ``model`` together with the data points.

    Args:
        model: Callable that takes an array with one grid point per row
            (two columns: first feature, second feature) and returns one
            prediction per row.
        X: Data matrix of shape (2, m): row 0 is the first feature and row 1
            the second feature of every example.
        y: Labels used to colour the points, either of length m or as a
            (1, m) / (m, 1) array.
    """
    # Set min and max values and give it some padding
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    # scatter() needs one colour value per point; a (1, m) or (m, 1) label
    # array is flattened because current matplotlib rejects it as-is.
    labels = np.asarray(y)
    if labels.ndim == 2 and 1 in labels.shape:
        labels = labels.ravel()
    plt.scatter(X[0, :], X[1, :], c=labels, cmap=plt.cm.Spectral)

def sigmoid(Z):
    """Numerically stable element-wise logistic function for an ndarray ``Z``.

    Non-negative entries use ``1 / (1 + exp(-z))`` and negative entries use
    ``exp(z) / (exp(z) + 1)``, so ``exp`` is never evaluated on a large
    positive argument.
    """
    non_negative = (Z >= 0)
    negative = Z < 0
    # Multiplying by the mask feeds exp(0) to the entries of the other branch.
    upper_branch = 1 / (1 + np.exp(-Z * non_negative))
    upper_branch[~non_negative] = 0
    exp_negative = np.exp(Z * negative)
    lower_branch = exp_negative / (exp_negative + 1)
    lower_branch[~negative] = 0
    return lower_branch + upper_branch

def load_planar_dataset():
    """Generate the two-class "flower" data set.

    Returns:
        ``(X, Y)`` where ``X`` has shape (2, 400) with one example per column
        and ``Y`` has shape (1, 400) with uint8 labels: 0 for the first 200
        examples and 1 for the remaining 200.
    """
    # np.random.seed(1)  (disabled in the source, so the generator is not seeded here)
    m = 400  # number of examples
    N = int(m / 2)  # number of points per class
    D = 2  # dimensionality
    a = 4  # maximum ray of the flower

    X = np.zeros((m, D))  # data matrix where each row is a single example
    Y = np.zeros((m, 1), dtype='uint8')  # labels vector (0 for red, 1 for blue)

    for j in range(2):
        rows = slice(N * j, N * (j + 1))
        theta = np.linspace(j * 3.12, (j + 1) * 3.12, N) + np.random.randn(N) * 0.2
        radius = a * np.sin(4 * theta) + np.random.randn(N) * 0.2
        X[rows] = np.column_stack((radius * np.sin(theta), radius * np.cos(theta)))
        Y[rows] = j

    # Callers expect one example per column.
    return X.T, Y.T

def load_extra_datasets():
    """Build five additional toy data sets with 200 samples each.

    Returns:
        A tuple ``(noisy_circles, noisy_moons, blobs, gaussian_quantiles,
        no_structure)``. The first four are the results of the corresponding
        ``sklearn.datasets.make_*`` calls; ``no_structure`` is a pair of
        uniform random (200, 2) arrays.
    """
    N = 200
    datasets = sklearn.datasets
    noisy_circles = datasets.make_circles(n_samples=N, factor=.5, noise=.3)
    noisy_moons = datasets.make_moons(n_samples=N, noise=.2)
    blobs = datasets.make_blobs(n_samples=N, random_state=5, n_features=2, centers=6)
    gaussian_quantiles = datasets.make_gaussian_quantiles(
        mean=None,
        cov=0.5,
        n_samples=N,
        n_features=2,
        n_classes=2,
        shuffle=True,
        random_state=None,
    )
    first_sample = np.random.rand(N, 2)
    second_sample = np.random.rand(N, 2)
    no_structure = first_sample, second_sample

    return noisy_circles, noisy_moons, blobs, gaussian_quantiles, no_structure

