Single-Hidden-Layer Multi-Class Network
A single-hidden-layer multi-class neural network implemented in NumPy, with hyperparameters set via command-line arguments.

This comes from a homework assignment, and I hit a few pitfalls while implementing it, mostly in the training loop. The biggest one: I did not implement mini-batching, so the "batch" dimension has to be handled with an extra loop over individual examples. The other one is the shuffle function: do not assign its return value back to the variables you pass in. In other words, each epoch should permute the fixed original data, not the already-shuffled data from the previous epoch. This bug is very hard to notice.
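To make the shuffle pitfall concrete, here is a minimal sketch (names like nn, n_epochs, and shuffle refer to the code further below; the "wrong" version is the pitfall described above):

# Wrong: reshuffles the output of the previous epoch, so the ordering no
# longer matches what shuffle(original data, seed=epoch) is supposed to give.
for epoch in range(n_epochs):
    X_tr, y_tr = shuffle(X_tr, y_tr, epoch)   # overwrites the original data
    ...

# Right: always permute the fixed original data and keep the shuffled copy
# in separate per-epoch variables.
for epoch in range(n_epochs):
    X_train, y_train = shuffle(X_tr, y_tr, epoch)
    for x, y in zip(X_train, y_train):
        y_hat = nn.forward(x)
        nn.backward(y, y_hat)
        nn.step()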
The full code is below:
import numpy as np
import matplotlib.pyplot as plt
import argparse
import logging
from typing import Callable

# This takes care of command line argument parsing for you!
# To access a specific argument, simply access args.<argument name>.
parser = argparse.ArgumentParser()
parser.add_argument('train_input', type=str,
                    help='path to training input .csv file')
parser.add_argument('validation_input', type=str,
                    help='path to validation input .csv file')
parser.add_argument('train_out', type=str,
                    help='path to store prediction on training data')
parser.add_argument('validation_out', type=str,
                    help='path to store prediction on validation data')
parser.add_argument('metrics_out', type=str,
                    help='path to store training and testing metrics')
parser.add_argument('num_epoch', type=int,
                    help='number of training epochs')
parser.add_argument('hidden_units', type=int,
                    help='number of hidden units')
parser.add_argument('init_flag', type=int, choices=[1, 2],
                    help='weight initialization functions, 1: random')
parser.add_argument('learning_rate', type=float,
                    help='learning rate')
parser.add_argument('--debug', type=bool, default=False,
                    help='set to True to show logging')
def args2data(args):
    '''
    No need to modify this function!
    Parse command line arguments, create train/test data and labels.
    :return:
    X_tr: train data *without label column and without bias folded in* (numpy array)
    y_tr: train label (numpy array)
    X_te: test data *without label column and without bias folded in* (numpy array)
    y_te: test label (numpy array)
    out_tr: predicted output for train data (file)
    out_te: predicted output for test data (file)
    out_metrics: output for train and test error (file)
    n_epochs: number of train epochs
    n_hid: number of hidden units
    init_flag: weight initialize flag -- 1 means random, 2 means zero
    lr: learning rate
    '''
    # Get data from arguments
    out_tr = args.train_out
    out_te = args.validation_out
    out_metrics = args.metrics_out
    n_epochs = args.num_epoch
    n_hid = args.hidden_units
    init_flag = args.init_flag
    lr = args.learning_rate

    X_tr = np.loadtxt(args.train_input, delimiter=',')
    y_tr = X_tr[:, 0].astype(int)
    X_tr = X_tr[:, 1:]  # cut off label column

    X_te = np.loadtxt(args.validation_input, delimiter=',')
    y_te = X_te[:, 0].astype(int)
    X_te = X_te[:, 1:]  # cut off label column

    return (X_tr, y_tr, X_te, y_te, out_tr, out_te, out_metrics,
            n_epochs, n_hid, init_flag, lr)
def shuffle(X, y, epoch):
    '''
    DO NOT modify this function.
    Permute the training data for SGD.
    :param X: The original input data in the order of the file.
    :param y: The original labels in the order of the file.
    :param epoch: The epoch number (0-indexed).
    :return: Permuted X and y training data for the epoch.
    '''
    np.random.seed(epoch)
    N = len(y)
    ordering = np.random.permutation(N)
    return X[ordering], y[ordering]
def random_init(shape):
    '''
    Randomly initialize a numpy array of the specified shape
    :param shape: list or tuple of shapes
    :return: initialized weights
    '''
    M, D = shape
    np.random.seed(M * D)  # Don't change this line!

    # TODO: create the random matrix here!
    # Hint: numpy might have some useful function for this
    W = np.random.uniform(-0.1, 0.1, size=(M, D))
    return W


def zero_init(shape):
    '''
    Do not modify this function.
    Initialize a numpy array of the specified shape with zero
    :param shape: list or tuple of shapes
    :return: initialized weights
    '''
    return np.zeros(shape=shape)
def softmax(z: np.ndarray) -> np.ndarray:
    '''
    Implement softmax function.
    :param z: input logits of shape (num_classes,)
    :return: softmax output of shape (num_classes,)
    '''
    shiftz = z - np.max(z)
    expz = np.exp(shiftz)
    return expz / np.sum(expz)
def cross_entropy(y: np.ndarray, y_hat: np.ndarray) -> np.ndarray:
    '''
    Compute cross entropy loss.
    :param y: label (a number or an array containing a single element)
    :param y_hat: prediction with shape (num_classes,)
    :return: cross entropy loss
    '''
    if isinstance(y, np.ndarray):
        y = y[0]
    return -np.log(y_hat[y])


def d_softmax_cross_entropy(y: np.ndarray, y_hat: np.ndarray) -> np.ndarray:
    '''
    Compute gradient of loss w.r.t. ** softmax input **.
    Note that here instead of calculating the gradient w.r.t. the softmax
    probabilities, we are directly computing gradient w.r.t. the softmax input.
    Try deriving the gradient yourself (see Question 1.2(b) on the written),
    and you'll see why we want to calculate this in a single step.
    :param y: label (a number or an array containing a single element)
    :param y_hat: predicted softmax probability with shape (num_classes,)
    :return: gradient with shape (num_classes,)
    '''
    if isinstance(y, np.ndarray):
        y = y[0]
    y_hat[y] -= 1
    return y_hat
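# Why the combined gradient above is so simple: with y_hat = softmax(z) and
# L = -log(y_hat[y]), the chain rule gives dL/dz = y_hat - e_y, where e_y is
# the one-hot vector of the true class -- i.e. exactly y_hat with 1 subtracted
# at index y. Computing this in one step avoids forming the full softmax
# Jacobian.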
class Sigmoid(object):
    def __init__(self):
        '''
        Initialize state for sigmoid activation layer
        '''
        # Create cache to hold values for backward pass
        self.cache: dict[str, np.ndarray] = dict()

    def forward(self, x: np.ndarray) -> np.ndarray:
        '''
        Implement sigmoid activation function.
        :param x: input of shape (num_classes,)
        :return: sigmoid output of shape (num_classes,)
        '''
        sigmoid = 1 / (1 + np.exp(-x))
        self.cache['sigmoid'] = sigmoid
        return sigmoid

    def backward(self, dz: np.ndarray) -> np.ndarray:
        '''
        :param dz: partial derivative of loss with respect to output of sigmoid activation
        :return: partial derivative of loss with respect to input of sigmoid activation
        '''
        sigmoid = self.cache['sigmoid']
        return dz * sigmoid * (1 - sigmoid)
# Note: weight_init_fn below refers to a function type that takes in a tuple of
# 2 integers (row, col) and returns a numpy array with the specified dimensions.
class Linear(object):
    def __init__(self, input_size: int, output_size: int,
                 weight_init_fn, learning_rate: float):
        '''
        :param input_size: number of units in the input of the layer
                           *not including* the folded bias
        :param output_size: number of units in the output of the layer
        :param weight_init_fn: function that creates and initializes weight
                               matrices for layer. This function takes in a
                               tuple (row, col) and returns a matrix with
                               shape row x col.
        :param learning_rate: learning rate for SGD training updates
        '''
        # TODO: Initialize weight matrix for this layer - since we are
        # folding the bias into the weight matrix, be careful about the
        # shape you pass in.
        self.w = weight_init_fn((output_size, input_size + 1))

        # TODO: Initialize matrix to store gradient with respect to weights
        self.dw = np.zeros_like(self.w, dtype=np.float64)

        # TODO: Initialize learning rate for SGD
        self.lr = learning_rate

        # Create cache to hold certain values for backward pass
        self.cache: dict[str, np.ndarray] = dict()

    def forward(self, x: np.ndarray) -> np.ndarray:
        '''
        :param x: Input to linear layer with shape (input_size,)
                  where input_size *does not include* the folded bias.
                  In other words, the input does not contain the bias column
                  and you will need to add it in yourself in this method.
        :return: output z of linear layer with shape (output_size,)

        HINT: You may want to cache some of the values you compute in this
        function. Inspect your expressions for backprop to see which values
        should be cached.
        '''
        if x.ndim == 1:
            x = np.append(1, x)
        else:
            x = np.append(np.ones((x.shape[0], 1)), x, axis=1)
        z = np.dot(x, self.w.T)
        self.cache['x'] = x
        return z

    def backward(self, dz: np.ndarray) -> np.ndarray:
        '''
        :param dz: partial derivative of loss with respect to output z of linear
        :return: dx, partial derivative of loss with respect to input x of linear

        Note that this function should set self.dw (gradient of weights with respect to loss)
        but not directly modify self.w; NN.step() is responsible for updating the weights.

        HINT: You may want to use some of the values you previously cached in
        your forward() method.
        '''
        # TODO: implement this based on your written answers!
        # Hint: when calculating dx, be careful to use the right "version" of
        # the weight matrix!
        dx = np.dot(self.w.T, dz)[1:]
        self.dw = np.dot(dz[:, np.newaxis], self.cache['x'][:, np.newaxis].T)
        return dx

    def step(self) -> None:
        '''
        Apply SGD update to weights using self.dw, which should have been
        set in NN.backward().
        '''
        # TODO: implement this!
        self.w -= self.lr * self.dw
        self.dw = np.zeros_like(self.w, dtype=np.float64)
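# Note on Linear.backward: self.w includes the bias column, so w.T @ dz has an
# extra leading entry corresponding to the folded-in bias "input"; the [1:]
# slice drops it before passing the gradient to the previous layer. self.dw is
# the outer product of dz and the cached (bias-augmented) input x, which
# matches the shape of self.w.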
class NN(object):
    def __init__(self, input_size: int, hidden_size: int, output_size: int,
                 weight_init_fn, learning_rate: float):
        '''
        Initialize neural network (NN) class. Note that this class is composed
        of the layer objects (Linear, Sigmoid) defined above.

        :param input_size: number of units in input to network
        :param hidden_size: number of units in the hidden layer of the network
        :param output_size: number of units in output of the network - this
                            should be equal to the number of classes
        :param weight_init_fn: function that creates and initializes weight
                               matrices for layer. This function takes in a
                               tuple (row, col) and returns a matrix with
                               shape row x col.
        :param learning_rate: learning rate for SGD training updates
        '''
        self.weight_init_fn = weight_init_fn
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # TODO: initialize modules (see section 9.1.2 of the writeup)
        # Hint: use the classes you've implemented above!
        self.linear1 = Linear(input_size, hidden_size, weight_init_fn, learning_rate)
        self.activation = Sigmoid()
        self.linear2 = Linear(hidden_size, output_size, weight_init_fn, learning_rate)

    def forward(self, x: np.ndarray) -> np.ndarray:
        '''
        Neural network forward computation.
        Follow the pseudocode!
        :param x: input data point *without the bias folded in*
        :return: output prediction with shape (num_classes,). This should be
                 a valid probability distribution over the classes.
        '''
        z1 = self.linear1.forward(x)
        a1 = self.activation.forward(z1)
        z2 = self.linear2.forward(a1)
        return softmax(z2)

    def backward(self, y: np.ndarray, y_hat: np.ndarray) -> None:
        '''
        Neural network backward computation.
        Follow the pseudocode!
        :param y: label (a number or an array containing a single element)
        :param y_hat: prediction with shape (num_classes,)
        '''
        dz2 = d_softmax_cross_entropy(y, y_hat)
        da1 = self.linear2.backward(dz2)
        dz1 = self.activation.backward(da1)
        dx = self.linear1.backward(dz1)

    def step(self) -> None:
        '''
        Apply SGD update to weights.
        '''
        self.linear1.step()
        self.linear2.step()

    def print_weights(self) -> None:
        '''
        An example of how to use logging to print out debugging info.

        Note that we use the debug logging level -- if we use a higher logging
        level, we will log things with the default logging configuration,
        causing potential slowdowns.

        Note that we log NumPy matrices on separate lines -- if we do not do this,
        the arrays will be turned into strings even when our logging is set to
        ignore debug, causing potential massive slowdowns.
        '''
        logging.debug(f"shape of w1: {self.linear1.w.shape}")
        logging.debug(self.linear1.w)
        logging.debug(f"shape of w2: {self.linear2.w.shape}")
        logging.debug(self.linear2.w)
def test(X: np.ndarray, y: np.ndarray, nn: NN):
    '''
    Compute the label and error rate.
    :param X: input data
    :param y: label
    :param nn: neural network class
    :return:
    labels: predicted labels
    error_rate: prediction error rate
    '''
    labels = []
    for x in X:
        y_hat = nn.forward(x)
        labels.append(np.argmax(y_hat))
    labels = np.array(labels)
    error_rate = np.mean(labels != y)
    return labels, error_rate
def train(X_tr: np.ndarray, y_tr: np.ndarray,
          X_test: np.ndarray, y_test: np.ndarray,
          nn: NN, n_epochs: int):
    '''
    Train the network using SGD for some epochs.
    :param X_tr: train data
    :param y_tr: train label
    :param X_test: test data
    :param y_test: test label
    :param nn: neural network class
    :param n_epochs: number of epochs to train for
    :return:
    train_losses: Training losses *after* each training epoch
    test_losses: Test losses *after* each training epoch
    '''
    # TODO: implement this!
    # Hint: Be sure to shuffle the train data at the start of each epoch
    # using *our provided* shuffle() function.
    train_losses = []
    test_losses = []
    for epoch in range(n_epochs):
        # Shuffle the *original* data each epoch and keep the result in
        # separate variables (see the shuffle pitfall discussed above).
        X_train, y_train = shuffle(X_tr, y_tr, epoch)
        for x, y in zip(X_train, y_train):
            y_hat = nn.forward(x)
            nn.backward(y, y_hat)
            nn.step()
        print('alpha:', nn.linear1.w)
        print('beta:', nn.linear2.w)

        # Mean cross-entropy on train and test data after this epoch.
        train_temp = []
        test_temp = []
        for x, y in zip(X_train, y_train):
            y_hat = nn.forward(x)
            train_temp.append(cross_entropy(y, y_hat))
        for x, y in zip(X_test, y_test):
            y_hat = nn.forward(x)
            test_temp.append(cross_entropy(y, y_hat))
        train_losses.append(np.mean(train_temp))
        test_losses.append(np.mean(test_temp))
    return train_losses, test_losses
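The assignment's main block is not shown above. The sketch below is a minimal, hypothetical way of wiring the pieces together; the prediction/metrics file format and the exact argument handling are my assumptions, not necessarily what the autograder expects.

if __name__ == '__main__':
    args = parser.parse_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    (X_tr, y_tr, X_te, y_te, out_tr, out_te, out_metrics,
     n_epochs, n_hid, init_flag, lr) = args2data(args)

    # Hypothetical wiring: pick the init function, build and train the network.
    weight_init_fn = random_init if init_flag == 1 else zero_init
    nn = NN(X_tr.shape[1], n_hid, len(np.unique(y_tr)), weight_init_fn, lr)
    train_losses, test_losses = train(X_tr, y_tr, X_te, y_te, nn, n_epochs)

    train_labels, train_error = test(X_tr, y_tr, nn)
    test_labels, test_error = test(X_te, y_te, nn)

    # Assumed output format -- adjust to whatever the assignment requires.
    np.savetxt(out_tr, train_labels, fmt='%d')
    np.savetxt(out_te, test_labels, fmt='%d')
    with open(out_metrics, 'w') as f:
        for i, (tr_loss, te_loss) in enumerate(zip(train_losses, test_losses)):
            f.write(f'epoch={i + 1} crossentropy(train): {tr_loss}\n')
            f.write(f'epoch={i + 1} crossentropy(validation): {te_loss}\n')
        f.write(f'error(train): {train_error}\n')
        f.write(f'error(validation): {test_error}\n')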
I am writing this down mainly to remember the design idea: a neural network is just separate components pieced together one by one. I felt this strongly again when reproducing the Transformer, so future paper reproductions should follow the same approach. For example, Linear and Sigmoid each implement their own forward and backward; once you think of the network this way, it no longer seems especially complicated. Noted here for future reference.
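As a small illustration of that composition idea (not part of the assignment code), the same pattern generalizes to any stack of modules that each expose forward/backward/step; the Sequential container below is a hypothetical sketch:

class Sequential(object):
    '''Hypothetical container that chains modules exposing forward/backward/step.'''
    def __init__(self, *modules):
        self.modules = list(modules)

    def forward(self, x):
        # Apply each module in order.
        for m in self.modules:
            x = m.forward(x)
        return x

    def backward(self, grad):
        # Backpropagate in the reverse order of the forward pass.
        for m in reversed(self.modules):
            grad = m.backward(grad)
        return grad

    def step(self):
        # Update only modules that have parameters to update.
        for m in self.modules:
            if hasattr(m, 'step'):
                m.step()

With something like this, the hidden layer of the network above could be expressed as Sequential(linear1, sigmoid, linear2), with softmax applied to its output.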