CS231n Assignment2

本篇文章中,我們將完成Assignment 2。

一 全連接網路

在本練習中,我們將使用更加模塊化的方法實現全連接網路。 每個模塊之間相互獨立,運行的時候可以相互調用,使得我們的神經網路結構十分靈活。

Fully-Connected Neural Nets(主)

# Load the (preprocessed) CIFAR-10 data and print the shape of every split.
# `get_CIFAR10_data` comes from the assignment's data utilities.
data = get_CIFAR10_data()
for k, v in data.items():
    print('%s: %s' % (k, v.shape))

# Output:
# X_val: (1000, 3, 32, 32)
# X_train: (49000, 3, 32, 32)
# X_test: (1000, 3, 32, 32)
# y_val: (1000,)
# y_train: (49000,)
# y_test: (1000,)


import numpy as np


def affine_forward(x, w, b):
    """Compute the forward pass of an affine (fully-connected) layer.

    Inputs:
    - x: input of shape (N, d_1, ..., d_k); each example is flattened to a
      vector of length D = d_1 * ... * d_k
    - w: weights of shape (D, M)
    - b: biases of shape (M,)

    Returns a tuple of:
    - out: output of shape (N, M)
    - cache: (x, w, b), needed by affine_backward
    """
    x_flat = x.reshape(x.shape[0], -1)
    out = x_flat.dot(w) + b
    cache = (x, w, b)
    return out, cache


def affine_backward(dout, cache):
    """Compute the backward pass of an affine layer.

    Inputs:
    - dout: upstream gradient of shape (N, M)
    - cache: tuple (x, w, b) as returned by affine_forward

    Returns a tuple of:
    - dx: gradient with respect to x, same shape as x
    - dw: gradient with respect to w, of shape (D, M)
    - db: gradient with respect to b, of shape (M,)
    """
    x, w, b = cache
    x_flat = x.reshape(x.shape[0], -1)
    db = np.sum(dout, axis=0)
    dw = x_flat.T.dot(dout)
    # Restore the original (unflattened) input shape.
    dx = dout.dot(w.T).reshape(x.shape)
    return dx, dw, db


def relu_forward(x):
    """Compute the forward pass of a ReLU layer.

    Input:
    - x: array of any shape

    Returns a tuple of:
    - out: array of the same shape as x with negative entries set to 0
    - cache: x, needed by relu_backward
    """
    out = np.copy(x)
    out[out < 0] = 0
    cache = x
    return out, cache


def relu_backward(dout, cache):
    """Compute the backward pass of a ReLU layer.

    Inputs:
    - dout: upstream gradient, same shape as the cached input
    - cache: the input x saved by relu_forward

    Returns:
    - dx: dout with the entries where x < 0 zeroed out
    """
    x = cache
    dx = np.copy(dout)
    dx[x < 0] = 0
    return dx


def affine_relu_forward(x, w, b):
    """Convenience layer: an affine transform followed by a ReLU.

    Inputs:
    - x: input to the affine layer
    - w, b: weights and biases of the affine layer

    Returns a tuple of:
    - out: output of the ReLU
    - cache: (fc_cache, relu_cache), needed by affine_relu_backward
    """
    a, fc_cache = affine_forward(x, w, b)
    out, relu_cache = relu_forward(a)
    cache = (fc_cache, relu_cache)
    return out, cache


def affine_relu_backward(dout, cache):
    """Backward pass for the affine-relu convenience layer.

    Inputs:
    - dout: upstream gradient
    - cache: (fc_cache, relu_cache) as returned by affine_relu_forward

    Returns a tuple (dx, dw, db) as produced by affine_backward.
    """
    fc_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db

下面完成損失層 SVM和Softmax:(與之前相同)

def svm_loss(x, y):
    """Compute the multiclass SVM (hinge) loss and its gradient.

    Inputs:
    - x: scores of shape (N, C); x[i, j] is the score of class j for example i
    - y: labels of shape (N,), with 0 <= y[i] < C

    Returns a tuple of:
    - loss: scalar loss averaged over the N examples
    - dx: gradient of the loss with respect to x, same shape as x
    """
    N = x.shape[0]
    rows = np.arange(N)
    correct_class_scores = x[rows, y]
    margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0)
    # The correct class never contributes to its own margin.
    margins[rows, y] = 0
    loss = np.sum(margins) / N

    num_pos = np.sum(margins > 0, axis=1)
    dx = np.zeros_like(x)
    dx[margins > 0] = 1
    dx[rows, y] -= num_pos
    dx /= N
    return loss, dx


def softmax_loss(x, y):
    """Compute the softmax cross-entropy loss and its gradient.

    The loss is evaluated in log space (log-sum-exp), so it stays finite even
    when the probability of the correct class underflows to zero.

    Inputs:
    - x: scores of shape (N, C)
    - y: labels of shape (N,), with 0 <= y[i] < C

    Returns a tuple of:
    - loss: scalar loss averaged over the N examples
    - dx: gradient of the loss with respect to x, same shape as x
    """
    N = x.shape[0]
    rows = np.arange(N)
    # Shifting by the row maximum leaves the softmax unchanged and prevents
    # overflow in exp.
    shifted = x - np.max(x, axis=1, keepdims=True)
    log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    log_probs = shifted - log_norm
    loss = -np.sum(log_probs[rows, y]) / N

    dx = np.exp(log_probs)
    dx[rows, y] -= 1
    dx /= N
    return loss, dx


import numpy as np

from cs231n.layers import *
from cs231n.layer_utils import *


class TwoLayerNet(object):
    """A two-layer fully-connected network.

    Architecture: affine - relu - affine - softmax.

    Learnable parameters live in the dictionary self.params under the keys
    'W1', 'b1', 'W2' and 'b2'.
    """

    def __init__(self, input_dim=3*32*32, hidden_dim=100, num_classes=10,
                 weight_scale=1e-3, reg=0.0):
        """Initialize a new network.

        Inputs:
        - input_dim: size of the (flattened) input
        - hidden_dim: size of the hidden layer
        - num_classes: number of classes to classify
        - weight_scale: standard deviation of the normal distribution used to
          initialize the weights
        - reg: L2 regularization strength
        """
        self.params = {}
        self.reg = reg
        self.params['W1'] = np.random.normal(0, weight_scale,
                                             (input_dim, hidden_dim))
        self.params['b1'] = np.zeros(hidden_dim)
        self.params['W2'] = np.random.normal(0, weight_scale,
                                             (hidden_dim, num_classes))
        self.params['b2'] = np.zeros(num_classes)

    def loss(self, X, y=None):
        """Compute scores, or loss and gradients, for a minibatch.

        Inputs:
        - X: input data of shape (N, d_1, ..., d_k)
        - y: labels of shape (N,), or None

        Returns:
        If y is None, run a test-time forward pass and return:
        - scores: array of shape (N, C) with the classification scores

        If y is not None, run a training-time forward and backward pass and
        return a tuple of:
        - loss: scalar loss (data loss plus L2 regularization)
        - grads: dictionary with the same keys as self.params, mapping each
          parameter name to the gradient of the loss with respect to it
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']

        # Forward pass.
        hidden, hidden_cache = affine_relu_forward(X, W1, b1)
        scores, scores_cache = affine_forward(hidden, W2, b2)

        # At test time only the scores are needed.
        if y is None:
            return scores

        grads = {}
        loss, dscores = softmax_loss(scores, y)
        loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2))

        # Backward pass; do not forget the regularization term on the weights.
        dhidden, dW2, db2 = affine_backward(dscores, scores_cache)
        grads['W2'] = dW2 + self.reg * W2
        grads['b2'] = db2

        _, dW1, db1 = affine_relu_backward(dhidden, hidden_cache)
        grads['W1'] = dW1 + self.reg * W1
        grads['b1'] = db1

        return loss, grads


import numpy as np

from cs231n import optim


class Solver(object):
    """Encapsulate the logic needed to train a classification model.

    The Solver performs stochastic gradient descent using the update rules
    defined in optim.py. It accepts training and validation data and labels,
    so it can periodically check accuracy on both to watch for overfitting.

    To train a model, construct a Solver with the model, the data and the
    training options (learning rate, batch size, ...), then call train().
    Afterwards model.params holds the parameters that performed best on the
    validation set. Example:

        data = {
            'X_train': ...,  # training data
            'y_train': ...,  # training labels
            'X_val': ...,    # validation data
            'y_val': ...,    # validation labels
        }
        model = MyAwesomeModel(hidden_size=100, reg=10)
        solver = Solver(model, data,
                        update_rule='sgd',
                        optim_config={'learning_rate': 1e-3},
                        lr_decay=0.95,
                        num_epochs=10, batch_size=100,
                        print_every=100)
        solver.train()

    A model used with Solver must provide:
    - model.params: dictionary mapping parameter names to numpy arrays
    - model.loss(X, y): returns scores when y is None, otherwise a tuple
      (loss, grads) where grads has the same keys as model.params
    """

    def __init__(self, model, data, **kwargs):
        """Construct a new Solver.

        Required arguments:
        - model: a model object conforming to the API described above
        - data: dictionary with
            'X_train': training data of shape (N_train, d_1, ..., d_k)
            'X_val': validation data of shape (N_val, d_1, ..., d_k)
            'y_train': training labels of shape (N_train,)
            'y_val': validation labels of shape (N_val,)

        Optional keyword arguments:
        - update_rule: name of an update rule in optim.py (default 'sgd')
        - optim_config: hyperparameters passed to the update rule; every rule
          uses a 'learning_rate' entry
        - lr_decay: factor the learning rate is multiplied by after each epoch
        - batch_size: minibatch size
        - num_epochs: number of epochs to train for
        - print_every: print the training loss every print_every iterations
        - verbose: if False, nothing is printed during training

        Raises:
        - ValueError: on unknown keyword arguments or an unknown update rule
        """
        self.model = model
        self.X_train = data['X_train']
        self.y_train = data['y_train']
        self.X_val = data['X_val']
        self.y_val = data['y_val']

        self.update_rule = kwargs.pop('update_rule', 'sgd')
        self.optim_config = kwargs.pop('optim_config', {})
        self.lr_decay = kwargs.pop('lr_decay', 1.0)
        self.batch_size = kwargs.pop('batch_size', 100)
        self.num_epochs = kwargs.pop('num_epochs', 10)
        self.print_every = kwargs.pop('print_every', 10)
        self.verbose = kwargs.pop('verbose', True)

        # Throw an error if there are extra keyword arguments.
        if len(kwargs) > 0:
            extra = ', '.join('"%s"' % k for k in kwargs.keys())
            raise ValueError('Unrecognized arguments %s' % extra)

        # Make sure the update rule exists, then replace the string name with
        # the actual function.
        if not hasattr(optim, self.update_rule):
            raise ValueError('Invalid update_rule "%s"' % self.update_rule)
        self.update_rule = getattr(optim, self.update_rule)

        self._reset()

    def _reset(self):
        """Set up book-keeping variables for optimization.

        Don't call this manually.
        """
        self.epoch = 0
        self.best_val_acc = 0
        self.best_params = {}
        self.loss_history = []
        self.train_acc_history = []
        self.val_acc_history = []

        # Give every parameter its own (shallow) copy of optim_config so that
        # per-parameter optimizer state does not leak between parameters.
        self.optim_configs = {}
        for p in self.model.params:
            self.optim_configs[p] = dict(self.optim_config)

    def _step(self):
        """Make a single gradient update.

        Called by train(); should not be called manually.
        """
        # Sample a minibatch of training data (with replacement).
        num_train = self.X_train.shape[0]
        batch_mask = np.random.choice(num_train, self.batch_size)
        X_batch = self.X_train[batch_mask]
        y_batch = self.y_train[batch_mask]

        # Compute loss and gradient.
        loss, grads = self.model.loss(X_batch, y_batch)
        self.loss_history.append(loss)

        # Perform a parameter update. Only existing keys are reassigned, so
        # iterating over a snapshot of the items is safe.
        for p, w in list(self.model.params.items()):
            dw = grads[p]
            config = self.optim_configs[p]
            next_w, next_config = self.update_rule(w, dw, config)
            self.model.params[p] = next_w
            self.optim_configs[p] = next_config

    def check_accuracy(self, X, y, num_samples=None, batch_size=100):
        """Check accuracy of the model on the provided data.

        Inputs:
        - X: array of data, of shape (N, d_1, ..., d_k)
        - y: array of labels, of shape (N,)
        - num_samples: if not None, subsample the data and only test the
          model on num_samples datapoints
        - batch_size: split X and y into batches of this size to avoid using
          too much memory

        Returns:
        - acc: fraction of instances correctly classified by the model
        """
        # Maybe subsample the data.
        N = X.shape[0]
        if num_samples is not None and N > num_samples:
            mask = np.random.choice(N, num_samples)
            N = num_samples
            X = X[mask]
            y = y[mask]

        # Compute predictions in batches. Floor division keeps the batch
        # count an integer on Python 3 as well.
        num_batches = N // batch_size
        if N % batch_size != 0:
            num_batches += 1
        y_pred = []
        for i in range(num_batches):
            start = i * batch_size
            end = (i + 1) * batch_size
            scores = self.model.loss(X[start:end])
            y_pred.append(np.argmax(scores, axis=1))
        y_pred = np.hstack(y_pred)
        acc = np.mean(y_pred == y)

        return acc

    def train(self):
        """Run optimization to train the model."""
        num_train = self.X_train.shape[0]
        iterations_per_epoch = max(num_train // self.batch_size, 1)
        num_iterations = self.num_epochs * iterations_per_epoch

        for t in range(num_iterations):
            self._step()

            # Maybe print training loss.
            if self.verbose and t % self.print_every == 0:
                print('(Iteration %d / %d) loss: %f' % (
                    t + 1, num_iterations, self.loss_history[-1]))

            # At the end of every epoch, increment the epoch counter and
            # decay the learning rate.
            epoch_end = (t + 1) % iterations_per_epoch == 0
            if epoch_end:
                self.epoch += 1
                for k in self.optim_configs:
                    self.optim_configs[k]['learning_rate'] *= self.lr_decay

            # Check train and val accuracy on the first iteration, the last
            # iteration, and at the end of each epoch. t runs from 0 to
            # num_iterations - 1, so that is the index of the last iteration.
            first_it = (t == 0)
            last_it = (t == num_iterations - 1)
            if first_it or last_it or epoch_end:
                train_acc = self.check_accuracy(self.X_train, self.y_train,
                                                num_samples=1000)
                val_acc = self.check_accuracy(self.X_val, self.y_val)
                self.train_acc_history.append(train_acc)
                self.val_acc_history.append(val_acc)

                if self.verbose:
                    print('(Epoch %d / %d) train acc: %f; val_acc: %f' % (
                        self.epoch, self.num_epochs, train_acc, val_acc))

                # Keep track of the best model.
                if val_acc > self.best_val_acc:
                    self.best_val_acc = val_acc
                    self.best_params = {}
                    for k, v in self.model.params.items():
                        self.best_params[k] = v.copy()

        # At the end of training swap the best params into the model. If the
        # validation accuracy never rose above zero there is no snapshot;
        # keep the current parameters rather than emptying the model.
        if self.best_params:
            self.model.params = self.best_params





cache += dx**2
x += - learning_rate * dx / (np.sqrt(cache) + eps)

這種方法的好處是,對於高梯度的權重,它們的有效學習率被降低了;而小梯度的權重迭代過程中學習率提升了。要注意的是,這裡開根號很重要。平滑參數eps是為了避免除以0的情況,eps一般取值1e-4 到1e-8。



cache = decay_rate * cache + (1 - decay_rate) * dx**2
x += - learning_rate * dx / (np.sqrt(cache) + eps)

其中,decay_rate是一個超參數,其值可以在 [0.9, 0.99, 0.999]中選擇。



m = beta1*m + (1-beta1)*dx
v = beta2*v + (1-beta2)*(dx**2)
x += - learning_rate * m / (np.sqrt(v) + eps)

論文中推薦eps = 1e-8,beta1 = 0.9,beta2 = 0.999。

"""First-order update rules used for training neural networks.

Every update rule has the same interface:

    def update(w, dw, config=None):

Inputs:
- w: numpy array with the current weights
- dw: numpy array of the same shape as w with the gradient of the loss with
  respect to w
- config: dictionary of hyperparameters (learning rate, momentum, ...) and,
  for stateful rules, cached moving averages

Returns a tuple of:
- next_w: the weights after the update
- config: the config dictionary to pass to the next call
"""
import numpy as np


def sgd(w, dw, config=None):
    """Perform vanilla stochastic gradient descent.

    config format:
    - learning_rate: scalar learning rate (default 1e-2)

    Note: w is updated in place and returned.
    """
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-2)

    w -= config['learning_rate'] * dw
    return w, config


def sgd_momentum(w, dw, config=None):
    """Perform stochastic gradient descent with momentum.

    config format:
    - learning_rate: scalar learning rate (default 1e-2)
    - momentum: scalar between 0 and 1 (default 0.9); 0 reduces to plain SGD
    - velocity: numpy array of the same shape as w and dw, storing a moving
      average of the gradients

    Note: w is updated in place and returned.
    """
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('momentum', 0.9)
    v = config.get('velocity', np.zeros_like(w))

    v = config['momentum'] * v - config['learning_rate'] * dw
    w += v
    config['velocity'] = v

    return w, config


def rmsprop(x, dx, config=None):
    """Apply the RMSProp update rule.

    Uses a moving average of squared gradient values to set adaptive
    per-parameter learning rates.

    config format:
    - learning_rate: scalar learning rate (default 1e-2)
    - decay_rate: scalar between 0 and 1, decay rate of the squared-gradient
      cache (default 0.99)
    - epsilon: small scalar that avoids division by zero (default 1e-8)
    - cache: moving average of the squared gradients

    Note: x is updated in place and returned.
    """
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('decay_rate', 0.99)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('cache', np.zeros_like(x))

    decay = config['decay_rate']
    config['cache'] = decay * config['cache'] + (1 - decay) * (dx * dx)
    x -= (config['learning_rate'] * dx
          / (np.sqrt(config['cache']) + config['epsilon']))

    return x, config


def adam(x, dx, config=None):
    """Apply the Adam update rule.

    Keeps moving averages of the gradient and of its square, with a bias
    correction term for the first iterations.

    config format:
    - learning_rate: scalar learning rate (default 1e-3)
    - beta1: decay rate of the moving average of the gradient (default 0.9)
    - beta2: decay rate of the moving average of the squared gradient
      (default 0.999)
    - epsilon: small scalar that avoids division by zero (default 1e-8)
    - m: moving average of the gradient
    - v: moving average of the squared gradient
    - t: iteration number

    Note: unlike the other rules, a new array is returned and x is left
    unmodified.
    """
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('m', np.zeros_like(x))
    config.setdefault('v', np.zeros_like(x))
    config.setdefault('t', 0)

    beta1, beta2 = config['beta1'], config['beta2']
    config['t'] += 1
    config['m'] = beta1 * config['m'] + (1 - beta1) * dx
    config['v'] = beta2 * config['v'] + (1 - beta2) * (dx ** 2)
    # Bias-corrected estimates.
    mb = config['m'] / (1 - beta1 ** config['t'])
    vb = config['v'] / (1 - beta2 ** config['t'])
    next_x = x - config['learning_rate'] * mb / (np.sqrt(vb) + config['epsilon'])

    return next_x, config


# Train a TwoLayerNet on the full CIFAR-10 training set with plain SGD.
model = TwoLayerNet()
solver = None

for k, v in data.items():
    print('%s: %s' % (k, v.shape))

model = TwoLayerNet(hidden_dim=100, reg=8.598929e-03)
solver = Solver(model, data,
                update_rule='sgd',
                optim_config={
                    'learning_rate': 1.207591e-03,
                },
                lr_decay=0.95,
                num_epochs=10, batch_size=100,
                print_every=49000)
solver.train()

X_val: (1000, 3, 32, 32)
X_train: (49000, 3, 32, 32)
X_test: (1000, 3, 32, 32)
y_val: (1000,)
y_train: (49000,)
y_test: (1000,)
(Iteration 1 / 4900) loss: 2.304148
(Epoch 0 / 10) train acc: 0.131000; val_acc: 0.113000
(Epoch 1 / 10) train acc: 0.442000; val_acc: 0.447000
(Epoch 2 / 10) train acc: 0.448000; val_acc: 0.466000
(Epoch 3 / 10) train acc: 0.469000; val_acc: 0.471000
(Epoch 4 / 10) train acc: 0.533000; val_acc: 0.496000
(Epoch 5 / 10) train acc: 0.553000; val_acc: 0.469000
(Epoch 6 / 10) train acc: 0.565000; val_acc: 0.503000
(Epoch 7 / 10) train acc: 0.584000; val_acc: 0.497000
(Epoch 8 / 10) train acc: 0.592000; val_acc: 0.511000
(Epoch 9 / 10) train acc: 0.560000; val_acc: 0.519000
(Epoch 10 / 10) train acc: 0.624000; val_acc: 0.501000


# Random search over learning rate and regularization strength.
results = {}
best_val = -1
best_model = None

# Sample 5 values of each hyperparameter log-uniformly:
# learning rate in [1e-5, 1e-1], regularization strength in [1e-4, 1e1].
learning_rates = 10 ** np.random.uniform(-5, -1, 5)
regularization_strengths = 10 ** np.random.uniform(-4, 1, 5)
print(learning_rates)
print(regularization_strengths)

for lr in learning_rates:
    for reg in regularization_strengths:
        model = TwoLayerNet(hidden_dim=100, reg=reg)
        solver = Solver(model, data,
                        update_rule='sgd',
                        optim_config={
                            'learning_rate': lr,
                        },
                        lr_decay=0.95,
                        num_epochs=10, batch_size=100,
                        verbose=False)
        solver.train()
        # train() leaves the best-on-validation parameters in the model, so
        # the matching score is best_val_acc, not the last history entry.
        val_acc = solver.best_val_acc
        if val_acc > best_val:
            best_val = val_acc
            best_model = model
        results[(lr, reg)] = val_acc

# Print out results.
for lr, reg in sorted(results):
    val_acc = results[(lr, reg)]
    print('lr %e reg %e val accuracy: %f' % (lr, reg, val_acc))

print('best validation accuracy achieved during cross-validation: %f' % best_val)
print("a")


首先完成 fc_net.py 里的 FullyConnectedNet類

1 關於Batch Normalization:

Batch Normalization就是在每一層的wx+b和f(wx+b)之間加一個歸一化(將wx+b歸一化成:均值為0,方差為1)。

通常:Means should be close to zero and stds close to one

gamma, beta = np.ones(C), np.zeros(C)

先給出Batch Normalization的演算法和反向求導公式:

import numpy as np


def batchnorm_forward(x, gamma, beta, bn_param):
    """Forward pass for batch normalization.

    In 'train' mode the batch mean and variance are used to normalize x and
    are folded into exponentially decaying running averages. In 'test' mode
    the running averages are used instead.

    Inputs:
    - x: data of shape (N, D)
    - gamma: scale parameter, broadcastable to (N, D)
    - beta: shift parameter, broadcastable to (N, D)
    - bn_param: dictionary with the keys
      - mode: 'train' or 'test' (required)
      - eps: constant for numeric stability (default 1e-5)
      - momentum: decay of the running mean / variance (default 0.9)
      - running_mean, running_var: running statistics; default to zeros

    Returns a tuple of:
    - out: normalized, scaled and shifted data of shape (N, D)
    - cache: values needed by batchnorm_backward (None in 'test' mode)

    Raises:
    - ValueError: if mode is neither 'train' nor 'test'

    Side effect: bn_param['running_mean'] and bn_param['running_var'] are
    written back on every call.
    """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)

    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))

    out, cache = None, None
    if mode == 'train':
        sample_mean = np.mean(x, axis=0, keepdims=True)  # [1, D]
        sample_var = np.var(x, axis=0, keepdims=True)    # [1, D]
        x_normalized = (x - sample_mean) / np.sqrt(sample_var + eps)  # [N, D]
        out = gamma * x_normalized + beta
        cache = (x_normalized, gamma, beta, sample_mean, sample_var, x, eps)
        running_mean = momentum * running_mean + (1 - momentum) * sample_mean
        running_var = momentum * running_var + (1 - momentum) * sample_var
    elif mode == 'test':
        x_normalized = (x - running_mean) / np.sqrt(running_var + eps)
        out = gamma * x_normalized + beta
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    # Store the updated running statistics back into bn_param.
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var

    return out, cache


def batchnorm_backward(dout, cache):
    """Backward pass for batch normalization.

    Inputs:
    - dout: upstream gradient of shape (N, D)
    - cache: the cache returned by batchnorm_forward in 'train' mode

    Returns a tuple of:
    - dx: gradient with respect to x, of shape (N, D)
    - dgamma: gradient with respect to gamma, same shape as gamma
    - dbeta: gradient with respect to beta, same shape as beta
    """
    x_normalized, gamma, beta, sample_mean, sample_var, x, eps = cache
    N, D = x.shape

    dx_normalized = dout * gamma                       # [N, D]
    x_mu = x - sample_mean                             # [N, D]
    sample_std_inv = 1.0 / np.sqrt(sample_var + eps)   # [1, D]

    dsample_var = (-0.5 * np.sum(dx_normalized * x_mu, axis=0, keepdims=True)
                   * sample_std_inv ** 3)
    dsample_mean = (-1.0 * np.sum(dx_normalized * sample_std_inv, axis=0,
                                  keepdims=True)
                    - 2.0 * dsample_var * np.mean(x_mu, axis=0, keepdims=True))

    dx1 = dx_normalized * sample_std_inv
    dx2 = 2.0 / N * dsample_var * x_mu
    dx = dx1 + dx2 + 1.0 / N * dsample_mean

    # Return parameter gradients in the shape of the parameters themselves,
    # so (D,) and (1, D) parameters both get a matching gradient.
    dgamma = np.sum(dout * x_normalized, axis=0).reshape(np.shape(gamma))
    dbeta = np.sum(dout, axis=0).reshape(np.shape(beta))

    return dx, dgamma, dbeta

2 Dropout


layers.py 里的 dropout_forward 和 dropout_backward函數

def dropout_forward(x, dropout_param):
    """Forward pass for (inverted) dropout.

    Inputs:
    - x: input data, of any shape
    - dropout_param: dictionary with the keys
      - p: as implemented here, each unit is KEPT with probability p and the
        kept units are scaled by 1 / p
      - mode: 'train' applies dropout, 'test' returns the input unchanged
      - seed: optional seed for numpy's global random generator, which makes
        the mask deterministic

    Returns a tuple of:
    - out: array of the same shape as x
    - cache: (dropout_param, mask); mask is None in 'test' mode

    Raises:
    - ValueError: if mode is neither 'train' nor 'test'
    """
    p, mode = dropout_param['p'], dropout_param['mode']
    if 'seed' in dropout_param:
        np.random.seed(dropout_param['seed'])

    mask = None
    if mode == 'train':
        # Dropout is active during training; dividing by p keeps the expected
        # activation unchanged so no rescaling is needed at test time.
        mask = (np.random.rand(*x.shape) < p) / p
        out = x * mask
    elif mode == 'test':
        # Dropout is switched off at test time.
        out = x
    else:
        raise ValueError('Invalid forward dropout mode "%s"' % mode)

    cache = (dropout_param, mask)
    out = out.astype(x.dtype, copy=False)

    return out, cache


def dropout_backward(dout, cache):
    """Backward pass for (inverted) dropout.

    Inputs:
    - dout: upstream gradient, of any shape
    - cache: (dropout_param, mask) from dropout_forward

    Returns:
    - dx: dout multiplied by the mask in 'train' mode, dout itself in 'test'
      mode

    Raises:
    - ValueError: if the cached mode is neither 'train' nor 'test'
    """
    dropout_param, mask = cache
    mode = dropout_param['mode']

    if mode == 'train':
        dx = dout * mask
    elif mode == 'test':
        dx = dout
    else:
        raise ValueError('Invalid backward dropout mode "%s"' % mode)
    return dx

from layer_utils import *


class FullyConnectedNet(object):
    """A fully-connected network with an arbitrary number of hidden layers.

    Optionally uses dropout and batch normalization. For L layers the
    architecture is

        {affine - [batch norm] - relu - [dropout]} x (L - 1) - affine - softmax

    where batch norm and dropout are optional and the {...} block is repeated
    L - 1 times. Learnable parameters are stored in self.params under the
    keys 'W1', 'b1', ['gamma1', 'beta1',] 'W2', ...
    """

    def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
                 dropout=0, use_batchnorm=False, reg=0.0,
                 weight_scale=1e-2, dtype=np.float32, seed=None):
        """Initialize a new FullyConnectedNet.

        Inputs:
        - hidden_dims: sequence of integers with the size of each hidden layer
        - input_dim: size of the (flattened) input
        - num_classes: number of classes to classify
        - dropout: scalar between 0 and 1; dropout layers are used only when
          it is > 0, and the value is forwarded as 'p' to dropout_forward
          (NOTE(review): confirm that this matches the meaning of p there)
        - use_batchnorm: whether to use batch normalization
        - reg: L2 regularization strength
        - weight_scale: standard deviation used to initialize the weights
        - dtype: numpy datatype used for all computations
        - seed: if not None, passed to the dropout layers to make them
          deterministic
        """
        self.use_batchnorm = use_batchnorm
        self.use_dropout = dropout > 0
        self.reg = reg
        self.num_layers = 1 + len(hidden_dims)
        self.dtype = dtype
        self.params = {}

        # list() lets callers pass a tuple of hidden sizes as well.
        layers_dims = [input_dim] + list(hidden_dims) + [num_classes]
        for i in range(self.num_layers):
            idx = str(i + 1)
            fan_in, fan_out = layers_dims[i], layers_dims[i + 1]
            self.params['W' + idx] = weight_scale * np.random.randn(fan_in, fan_out)
            self.params['b' + idx] = np.zeros((1, fan_out))
            # The last (output) layer has no batch normalization.
            if self.use_batchnorm and i < len(hidden_dims):
                self.params['gamma' + idx] = np.ones((1, fan_out))
                self.params['beta' + idx] = np.zeros((1, fan_out))

        # When dropout is used, every dropout layer receives the same
        # dropout_param so it knows the probability and the mode
        # (train / test).
        self.dropout_param = {}
        if self.use_dropout:
            self.dropout_param = {'mode': 'train', 'p': dropout}
            if seed is not None:
                self.dropout_param['seed'] = seed

        # With batch normalization we need to keep track of running means and
        # variances, so each batch normalization layer gets its own bn_param:
        # self.bn_params[0] for the first one, self.bn_params[1] for the
        # second one, and so on.
        self.bn_params = []
        if self.use_batchnorm:
            self.bn_params = [{'mode': 'train'}
                              for _ in range(self.num_layers - 1)]

        # Cast all parameters to the correct datatype.
        self.params = {k: v.astype(dtype) for k, v in self.params.items()}

    def loss(self, X, y=None):
        """Compute loss and gradient for the fully-connected net.

        Inputs:
        - X: input data of shape (N, d_1, ..., d_k)
        - y: labels of shape (N,), or None

        Returns:
        If y is None, the scores of shape (N, C) from a test-time forward
        pass. Otherwise a tuple (loss, grads), where grads has the same keys
        as self.params.
        """
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'

        # Set train/test mode for batchnorm params and dropout param since
        # they behave differently during training and testing.
        if self.use_dropout:
            self.dropout_param['mode'] = mode
        if self.use_batchnorm:
            for bn_param in self.bn_params:
                bn_param['mode'] = mode

        num_hidden = self.num_layers - 1

        # Forward pass through the hidden layers; caches[i] collects what
        # hidden layer i needs for its backward pass.
        caches = []
        out = X
        for i in range(num_hidden):
            idx = str(i + 1)
            W, b = self.params['W' + idx], self.params['b' + idx]
            layer_cache = {}
            if self.use_batchnorm:
                gamma = self.params['gamma' + idx]
                beta = self.params['beta' + idx]
                h, layer_cache['affine'] = affine_forward(out, W, b)
                h, layer_cache['bn'] = batchnorm_forward(h, gamma, beta,
                                                         self.bn_params[i])
                out, layer_cache['relu'] = relu_forward(h)
            else:
                out, layer_cache['affine_relu'] = affine_relu_forward(out, W, b)
            if self.use_dropout:
                out, layer_cache['dropout'] = dropout_forward(out,
                                                              self.dropout_param)
            caches.append(layer_cache)

        # Final affine layer producing the class scores.
        last = str(self.num_layers)
        scores, scores_cache = affine_forward(out, self.params['W' + last],
                                              self.params['b' + last])

        # If test mode return early.
        if mode == 'test':
            return scores

        grads = {}
        data_loss, dscores = softmax_loss(scores, y)
        reg_loss = 0.0
        for i in range(self.num_layers):
            W = self.params['W' + str(i + 1)]
            reg_loss += 0.5 * self.reg * np.sum(W * W)
        loss = data_loss + reg_loss

        # Backward pass: walk the layers in reverse order.
        dout, grads['W' + last], grads['b' + last] = affine_backward(
            dscores, scores_cache)
        for i in reversed(range(num_hidden)):
            idx = str(i + 1)
            layer_cache = caches[i]
            if self.use_dropout:
                dout = dropout_backward(dout, layer_cache['dropout'])
            if self.use_batchnorm:
                dout = relu_backward(dout, layer_cache['relu'])
                dout, grads['gamma' + idx], grads['beta' + idx] = \
                    batchnorm_backward(dout, layer_cache['bn'])
                dout, grads['W' + idx], grads['b' + idx] = affine_backward(
                    dout, layer_cache['affine'])
            else:
                dout, grads['W' + idx], grads['b' + idx] = \
                    affine_relu_backward(dout, layer_cache['affine_relu'])

        # Add the regularization gradient contribution.
        for i in range(self.num_layers):
            key = 'W' + str(i + 1)
            grads[key] += self.reg * self.params[key]

        return loss, grads


# Overfit a three-layer network on 50 training images.
num_train = 50
small_data = {
    'X_train': data['X_train'][:num_train],
    'y_train': data['y_train'][:num_train],
    'X_val': data['X_val'],
    'y_val': data['y_val'],
}

weight_scale = 3.862881e-02   # starting point was 1e-2
learning_rate = 1.946705e-03  # starting point was 1e-4
model = FullyConnectedNet([100, 100],
                          weight_scale=weight_scale, dtype=np.float64,
                          use_batchnorm=False)
solver = Solver(model, small_data,
                print_every=10, num_epochs=20, batch_size=25,
                update_rule='sgd',
                optim_config={
                    'learning_rate': learning_rate,
                },
                verbose=False)
solver.train()

plt.plot(solver.loss_history, 'o')
plt.title('Training loss history')
plt.xlabel('Iteration')
plt.ylabel('Training loss')
plt.show()


# Random search over learning rate and weight scale on the 50-image subset.
results = {}
best_val = -1
best_model = None

num_train = 50
small_data = {
    'X_train': data['X_train'][:num_train],
    'y_train': data['y_train'][:num_train],
    'X_val': data['X_val'],
    'y_val': data['y_val'],
}

learning_rates = 10 ** np.random.uniform(-3, -2, 2)
weight_scales = 10 ** np.random.uniform(-2, -1, 2)

for lr in learning_rates:
    for ws in weight_scales:
        model = FullyConnectedNet([100, 100],
                                  weight_scale=ws, dtype=np.float64)
        solver = Solver(model, small_data,
                        print_every=10, num_epochs=20, batch_size=25,
                        update_rule='sgd',
                        optim_config={
                            'learning_rate': lr,
                        },
                        verbose=False)
        solver.train()
        train_acc = solver.train_acc_history[-1]
        results[(lr, ws)] = train_acc

# Print out results.
for lr, ws in sorted(results):
    train_acc = results[(lr, ws)]
    print('lr %e ws %e train accuracy: %f' % (lr, ws, train_acc))

# Output:
# lr 1.858925e-03 ws 1.595077e-02 train accuracy: 0.980000
# lr 1.858925e-03 ws 2.034488e-02 train accuracy: 1.000000
# lr 2.270886e-03 ws 1.595077e-02 train accuracy: 0.980000
# lr 2.270886e-03 ws 2.034488e-02 train accuracy: 1.000000


# Train a five-layer network on the full training set, using the
# hyperparameters found by the earlier search.
results = {}
best_val = -1
best_model = None

num_train = len(data['X_train'])
small_data = {
    'X_train': data['X_train'][:num_train],
    'y_train': data['y_train'][:num_train],
    'X_val': data['X_val'],
    'y_val': data['y_val'],
}

# Search ranges used to find these values:
#   learning rate: 10**np.random.uniform(-7, 1, 20)
#   weight scale:  10**np.random.uniform(-4, 0, 20)
# Other values tried: lr 2.379994e-04 and 10, ws 5.923238e-02.
learning_rates = [3.113669e-04]
weight_scales = [2.461858e-02]

for lr in learning_rates:
    for ws in weight_scales:
        model = FullyConnectedNet([100, 100, 100, 100],
                                  weight_scale=ws, dtype=np.float64,
                                  use_batchnorm=False, reg=1e-2)
        solver = Solver(model, small_data,
                        print_every=100, num_epochs=10, batch_size=25,
                        update_rule='adam',
                        optim_config={
                            'learning_rate': lr,
                        },
                        lr_decay=0.9,
                        verbose=True)
        solver.train()
        train_acc = solver.train_acc_history[-1]
        val_acc = solver.val_acc_history[-1]
        results[(lr, ws)] = train_acc, val_acc

二 卷積神經網路(Convolutional Neural Networks, CNNs)



輸出數據體在空間上的尺寸可以通過輸入數據體尺寸(W),卷積層中神經元的感受野尺寸(F),步長(S)和零填充的數量(P)的函數來計算:輸出數據體的空間尺寸為(W-F +2P)/S+1 一般說來,當步長S=1時,零填充的值是P=(F-1)/2,這樣就能保證輸入和輸出數據體有相同的空間尺寸。

2 匯聚層





INPUT --> [[CONV --> RELU]*N --> POOL?]*M --> [FC --> RELU]*K --> FC(OUTPUT)


· INPUT --> FC/OUT 這其實就是個線性分類器

· INPUT --> CONV --> RELU --> FC/OUT

· INPUT --> [CONV --> RELU --> POOL]*2 --> FC --> RELU --> FC/OUT

· INPUT --> [CONV --> RELU --> CONV --> RELU --> POOL]*3 --> [FC --> RELU]*2 --> FC/OUT



# Load the (preprocessed) CIFAR10 data.
data = get_CIFAR10_data()
for k, v in data.items():
    print('%s: %s' % (k, v.shape))

X_val: (1000, 3, 32, 32)
X_train: (49000, 3, 32, 32)
X_test: (1000, 3, 32, 32)
y_val: (1000,)
y_train: (49000,)
y_test: (1000,)


def conv_forward_naive(x, w, b, conv_param):
    """A naive implementation of the forward pass for a convolutional layer.

    Inputs:
    - x: input data of shape (N, C, H, W)
    - w: filter weights of shape (F, C, HH, WW)
    - b: biases, indexed as b[f] for filter f
    - conv_param: dictionary with the keys
      - 'stride': number of pixels between adjacent receptive fields
      - 'pad': number of pixels of zero-padding on each side of H and W

    Returns a tuple of:
    - out: output of shape (N, F, H', W') where
        H' = 1 + (H + 2 * pad - HH) // stride
        W' = 1 + (W + 2 * pad - WW) // stride
    - cache: (x, w, b, conv_param)
    """
    stride, pad = conv_param['stride'], conv_param['pad']
    N, C, H, W = x.shape
    F, C, HH, WW = w.shape

    # Zero-pad the two spatial axes only.
    x_padded = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)),
                      mode='constant')
    # Floor division keeps the output sizes integers on Python 3 as well.
    H_new = 1 + (H + 2 * pad - HH) // stride
    W_new = 1 + (W + 2 * pad - WW) // stride
    s = stride

    out = np.zeros((N, F, H_new, W_new))
    for i in range(N):              # i-th image
        for f in range(F):          # f-th filter
            for j in range(H_new):
                for k in range(W_new):
                    # Window: all channels, rows j*s .. j*s+HH,
                    # columns k*s .. k*s+WW of the padded image.
                    window = x_padded[i, :, j*s:HH+j*s, k*s:WW+k*s]
                    out[i, f, j, k] = np.sum(window * w[f]) + b[f]

    cache = (x, w, b, conv_param)
    return out, cache


def conv_backward_naive(dout, cache):
    """A naive implementation of the backward pass for a convolutional layer.

    Inputs:
    - dout: upstream gradient of shape (N, F, H', W')
    - cache: tuple (x, w, b, conv_param) as returned by conv_forward_naive

    Returns a tuple of:
    - dx: gradient with respect to x, same shape as x
    - dw: gradient with respect to w, same shape as w
    - db: gradient with respect to b, same shape as b
    """
    x, w, b, conv_param = cache
    pad = conv_param['pad']
    stride = conv_param['stride']
    F, C, HH, WW = w.shape
    N, C, H, W = x.shape
    # Floor division keeps the output sizes integers on Python 3 as well.
    H_new = 1 + (H + 2 * pad - HH) // stride
    W_new = 1 + (W + 2 * pad - WW) // stride

    dw = np.zeros_like(w)
    db = np.zeros_like(b)

    s = stride
    pad_width = ((0, 0), (0, 0), (pad, pad), (pad, pad))
    x_padded = np.pad(x, pad_width, mode='constant')
    # Gradients are accumulated on the padded grid and cropped at the end.
    dx_padded = np.pad(np.zeros_like(x), pad_width, mode='constant')

    for i in range(N):              # i-th image
        for f in range(F):          # f-th filter
            for j in range(H_new):
                for k in range(W_new):
                    window = x_padded[i, :, j*s:HH+j*s, k*s:WW+k*s]
                    grad = dout[i, f, j, k]
                    db[f] += grad
                    dw[f] += window * grad
                    dx_padded[i, :, j*s:HH+j*s, k*s:WW+k*s] += w[f] * grad

    # Unpad.
    dx = dx_padded[:, :, pad:pad+H, pad:pad+W]

    return dx, dw, db


def max_pool_forward_naive(x, pool_param):
    """A naive implementation of the forward pass for a max pooling layer.

    Inputs:
    - x: input data of shape (N, C, H, W)
    - pool_param: dictionary with the keys
      - 'pool_height': height of each pooling region
      - 'pool_width': width of each pooling region
      - 'stride': distance between adjacent pooling regions

    Returns a tuple of:
    - out: output of shape (N, C, H', W') where
        H' = 1 + (H - pool_height) // stride
        W' = 1 + (W - pool_width) // stride
    - cache: (x, pool_param)
    """
    HH, WW = pool_param['pool_height'], pool_param['pool_width']
    s = pool_param['stride']
    N, C, H, W = x.shape
    # Floor division keeps the output sizes integers on Python 3 as well.
    H_new = 1 + (H - HH) // s
    W_new = 1 + (W - WW) // s

    out = np.zeros((N, C, H_new, W_new))
    for i in range(N):
        for j in range(C):
            for k in range(H_new):
                for l in range(W_new):
                    window = x[i, j, k*s:HH+k*s, l*s:WW+l*s]
                    out[i, j, k, l] = np.max(window)

    cache = (x, pool_param)
    return out, cache


def max_pool_backward_naive(dout, cache):
    """A naive implementation of the backward pass for a max pooling layer.

    Inputs:
    - dout: upstream gradient of shape (N, C, H', W')
    - cache: tuple (x, pool_param) as returned by max_pool_forward_naive

    Returns:
    - dx: gradient with respect to x, same shape as x. Every position that
      equals the maximum of a window receives that window's gradient.
    """
    x, pool_param = cache
    HH, WW = pool_param['pool_height'], pool_param['pool_width']
    s = pool_param['stride']
    N, C, H, W = x.shape
    H_new = 1 + (H - HH) // s
    W_new = 1 + (W - WW) // s

    dx = np.zeros_like(x)
    for i in range(N):
        for j in range(C):
            for k in range(H_new):
                for l in range(W_new):
                    window = x[i, j, k*s:HH+k*s, l*s:WW+l*s]
                    m = np.max(window)
                    # Accumulate instead of assigning: when stride is smaller
                    # than the pool size the windows overlap, and an input
                    # position can be the maximum of several of them.
                    dx[i, j, k*s:HH+k*s, l*s:WW+l*s] += \
                        (window == m) * dout[i, j, k, l]

    return dx


def conv_relu_forward(x, w, b, conv_param):
    """A convenience layer that performs a convolution followed by a ReLU.

    Inputs:
    - x: input to the convolutional layer
    - w, b, conv_param: weights and parameters for the convolutional layer

    Returns a tuple of:
    - out: output from the ReLU
    - cache: object to give to the backward pass
    """
    a, conv_cache = conv_forward_fast(x, w, b, conv_param)
    out, relu_cache = relu_forward(a)
    cache = (conv_cache, relu_cache)
    return out, cache


def conv_relu_backward(dout, cache):
    """Backward pass for the conv-relu convenience layer.

    Inputs:
    - dout: upstream gradient
    - cache: (conv_cache, relu_cache) from conv_relu_forward

    Returns a tuple (dx, dw, db) as produced by conv_backward_fast.
    """
    conv_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    dx, dw, db = conv_backward_fast(da, conv_cache)
    return dx, dw, db


def conv_relu_pool_forward(x, w, b, conv_param, pool_param):
    """Convenience layer that performs a convolution, a ReLU, and a pool.

    Inputs:
    - x: input to the convolutional layer
    - w, b, conv_param: weights and parameters for the convolutional layer
    - pool_param: parameters for the pooling layer

    Returns a tuple of:
    - out: output from the pooling layer
    - cache: object to give to the backward pass
    """
    a, conv_cache = conv_forward_fast(x, w, b, conv_param)
    s, relu_cache = relu_forward(a)
    out, pool_cache = max_pool_forward_fast(s, pool_param)
    cache = (conv_cache, relu_cache, pool_cache)
    return out, cache


def conv_relu_pool_backward(dout, cache):
    """Backward pass for the conv-relu-pool convenience layer.

    Inputs:
    - dout: upstream gradient
    - cache: (conv_cache, relu_cache, pool_cache) from
      conv_relu_pool_forward

    Returns a tuple (dx, dw, db) as produced by conv_backward_fast.
    """
    conv_cache, relu_cache, pool_cache = cache
    ds = max_pool_backward_fast(dout, pool_cache)
    da = relu_backward(ds, relu_cache)
    dx, dw, db = conv_backward_fast(da, conv_cache)
    return dx, dw, db


from layer_utils import *


class ThreeLayerConvNet(object):
    """A three-layer convolutional network.

    Architecture:

        conv - relu - 2x2 max pool - affine - relu - affine - softmax

    The network operates on minibatches of data of shape (N, C, H, W).
    """

    def __init__(self, input_dim=(3, 32, 32), num_filters=32, filter_size=7,
                 hidden_dim=100, num_classes=10, weight_scale=1e-3, reg=0.0,
                 dtype=np.float32):
        """Initialize a new network.

        Inputs:
        - input_dim: tuple (C, H, W) giving the size of the input data
        - num_filters: number of filters in the convolutional layer
        - filter_size: height and width of the filters
        - hidden_dim: number of units in the fully-connected hidden layer
        - num_classes: number of scores produced by the final affine layer
        - weight_scale: standard deviation used to initialize the weights
        - reg: L2 regularization strength
        - dtype: numpy datatype used for computation
        """
        self.params = {}
        self.reg = reg
        self.dtype = dtype

        # Initialize weights and biases.
        C, H, W = input_dim
        self.params['W1'] = weight_scale * np.random.randn(
            num_filters, C, filter_size, filter_size)
        self.params['b1'] = np.zeros((1, num_filters))
        # The conv layer preserves H x W and the 2x2 pool halves each side,
        # so the affine layer sees num_filters * H * W / 4 inputs. Floor
        # division keeps that an integer on Python 3 as well.
        self.params['W2'] = weight_scale * np.random.randn(
            num_filters * H * W // 4, hidden_dim)
        self.params['b2'] = np.zeros((1, hidden_dim))
        self.params['W3'] = weight_scale * np.random.randn(hidden_dim,
                                                           num_classes)
        self.params['b3'] = np.zeros((1, num_classes))

        self.params = {k: v.astype(dtype) for k, v in self.params.items()}

    def loss(self, X, y=None):
        """Evaluate loss and gradient for the three-layer convolutional net.

        Inputs:
        - X: input data of shape (N, C, H, W)
        - y: labels of shape (N,), or None

        Returns:
        If y is None, the scores from a forward pass. Otherwise a tuple
        (loss, grads), where grads has the same keys as self.params.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        W3, b3 = self.params['W3'], self.params['b3']

        # conv_param for the forward pass of the convolutional layer; this
        # padding preserves the spatial size for odd filter sizes.
        filter_size = W1.shape[2]
        conv_param = {'stride': 1, 'pad': (filter_size - 1) // 2}

        # pool_param for the forward pass of the max-pooling layer.
        pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

        # Compute the forward pass.
        a1, cache1 = conv_relu_pool_forward(X, W1, b1, conv_param, pool_param)
        a2, cache2 = affine_relu_forward(a1, W2, b2)
        scores, cache3 = affine_forward(a2, W3, b3)

        if y is None:
            return scores

        # Compute the backward pass.
        data_loss, dscores = softmax_loss(scores, y)
        da2, dW3, db3 = affine_backward(dscores, cache3)
        da1, dW2, db2 = affine_relu_backward(da2, cache2)
        dX, dW1, db1 = conv_relu_pool_backward(da1, cache1)

        # Add regularization.
        dW1 += self.reg * W1
        dW2 += self.reg * W2
        dW3 += self.reg * W3
        reg_loss = 0.5 * self.reg * sum(np.sum(W * W) for W in [W1, W2, W3])

        loss = data_loss + reg_loss
        grads = {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2,
                 'W3': dW3, 'b3': db3}

        return loss, grads


建立新網路後,首先應該做的事情之一是檢查loss。使用softmax損失時,對於隨機初始化的權重(且沒有正則化),C個類別的預期損失約為log(C);例如C=10時約為2.303。加入正則化後,這個值會略有上升。

# Sanity-check the initial loss of a freshly initialized network.
model = ThreeLayerConvNet()

N = 50
X = np.random.randn(N, 3, 32, 32)
y = np.random.randint(10, size=N)

loss, grads = model.loss(X, y)
print('Initial loss (no regularization):  %s' % loss)

model.reg = 0.5
loss, grads = model.loss(X, y)
print('Initial loss (with regularization):  %s' % loss)

Initial loss (no regularization): 2.30258332514
Initial loss (with regularization): 2.50922157191


在損失看起來合理後,使用數值法梯度檢查,以確保反向傳播是正確的。 當使用數值法梯度檢查時,應該使用少量的數據和每層的少量神經元。

# Numerical gradient check on a tiny network and a tiny batch.
num_inputs = 2
input_dim = (3, 16, 16)
reg = 0.0
num_classes = 10
X = np.random.randn(num_inputs, *input_dim)
y = np.random.randint(num_classes, size=num_inputs)

model = ThreeLayerConvNet(num_filters=3, filter_size=3,
                          input_dim=input_dim, hidden_dim=7,
                          dtype=np.float64)
loss, grads = model.loss(X, y)
for param_name in sorted(grads):
    # The argument is ignored; the closure re-evaluates the loss on the
    # model's current parameters.
    f = lambda _: model.loss(X, y)[0]
    param_grad_num = eval_numerical_gradient(f, model.params[param_name],
                                             verbose=False, h=1e-6)
    e = rel_error(param_grad_num, grads[param_name])
    print('%s max relative error: %e' % (param_name, e))

W1 max relative error: 1.959851e-03
W2 max relative error: 2.075249e-02
W3 max relative error: 5.021812e-05
b1 max relative error: 7.648966e-05
b2 max relative error: 8.115147e-07
b3 max relative error: 9.181460e-10


一個好的技巧是訓練你的模型只用幾個訓練樣本。 你應該能夠對小數據集進行過擬合,得到非常高的訓練集準確性和相對較低的驗證集準確性。

# Overfit the convolutional network on 100 training images.
num_train = 100
small_data = {
    'X_train': data['X_train'][:num_train],
    'y_train': data['y_train'][:num_train],
    'X_val': data['X_val'],
    'y_val': data['y_val'],
}

model = ThreeLayerConvNet(weight_scale=1e-2)

solver = Solver(model, small_data,
                num_epochs=10, batch_size=50,
                update_rule='adam',
                optim_config={
                    'learning_rate': 1e-3,
                },
                verbose=True, print_every=1)
solver.train()

(Iteration 1 / 20) loss: 2.273136
(Epoch 0 / 10) train acc: 0.220000; val_acc: 0.097000
(Iteration 2 / 20) loss: 2.268275
(Epoch 1 / 10) train acc: 0.260000; val_acc: 0.161000
(Iteration 3 / 20) loss: 3.628737
(Iteration 4 / 20) loss: 2.487226
(Epoch 2 / 10) train acc: 0.330000; val_acc: 0.127000
(Iteration 5 / 20) loss: 2.216854
(Iteration 6 / 20) loss: 2.045461
(Epoch 3 / 10) train acc: 0.390000; val_acc: 0.145000
(Iteration 7 / 20) loss: 1.691353
(Iteration 8 / 20) loss: 1.607438
(Epoch 4 / 10) train acc: 0.520000; val_acc: 0.184000
(Iteration 9 / 20) loss: 1.310616
(Iteration 10 / 20) loss: 1.450377
(Epoch 5 / 10) train acc: 0.570000; val_acc: 0.175000
(Iteration 11 / 20) loss: 1.740461
(Iteration 12 / 20) loss: 1.087948
(Epoch 6 / 10) train acc: 0.730000; val_acc: 0.224000
(Iteration 13 / 20) loss: 1.164601
(Iteration 14 / 20) loss: 0.858544
(Epoch 7 / 10) train acc: 0.730000; val_acc: 0.243000
(Iteration 15 / 20) loss: 0.758457
(Iteration 16 / 20) loss: 0.699982
(Epoch 8 / 10) train acc: 0.810000; val_acc: 0.240000
(Iteration 17 / 20) loss: 0.593262
(Iteration 18 / 20) loss: 0.551767
(Epoch 9 / 10) train acc: 0.810000; val_acc: 0.214000
(Iteration 19 / 20) loss: 0.717523
(Iteration 20 / 20) loss: 0.383384
(Epoch 10 / 10) train acc: 0.870000; val_acc: 0.183000



空間批量歸一化(Spatial Batch Normalization)




同樣的:#Means should be close to zero and stds close to one

gamma, beta = np.ones(C), np.zeros(C)

def spatial_batchnorm_forward(x, gamma, beta, bn_param):
    """Forward pass for spatial batch normalization.

    Statistics are computed per channel, over the batch and both spatial
    axes, by moving the channel axis last and flattening everything else
    into rows before calling batchnorm_forward.

    Inputs:
    - x: input data of shape (N, C, H, W)
    - gamma: scale parameter with one entry per channel
    - beta: shift parameter with one entry per channel
    - bn_param: dictionary passed through to batchnorm_forward

    Returns a tuple of:
    - out: output data of shape (N, C, H, W)
    - cache: the cache produced by batchnorm_forward
    """
    N, C, H, W = x.shape
    # (N, C, H, W) -> (N, H, W, C) -> (N*H*W, C)
    x_new = x.transpose(0, 2, 3, 1).reshape(N * H * W, C)
    out, cache = batchnorm_forward(x_new, gamma, beta, bn_param)
    # (N*H*W, C) -> (N, H, W, C) -> (N, C, H, W)
    out = out.reshape(N, H, W, C).transpose(0, 3, 1, 2)
    return out, cache


def spatial_batchnorm_backward(dout, cache):
    """Backward pass for spatial batch normalization.

    Inputs:
    - dout: upstream gradient of shape (N, C, H, W)
    - cache: the cache returned by spatial_batchnorm_forward

    Returns a tuple of:
    - dx: gradient with respect to the input, of shape (N, C, H, W)
    - dgamma, dbeta: parameter gradients as returned by batchnorm_backward
    """
    N, C, H, W = dout.shape
    dout_new = dout.transpose(0, 2, 3, 1).reshape(N * H * W, C)
    dx, dgamma, dbeta = batchnorm_backward(dout_new, cache)
    dx = dx.reshape(N, H, W, C).transpose(0, 3, 1, 2)
    return dx, dgamma, dbeta


