A Detailed Walkthrough of TensorFlow's Official ResNet Code

I have only recently started working with TensorFlow, and a real project of mine happens to need ResNet, so I am sharing my study notes on the official code. Most of the explanation is given in the form of comments; if you spot any mistakes, please point them out, thanks.

First, let's look at how the code defines the individual layers. These definitions are much the same across different networks, so readers who are already familiar with them can skip ahead.

1. Layer definitions

The convolution layer. `name` is the (unique) name of the layer, `x` is the input, `filter_size` is the kernel size, and `in_filters` and `out_filters` are, as the names suggest, the number of input and output feature maps. The kernel therefore has shape [filter_size, filter_size, in_filters, out_filters]. `strides` is the usual stride parameter.

As in most convolutional networks, the kernel weights are initialized from a zero-mean normal distribution with stddev = sqrt(2 / n), where n = filter_size * filter_size * out_filters, i.e. He (MSRA) initialization. SAME padding is used, so for stride 1 the spatial dimensions of the output match those of the input.

Note that the input x has shape [batch, in_height, in_width, in_channels].

```python
def _conv(self, name, x, filter_size, in_filters, out_filters, strides):
  """Convolution layer."""
  with tf.variable_scope(name):
    n = filter_size * filter_size * out_filters
    # He initialization: normal with stddev = sqrt(2 / n).
    kernel = tf.get_variable(
        name + 'DW', [filter_size, filter_size, in_filters, out_filters],
        tf.float32,
        initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / n)))
    return tf.nn.conv2d(x, kernel, strides, padding='SAME')
```
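As a quick sanity check on the shapes, here is a minimal standalone sketch (TF 1.x style; the tensor sizes and the variable name `example_DW` are made up for illustration, not part of the official code):

```python
import numpy as np
import tensorflow as tf

# A fake batch of four 32x32 RGB images: [batch, height, width, channels].
x = tf.placeholder(tf.float32, [4, 32, 32, 3])

n = 3 * 3 * 16  # filter_size * filter_size * out_filters
kernel = tf.get_variable(
    'example_DW', [3, 3, 3, 16], tf.float32,
    initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / n)))

# SAME padding with stride 1 keeps the 32x32 spatial size; channels go 3 -> 16.
y = tf.nn.conv2d(x, kernel, [1, 1, 1, 1], padding='SAME')
print(y.get_shape())  # (4, 32, 32, 16)
```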

The ReLU layer is straightforward and uses a plain ReLU. If you want a leaky ReLU instead, use the commented-out line in the code below.

```python
def _relu(self, x, leakiness=0.0):
  """Relu, with optional leaky support."""
  # return tf.select(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')
  return tf.nn.relu(x)
```
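For reference, `tf.select` was renamed `tf.where` in later TF 1.x releases, so a leaky ReLU variant would look roughly like this (a sketch under that assumption, not the official code):

```python
import tensorflow as tf

def _leaky_relu(x, leakiness=0.1):
  """Leaky ReLU: x where x > 0, otherwise leakiness * x."""
  return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')
```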

The fully connected layer is also simple. The output of the previous layer is first reshaped to [batch_size, -1], i.e. flattened into a 2-D tensor. The weight matrix then has shape [x.get_shape()[1], out_dim] and is initialized with a uniform unit-scaling initializer.

```python
def _fully_connected(self, x, out_dim, name=''):
  """FullyConnected layer for final output."""
  x = tf.reshape(x, [self.hps.batch_size, -1])
  w = tf.get_variable(
      name + 'DW', [x.get_shape()[1], out_dim],
      initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
  b = tf.get_variable(name + 'biases', [out_dim],
                      initializer=tf.constant_initializer())
  return tf.nn.xw_plus_b(x, w, b)
```
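As a concrete shape check (a hypothetical example, not part of the official code): if the previous layer produces [batch_size, 8, 8, 64] feature maps, the reshape flattens them to [batch_size, 4096] and the weight matrix becomes [4096, out_dim].

```python
import tensorflow as tf

batch_size, out_dim = 4, 10
x = tf.placeholder(tf.float32, [batch_size, 8, 8, 64])  # hypothetical feature maps
x = tf.reshape(x, [batch_size, -1])                     # -> (4, 4096)
w = tf.get_variable('fc_DW', [x.get_shape()[1], out_dim],
                    initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
b = tf.get_variable('fc_biases', [out_dim], initializer=tf.constant_initializer())
logits = tf.nn.xw_plus_b(x, w, b)
print(logits.get_shape())  # (4, 10)
```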

Pooling uses average pooling via tf.nn.avg_pool directly. The similar-looking function below is not really a pooling layer: it computes the mean of each feature map by applying tf.reduce_mean over the second and third dimensions of x, i.e. over height and width (recall that x has shape [batch, in_height, in_width, in_channels]).

```python
def _global_avg_pool(self, x):
  assert x.get_shape().ndims == 4
  # Not a pooling layer per se: it averages each feature map over its
  # spatial dimensions (height and width).
  return tf.reduce_mean(x, [1, 2])
```
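A tiny shape illustration (the tensor sizes here are made up):

```python
import tensorflow as tf

x = tf.placeholder(tf.float32, [4, 8, 8, 64])  # [batch, height, width, channels]
pooled = tf.reduce_mean(x, [1, 2])             # average over height and width
print(pooled.get_shape())                      # (4, 64): one mean per feature map
```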

Next come the two key building blocks: the plain residual unit and the bottleneck residual unit.

The residual unit (two conv layers per unit) comes in two variants, selected by the `activate_before_residual` parameter: either the initial BN + ReLU is applied before the shortcut branches off (so both branches share it), or it is applied only on the residual branch.

```python
def _residual(self, x, in_filter, out_filter, stride,
              activate_before_residual=False):
  """Residual unit with 2 sub layers."""
  if activate_before_residual:
    with tf.variable_scope('shared_activation'):
      x = self._batch_norm('init_bn', x)
      x = self._relu(x, self.hps.relu_leakiness)
      orig_x = x
  else:
    with tf.variable_scope('residual_only_activation'):
      orig_x = x
      x = self._batch_norm('init_bn', x)
      x = self._relu(x, self.hps.relu_leakiness)

  with tf.variable_scope('sub1'):
    x = self._conv('conv1', x, 3, in_filter, out_filter, stride)

  with tf.variable_scope('sub2'):
    x = self._batch_norm('bn2', x)
    x = self._relu(x, self.hps.relu_leakiness)
    x = self._conv('conv2', x, 3, out_filter, out_filter, [1, 1, 1, 1])

  with tf.variable_scope('sub_add'):
    if in_filter != out_filter:
      orig_x = tf.nn.avg_pool(orig_x, stride, stride, 'VALID')
      orig_x = tf.pad(
          orig_x, [[0, 0], [0, 0], [0, 0],
                   [(out_filter - in_filter) // 2,
                    (out_filter - in_filter) // 2]])
    x += orig_x

  tf.logging.info('image after unit %s', x.get_shape())
  return x
```
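To make the `sub_add` branch concrete: when the number of channels grows, the shortcut is average-pooled down to the new spatial size and then zero-padded along the channel dimension. For example, going from 16 to 32 filters with stride 2 (a standalone hypothetical sketch, with made-up tensor sizes):

```python
import tensorflow as tf

orig_x = tf.placeholder(tf.float32, [4, 32, 32, 16])  # shortcut input
stride = [1, 2, 2, 1]
in_filter, out_filter = 16, 32

# Spatial downsampling to match the residual branch: 32x32 -> 16x16.
orig_x = tf.nn.avg_pool(orig_x, stride, stride, 'VALID')
# Zero-pad (32 - 16) / 2 = 8 channels on each side: 16 -> 32 channels.
orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0],
                         [(out_filter - in_filter) // 2,
                          (out_filter - in_filter) // 2]])
print(orig_x.get_shape())  # (4, 16, 16, 32)
```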

The bottleneck residual unit (three conv layers per unit) likewise comes in two variants, controlled by `activate_before_residual`.

```python
def _bottleneck_residual(self, x, in_filter, out_filter, stride,
                         activate_before_residual=False):
  """Bottleneck residual unit with 3 sub layers."""
  if activate_before_residual:
    with tf.variable_scope('common_bn_relu'):
      x = self._batch_norm('init_bn', x)
      x = self._relu(x, self.hps.relu_leakiness)
      orig_x = x
  else:
    with tf.variable_scope('residual_bn_relu'):
      orig_x = x
      x = self._batch_norm('init_bn', x)
      x = self._relu(x, self.hps.relu_leakiness)

  with tf.variable_scope('sub1'):
    x = self._conv('conv1', x, 1, in_filter, out_filter / 4, stride)

  with tf.variable_scope('sub2'):
    x = self._batch_norm('bn2', x)
    x = self._relu(x, self.hps.relu_leakiness)
    x = self._conv('conv2', x, 3, out_filter / 4, out_filter / 4, [1, 1, 1, 1])

  with tf.variable_scope('sub3'):
    x = self._batch_norm('bn3', x)
    x = self._relu(x, self.hps.relu_leakiness)
    x = self._conv('conv3', x, 1, out_filter / 4, out_filter, [1, 1, 1, 1])

  with tf.variable_scope('sub_add'):
    if in_filter != out_filter:
      orig_x = self._conv('project', orig_x, 1, in_filter, out_filter, stride)
    x += orig_x

  tf.logging.info('image after unit %s', x.get_shape())
  return x
```
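One thing to watch out for if you run this under Python 3: `out_filter / 4` yields a float there, while the conv expects an integer channel count, so you would write `out_filter // 4` instead (the official code targets Python 2, where `/` on ints is already integer division). A minimal illustration:

```python
out_filter = 256
print(out_filter / 4)    # Python 3: 64.0 (float); Python 2: 64 (int)
print(out_filter // 4)   # 64 in both, safe to use as a channel count
```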

2. Network architecture

[Figure: ResNet network architecture diagram]

I assume most readers already know how the architecture works, so I won't go into detail here. As the diagram shows, ResNet is built by stacking residual units one after another.

Finally, the core of ResNet: the forward pass that defines the model. I have rewritten the original comments and added my own understanding.

```python
def _build_model(self):
  """Build the core model within the graph."""
  with tf.variable_scope('init'):
    x = self._images
    # 3x3 kernel, 3 input channels, 16 output feature maps.
    x = self._conv('init_conv', x, 3, 3, 16, self._stride_arr(1))

  # Stride of the first unit in each of the three blocks;
  # the remaining n-1 units in each block use stride 1.
  strides = [1, 2, 2]
  # Whether the first unit of each block activates before the residual split.
  activate_before_residual = [True, False, False]
  if self.hps.use_bottleneck:
    res_func = self._bottleneck_residual
    filters = [16, 64, 128, 256]  # input channels, then per-block widths
  else:
    res_func = self._residual
    filters = [16, 16, 32, 64]  # input channels, then per-block widths

  with tf.variable_scope('unit_1_0'):
    x = res_func(x, filters[0], filters[1], self._stride_arr(strides[0]),
                 activate_before_residual[0])
  for i in range(1, self.hps.num_residual_units):
    with tf.variable_scope('unit_1_%d' % i):
      x = res_func(x, filters[1], filters[1], self._stride_arr(1), False)

  with tf.variable_scope('unit_2_0'):
    x = res_func(x, filters[1], filters[2], self._stride_arr(strides[1]),
                 activate_before_residual[1])
  for i in range(1, self.hps.num_residual_units):
    with tf.variable_scope('unit_2_%d' % i):
      x = res_func(x, filters[2], filters[2], self._stride_arr(1), False)

  with tf.variable_scope('unit_3_0'):
    x = res_func(x, filters[2], filters[3], self._stride_arr(strides[2]),
                 activate_before_residual[2])
  for i in range(1, self.hps.num_residual_units):
    with tf.variable_scope('unit_3_%d' % i):
      x = res_func(x, filters[3], filters[3], self._stride_arr(1), False)

  with tf.variable_scope('unit_last'):
    x = self._batch_norm('final_bn', x)
    x = self._relu(x, self.hps.relu_leakiness)
    x = self._global_avg_pool(x)

  with tf.variable_scope('logit'):
    logits = self._fully_connected(x, self.hps.num_classes)
    self.predictions = tf.nn.softmax(logits)

  # Compute the loss: cross entropy plus weight decay.
  with tf.variable_scope('costs'):
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=self.labels)
    self.cost = tf.reduce_mean(xent, name='xent')
    self.cost += self._decay()
    tf.summary.scalar('cost', self.cost)

  # Compute the accuracy.
  with tf.variable_scope('acc'):
    correct_prediction = tf.equal(
        tf.cast(tf.argmax(logits, 1), tf.int32), self.labels)
    self.acc = tf.reduce_mean(
        tf.cast(correct_prediction, tf.float32), name='accu')
    tf.summary.scalar('accuracy', self.acc)
```
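To relate the code to the usual CIFAR-style ResNet depths: with `num_residual_units = n` and the non-bottleneck unit, the network has one initial conv, three blocks of n units with two convs each, and a final fully connected layer, i.e. 6n + 2 weight layers. A quick check (not part of the official code):

```python
def resnet_depth(num_residual_units, use_bottleneck=False):
    """Count weight layers: init conv + residual convs + final FC."""
    convs_per_unit = 3 if use_bottleneck else 2
    return 1 + 3 * num_residual_units * convs_per_unit + 1

print(resnet_depth(5))   # 32  -> ResNet-32
print(resnet_depth(18))  # 110 -> ResNet-110
```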

Below is the function that builds the training op. `self.hps.lrn_rate` is the learning rate and `self.hps.optimizer` selects the optimization method: 'sgd' means plain SGD and 'mom' means Momentum.

```python
def _build_train_op(self):
  """Build training specific ops for the graph."""
  self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
  tf.summary.scalar('learning_rate', self.lrn_rate)

  trainable_variables = tf.trainable_variables()
  grads = tf.gradients(self.cost, trainable_variables)

  if self.hps.optimizer == 'sgd':
    optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)
  elif self.hps.optimizer == 'mom':
    optimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9)

  apply_op = optimizer.apply_gradients(
      zip(grads, trainable_variables),
      global_step=self.global_step, name='train_step')

  train_ops = [apply_op] + self._extra_train_ops
  self.train_op = tf.group(*train_ops)
```
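A rough sketch of how these ops would typically be driven in a session. Here `model` stands for an already-constructed instance of this class (after `_build_model()` and `_build_train_op()` have run), and `num_steps` is a made-up value; the actual training script in the repository handles input feeding and checkpointing differently.

```python
import tensorflow as tf

num_steps = 10000  # hypothetical number of training steps

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for step in range(num_steps):
    # train_op applies the gradients; cost and acc were defined in _build_model.
    _, loss, acc = sess.run([model.train_op, model.cost, model.acc])
    if step % 100 == 0:
      print('step %d: loss %.4f, accuracy %.4f' % (step, loss, acc))
```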

If you find mistakes, please point them out, though I suspect not many people will read this; consider it my personal notes.

The source code for this post is here; if you are interested, please star the project. Later on I plan to write a dedicated article explaining AlphaGo Zero.

