TF Boys (TensorFlow Boys) Training Diary (5): The CIFAR10 Model and TensorFlow's Four Cross-Entropy Functions

With the data and the network structure in hand, we can now write the CIFAR10 code.

First, handle the input. Create cifar10_input.py under /home/your_name/TensorFlow/cifar10/ and enter the following code:

from __future__ import absolute_import   # absolute imports
from __future__ import division          # true division: / is exact, // is floor division
from __future__ import print_function    # print as a function

import os
import tensorflow as tf


# A cifar10_data class: takes a filename queue and produces labels and images
class cifar10_data(object):

    def __init__(self, filename_queue):
        # initialization parameters, based on the file format described in the previous post
        self.height = 32
        self.width = 32
        self.depth = 3
        # the label takes 1 byte
        self.label_bytes = 1
        # the image takes 32*32*3 = 3072 bytes
        self.image_bytes = self.height * self.width * self.depth
        # each fixed-length record is 3072 + 1 = 3073 bytes
        self.record_bytes = self.label_bytes + self.image_bytes
        self.label, self.image = self.read_cifar10(filename_queue)

    def read_cifar10(self, filename_queue):
        # read fixed-length records
        reader = tf.FixedLengthRecordReader(record_bytes=self.record_bytes)
        key, value = reader.read(filename_queue)
        record_bytes = tf.decode_raw(value, tf.uint8)
        # tf.slice(record_bytes, start, length)
        label = tf.cast(tf.slice(record_bytes, [0], [self.label_bytes]), tf.int32)
        # after the label, slice self.image_bytes = 3072 bytes as the image
        image_raw = tf.slice(record_bytes, [self.label_bytes], [self.image_bytes])
        # reshape the image to 3*32*32
        image_raw = tf.reshape(image_raw, [self.depth, self.height, self.width])
        # transpose the image to 32*32*3
        image = tf.transpose(image_raw, (1, 2, 0))
        image = tf.cast(image, tf.float32)
        return label, image


def inputs(data_dir, batch_size, train=True, name='input'):

    # tf.name_scope is recommended: it makes the graph visualization much cleaner
    with tf.name_scope(name):
        if train:
            # names of the files to read
            filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % ii)
                         for ii in range(1, 6)]
            # raise an error if a file does not exist
            for f in filenames:
                if not tf.gfile.Exists(f):
                    raise ValueError('Failed to find file: ' + f)
            # build the filename queue from the filenames
            filename_queue = tf.train.string_input_producer(filenames)
            # feed it into the cifar10_data class
            read_input = cifar10_data(filename_queue)
            images = read_input.image
            # per-image whitening; the network is simple, so accuracy is very low without this line
            images = tf.image.per_image_whitening(images)
            labels = read_input.label
            # Build the batch queue with 16 threads and capacity 20192.
            # min_after_dequeue is the minimum number of elements left in the queue
            # after a dequeue, so the queue always holds at least that many elements;
            # a suggested setting is capacity = min_after_dequeue + batch_size * 3
            num_preprocess_threads = 16
            image, label = tf.train.shuffle_batch(
                [images, labels], batch_size=batch_size,
                num_threads=num_preprocess_threads,
                min_after_dequeue=20000, capacity=20192)

            return image, tf.reshape(label, [batch_size])

        else:
            filenames = [os.path.join(data_dir, 'test_batch.bin')]
            for f in filenames:
                if not tf.gfile.Exists(f):
                    raise ValueError('Failed to find file: ' + f)

            filename_queue = tf.train.string_input_producer(filenames)
            read_input = cifar10_data(filename_queue)
            images = read_input.image
            images = tf.image.per_image_whitening(images)
            labels = read_input.label
            num_preprocess_threads = 16
            image, label = tf.train.shuffle_batch(
                [images, labels], batch_size=batch_size,
                num_threads=num_preprocess_threads,
                min_after_dequeue=20000, capacity=20192)

            return image, tf.reshape(label, [batch_size])
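As a quick way to sanity-check this pipeline, the sketch below (my own addition, not part of the original post) pulls one shuffled batch and prints its shapes. The data_dir path is a hypothetical location for the .bin files, and the queue-runner boilerplate follows the TF 0.x API used throughout this series:

# Minimal sketch (assumption: .bin files live under the hypothetical data_dir below)
import tensorflow as tf
import cifar10_input

data_dir = '/home/your_name/TensorFlow/cifar10/data'  # hypothetical path to the .bin files

with tf.Graph().as_default():
    images, labels = cifar10_input.inputs(data_dir, batch_size=64, train=True)
    with tf.Session() as sess:
        # start the filename and shuffle-batch queue threads
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        img_batch, lbl_batch = sess.run([images, labels])
        print(img_batch.shape)   # expected: (64, 32, 32, 3)
        print(lbl_batch.shape)   # expected: (64,)
        coord.request_stop()
        coord.join(threads)

Note that with min_after_dequeue = 20000, the first batch only comes out after the queue has been filled with 20000 examples, so the first sess.run takes a moment.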

Next, create cifar10.py under /home/your_name/TensorFlow/cifar10/ and enter the following code:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import os.path
import time
from datetime import datetime

import numpy as np
from six.moves import xrange
import tensorflow as tf

import cifar10_input

BATCH_SIZE = 64
LEARNING_RATE = 0.1
MAX_STEP = 50000
TRAIN = True


# define a variable on the CPU with get_variable, initialized to a constant
def variable_on_cpu(name, shape, initializer=tf.constant_initializer(0.1)):
    with tf.device('/cpu:0'):
        dtype = tf.float32
        var = tf.get_variable(name, shape, initializer=initializer,
                              dtype=dtype)
    return var


# define a variable on the CPU with get_variable, truncated-normal initialized
def variables(name, shape, stddev):
    dtype = tf.float32
    var = variable_on_cpu(name, shape,
                          tf.truncated_normal_initializer(stddev=stddev,
                                                          dtype=dtype))
    return var


# define the network structure
def inference(images):
    with tf.variable_scope('conv1') as scope:
        # 5*5 convolution kernels, 64 feature maps
        weights = variables('weights', [5, 5, 3, 64], 5e-2)
        # convolution with stride 1*1
        conv = tf.nn.conv2d(images, weights, [1, 1, 1, 1], padding='SAME')
        biases = variable_on_cpu('biases', [64])
        # add the bias
        bias = tf.nn.bias_add(conv, biases)
        # pass through the ReLU activation function
        conv1 = tf.nn.relu(bias, name=scope.name)
        # histogram summary of conv1
        tf.histogram_summary(scope.name + '/activations', conv1)
    with tf.variable_scope('pooling1_lrn') as scope:
        # max pooling, 3*3 window, 2*2 stride
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                               padding='SAME', name='pool1')
        # local response normalization
        norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0,
                          beta=0.75, name='norm1')

    with tf.variable_scope('conv2') as scope:
        weights = variables('weights', [5, 5, 64, 64], 5e-2)
        conv = tf.nn.conv2d(norm1, weights, [1, 1, 1, 1], padding='SAME')
        biases = variable_on_cpu('biases', [64])
        bias = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(bias, name=scope.name)
        tf.histogram_summary(scope.name + '/activations', conv2)
    with tf.variable_scope('pooling2_lrn') as scope:
        norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0,
                          beta=0.75, name='norm2')
        pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                               padding='SAME', name='pool2')

    with tf.variable_scope('local3') as scope:
        # first fully connected layer
        reshape = tf.reshape(pool2, [BATCH_SIZE, -1])
        dim = reshape.get_shape()[1].value
        weights = variables('weights', shape=[dim, 384], stddev=0.004)
        biases = variable_on_cpu('biases', [384])
        # ReLU activation function
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
                            name=scope.name)
        # histogram summary of local3
        tf.histogram_summary(scope.name + '/activations', local3)

    with tf.variable_scope('local4') as scope:
        # second fully connected layer
        weights = variables('weights', shape=[384, 192], stddev=0.004)
        biases = variable_on_cpu('biases', [192])
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases,
                            name=scope.name)
        tf.histogram_summary(scope.name + '/activations', local4)

    with tf.variable_scope('softmax_linear') as scope:
        # the "softmax" layer: not a true softmax here,
        # the real softmax lives in the loss layer
        weights = variables('weights', [192, 10], stddev=1 / 192.0)
        biases = variable_on_cpu('biases', [10])
        softmax_linear = tf.add(tf.matmul(local4, weights), biases,
                                name=scope.name)
        tf.histogram_summary(scope.name + '/activations', softmax_linear)

    return softmax_linear


# cross-entropy loss layer
def losses(logits, labels):
    with tf.variable_scope('loss') as scope:
        labels = tf.cast(labels, tf.int64)
        # cross-entropy loss; why this particular function is used is explained below
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits, labels, name='cross_entropy_per_example')
        loss = tf.reduce_mean(cross_entropy, name='loss')
        tf.scalar_summary(scope.name + '/x_entropy', loss)

    return loss
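The post stops at the loss layer; the actual training script presumably comes in a later part of the series. Purely as a rough sketch of how these pieces could be wired together, the snippet below (my own illustration, meant to be appended to cifar10.py so that inference, losses, BATCH_SIZE, LEARNING_RATE and cifar10_input are in scope) uses a plain gradient-descent optimizer and the same hypothetical data_dir as before:

# Rough sketch (assumptions: appended to cifar10.py, hypothetical data_dir,
# plain SGD chosen only for illustration)
def train_sketch():
    data_dir = '/home/your_name/TensorFlow/cifar10/data'  # hypothetical

    with tf.Graph().as_default():
        images, labels = cifar10_input.inputs(data_dir, BATCH_SIZE, train=True)
        logits = inference(images)
        loss = losses(logits, labels)
        train_op = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

        with tf.Session() as sess:
            sess.run(tf.initialize_all_variables())   # TF 0.x-style initializer
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            for step in range(100):                   # a few steps just to watch the loss move
                _, loss_value = sess.run([train_op, loss])
                if step % 10 == 0:
                    print(step, loss_value)
            coord.request_stop()
            coord.join(threads)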

Now let's look at why we use a function with a name as long as tf.nn.sparse_softmax_cross_entropy_with_logits. The official documentation provides four cross-entropy loss functions:

1. tf.nn.sigmoid_cross_entropy_with_logits(logits, targets, name=None)

2. tf.nn.softmax_cross_entropy_with_logits(logits, labels, dim=-1, name=None)

3. tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels, name=None)

4. tf.nn.weighted_cross_entropy_with_logits(logits, targets, pos_weight, name=None)

Let's look at each of them in turn:

1) The first function is the classic sigmoid cross-entropy. Writing x = logits and z = targets, its cross-entropy loss can be written as:

z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))

Note that sigmoid cross-entropy is meant for binary classification, and logits and targets must have the same shape.
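To see the formula in action, here is a tiny NumPy check of the expression above (my own illustration, not from the original post; the values of x and z are made up):

# NumPy illustration of the sigmoid cross-entropy formula (made-up values)
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

x = np.array([2.0, -1.0, 0.5])   # logits
z = np.array([1.0,  0.0, 1.0])   # targets: one independent 0/1 label per logit

loss = z * -np.log(sigmoid(x)) + (1 - z) * -np.log(1 - sigmoid(x))
print(loss)   # elementwise loss, one value per logit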

2) The second function is the softmax cross-entropy, used for multi-class classification where the classes are mutually exclusive: an element cannot belong to two classes at once. It likewise requires logits and labels to have the same shape.

For example, in the losses code above the targets fall into 10 classes: logits has shape 64*10, while targets (that is, labels) has shape [64], so this function cannot be used directly. To use it, labels would have to be converted into a 64*10 one-hot encoding. Suppose the 64 values of labels are [1,5,2,3,0,4,9,8,7,5,6,4,5,8,...]; after one-hot encoding, the first row becomes [0,1,0,0,0,0,0,0,0,0], the second row [0,0,0,0,0,1,0,0,0,0], and the third row [0,0,1,0,0,0,0,0,0,0]. In other words, in each row the element at index label is set to 1 and all others to 0. In code, this can be written as:

targets = np.zeros([64, 10], dtype=np.float)
for index, value in enumerate(labels):
    targets[index, value] = 1.0
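As a small aside of mine (not in the original post), NumPy indexing builds the same one-hot matrix in one line:

# one-line equivalent of the loop above
targets = np.eye(10, dtype=np.float32)[labels]   # shape (64, 10)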

3) The third function is the one we actually use. It differs from the second in that it does not require logits and labels to have the same shape, only the same size along dimension 0. If logits has shape 64*10 and targets (that is, labels) has shape [64], dimension 0 matches, so the function can be used directly with no one-hot encoding needed. This is also clear from the graph we drew in the previous post: the loss layer takes one 64*10 input and one 64-dimensional input. In addition, this function computes the softmax itself, which is why the last layer of inference does not compute a true softmax.
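To make the relationship between the two softmax variants concrete, here is a small NumPy sketch of my own (not from the post) that computes the same loss once from integer labels (the sparse form) and once from one-hot labels (the dense form), and checks that they agree:

# NumPy illustration: sparse vs dense softmax cross-entropy give the same values
import numpy as np

logits = np.random.randn(64, 10)             # like the 64*10 output of inference
labels = np.random.randint(0, 10, size=64)   # like the [64] integer labels

# numerically stable softmax
shifted = logits - logits.max(axis=1, keepdims=True)
softmax = np.exp(shifted) / np.exp(shifted).sum(axis=1, keepdims=True)

# sparse form: pick the probability of the true class directly
sparse_loss = -np.log(softmax[np.arange(64), labels])

# dense form: same thing via a one-hot matrix
onehot = np.eye(10)[labels]
dense_loss = -(onehot * np.log(softmax)).sum(axis=1)

print(np.allclose(sparse_loss, dense_loss))  # True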

4) The fourth function is almost the same as the first, except that it adds a weight pos_weight. Writing x = logits, z = targets, and q = pos_weight, its cross-entropy loss is:

q * z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
= q * z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
= q * z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
= q * z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
= (1 - z) * x + (qz + 1 - z) * log(1 + exp(-x))
= (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))
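As a quick sanity check of my own (not in the post), the final simplified line can be compared numerically against the weighted formula it was derived from:

# Sanity check: the simplified expression equals the weighted sigmoid cross-entropy
import numpy as np

x = np.array([2.0, -1.0, 0.5])   # logits (made-up values)
z = np.array([1.0,  0.0, 1.0])   # targets
q = 3.0                          # pos_weight

sig = 1.0 / (1.0 + np.exp(-x))
direct = q * z * -np.log(sig) + (1 - z) * -np.log(1 - sig)
simplified = (1 - z) * x + (1 + (q - 1) * z) * np.log(1 + np.exp(-x))

print(np.allclose(direct, simplified))   # True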

References:

1. tensorflow.org/api_docs

2. tensorflow/models

