Faster RCNN by Xinlei Chen
1.Shell script
@experiments/scripts/train_faster_rcnn.sh
CUDA_VISIBLE_DEVICES=${GPU_ID} time python ./tools/trainval_net.py \
  --weight data/imagenet_weights/${NET}.ckpt \
  --imdb ${TRAIN_IMDB} \
  --imdbval ${TEST_IMDB} \
  --iters ${ITERS} \
  --cfg experiments/cfgs/${NET}.yml \
  --tag ${EXTRA_ARGS_SLUG} \
  --net ${NET} \
  --set ANCHOR_SCALES ${ANCHORS} ANCHOR_RATIOS ${RATIOS} TRAIN.STEPSIZE ${STEPSIZE} ${EXTRA_ARGS}
2.Data preparation
2.1 Configuration
Configuration is handled mainly by cfg_file (@experiments/cfgs/vgg16.yml) and set_cfgs. cfg is imported first; this dictionary lives in model/config.py, whose top-level code initializes cfg with the default values. The two arguments below are then used to override the relevant entries.
@tools/trainval_net.py
from model.config import cfg, cfg_from_file, cfg_from_list

if args.cfg_file is not None:
    cfg_from_file(args.cfg_file)
if args.set_cfgs is not None:
    cfg_from_list(args.set_cfgs)
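To make the override mechanism concrete, here is a small sketch. The keys and values below mirror the --set arguments of the shell script for pascal_voc and are purely illustrative, not a dump of the actual defaults; cfg_from_file merges a YAML file into cfg, while cfg_from_list consumes alternating KEY, VALUE strings.

from model.config import cfg, cfg_from_list

# argparse collects everything after --set into args.set_cfgs, a flat list of
# alternating KEY, VALUE strings; cfg_from_list literal-evals each value and
# writes it into the matching (possibly nested, dot-separated) cfg entry.
cfg_from_list(['ANCHOR_SCALES', '[8,16,32]',
               'ANCHOR_RATIOS', '[0.5,1,2]'])

print(cfg.ANCHOR_SCALES)   # [8, 16, 32]
print(cfg.ANCHOR_RATIOS)   # [0.5, 1, 2]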
2.2 Training data
This is implemented by the code below. The return value imdb is an object of the pascal_voc class (which inherits from the imdb class).
imdb, roidb = combined_roidb(args.imdb_name)
get_imdb is a function in datasets/factory.py; it fetches the corresponding dataset according to imdb_name.
imdb = get_imdb(imdb_name)
Initialization of the pascal_voc class:
class pascal_voc(imdb):
  def __init__(self, image_set, year, devkit_path=None):
    imdb.__init__(self, 'voc_' + year + '_' + image_set)
    self._year = year
    self._image_set = image_set
    self._devkit_path = self._get_default_path() if devkit_path is None \
      else devkit_path
    self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
    self._classes = ('__background__',  # always index 0
                     'aeroplane', 'bicycle', 'bird', 'boat',
                     'bottle', 'bus', 'car', 'cat', 'chair',
                     'cow', 'diningtable', 'dog', 'horse',
                     'motorbike', 'person', 'pottedplant',
                     'sheep', 'sofa', 'train', 'tvmonitor')
    # map class names to class indices
    self._class_to_ind = dict(list(zip(self.classes, list(range(self.num_classes)))))
    self._image_ext = '.jpg'
    self._image_index = self._load_image_set_index()
    # Default to roidb handler
    self._roidb_handler = self.gt_roidb
    self._salt = str(uuid.uuid4())
    self._comp_id = 'comp4'

    # PASCAL specific config options
    self.config = {'cleanup': True,
                   'use_salt': True,
                   'use_diff': False,
                   'matlab_eval': False,
                   'rpn_file': None}

    assert os.path.exists(self._devkit_path), \
      'VOCdevkit path does not exist: {}'.format(self._devkit_path)
    assert os.path.exists(self._data_path), \
      'Path does not exist: {}'.format(self._data_path)
Some methods of the pascal_voc class:
  # return the ground-truth rois
  def gt_roidb(self):
    """
    Return the database of ground-truth regions of interest.

    This function loads/saves from/to a cache file to speed up future calls.
    """
    cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
    if os.path.exists(cache_file):
      with open(cache_file, 'rb') as fid:
        try:
          roidb = pickle.load(fid)
        except:
          roidb = pickle.load(fid, encoding='bytes')
      print('{} gt roidb loaded from {}'.format(self.name, cache_file))
      return roidb

    gt_roidb = [self._load_pascal_annotation(index)
                for index in self.image_index]
    with open(cache_file, 'wb') as fid:
      pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL)
    print('wrote gt roidb to {}'.format(cache_file))

    return gt_roidb

  def _load_image_set_index(self):
    """
    Load the indexes listed in this dataset's image set file.
    For example, trainval has 5011 images; test has 4952 images.
    """
    # Example path to image set file:
    # self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt
    image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main',
                                  self._image_set + '.txt')
    assert os.path.exists(image_set_file), \
      'Path does not exist: {}'.format(image_set_file)
    with open(image_set_file) as f:
      image_index = [x.strip() for x in f.readlines()]
    return image_index

  def _load_pascal_annotation(self, index):
    """
    Load image and bounding boxes info from XML file in the PASCAL VOC
    format.
    """
    filename = os.path.join(self._data_path, 'Annotations', index + '.xml')
    tree = ET.parse(filename)        # parse the xml file
    objs = tree.findall('object')    # find the object tags in the xml file
    if not self.config['use_diff']:
      # Exclude the samples labeled as difficult
      non_diff_objs = [
        obj for obj in objs if int(obj.find('difficult').text) == 0]
      # if len(non_diff_objs) != len(objs):
      #     print 'Removed {} difficult objects'.format(
      #         len(objs) - len(non_diff_objs))
      objs = non_diff_objs
    num_objs = len(objs)

    boxes = np.zeros((num_objs, 4), dtype=np.uint16)
    gt_classes = np.zeros((num_objs), dtype=np.int32)
    overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
    # "Seg" area for pascal is just the box area
    seg_areas = np.zeros((num_objs), dtype=np.float32)

    # Load object bounding boxes into a data frame.
    for ix, obj in enumerate(objs):
      bbox = obj.find('bndbox')
      # Make pixel indexes 0-based
      x1 = float(bbox.find('xmin').text) - 1
      y1 = float(bbox.find('ymin').text) - 1
      x2 = float(bbox.find('xmax').text) - 1
      y2 = float(bbox.find('ymax').text) - 1
      # look up the class name and convert it to a class index
      cls = self._class_to_ind[obj.find('name').text.lower().strip()]
      boxes[ix, :] = [x1, y1, x2, y2]
      gt_classes[ix] = cls
      # the IoU of a gt object with its own class is 1, all other classes are 0
      overlaps[ix, cls] = 1.0
      seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)

    # stored sparse; converted back to dense where it is used later
    overlaps = scipy.sparse.csr_matrix(overlaps)

    return {'boxes': boxes,
            'gt_classes': gt_classes,
            'gt_overlaps': overlaps,
            'flipped': False,
            'seg_areas': seg_areas}
A roidb is a list of dictionaries (one dictionary corresponds to one image), each with the following keys:
- boxes: (num_objs, 4) box coordinates (x1, y1, x2, y2)
- gt_overlaps: (num_objs, 21) IoU with each class (1.0 in the ground-truth class column, stored sparse)
- gt_classes: (num_objs,) class index of each box
- flipped: bool, True indicates the image has been flipped
- seg_areas: (num_objs,) box areas
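To make the structure concrete, here is a hypothetical roidb entry for an image containing one dog and one person; all values are invented purely for illustration.

import numpy as np
import scipy.sparse

# Hypothetical roidb entry: one dog (class 12) and one person (class 15).
entry = {
    'boxes': np.array([[ 48,  30, 310, 250],     # dog
                       [200,  60, 340, 330]],    # person
                      dtype=np.uint16),
    'gt_classes': np.array([12, 15], dtype=np.int32),
    # one row per object, one column per class (21 for PASCAL VOC);
    # each ground-truth box overlaps its own class with IoU 1.0
    'gt_overlaps': scipy.sparse.csr_matrix(
        np.eye(21, dtype=np.float32)[[12, 15]]),
    'flipped': False,
    'seg_areas': np.array([(310 - 48 + 1) * (250 - 30 + 1),
                           (340 - 200 + 1) * (330 - 60 + 1)],
                          dtype=np.float32),
}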
Initialization of the imdb class:
class imdb(object):
  """Image database."""

  def __init__(self, name, classes=None):
    self._name = name
    self._num_classes = 0
    if not classes:
      self._classes = []
    else:
      self._classes = classes
    self._image_index = []
    self._obj_proposer = 'gt'
    self._roidb = None
    self._roidb_handler = self.default_roidb
    # Use this dict for storing dataset specific config options
    self.config = {}
Some methods of the imdb class:
  def append_flipped_images(self):
    num_images = self.num_images
    widths = self._get_widths()
    for i in range(num_images):
      boxes = self.roidb[i]['boxes'].copy()
      oldx1 = boxes[:, 0].copy()
      oldx2 = boxes[:, 2].copy()
      # for a horizontal flip only x1 and x2 need to change
      boxes[:, 0] = widths[i] - oldx2 - 1
      boxes[:, 2] = widths[i] - oldx1 - 1
      assert (boxes[:, 2] >= boxes[:, 0]).all()
      entry = {'boxes': boxes,
               'gt_overlaps': self.roidb[i]['gt_overlaps'],
               'gt_classes': self.roidb[i]['gt_classes'],
               'flipped': True}
      self.roidb.append(entry)
    self._image_index = self._image_index * 2
roidb = get_training_roidb(imdb)
@train_val.py 324
def get_training_roidb(imdb):
  """Returns a roidb (Region of Interest database) for use in training."""
  if cfg.TRAIN.USE_FLIPPED:
    print('Appending horizontally-flipped training examples...')
    imdb.append_flipped_images()
    print('done')

  print('Preparing training data...')
  # calls the function shown next
  rdl_roidb.prepare_roidb(imdb)
  print('done')

  return imdb.roidb
Add some quantities useful for training:
- image: path to the image file
- width: width of the image
- height: height of the image
- max_classes: index of the max value in each row of gt_overlaps (the class with the highest overlap)
- max_overlaps: max value in each row of gt_overlaps
@roi_data_layer/roidb.py
def prepare_roidb(imdb):
  """Enrich the imdb's roidb by adding some derived quantities that
  are useful for training. This function precomputes the maximum
  overlap, taken over ground-truth boxes, between each ROI and
  each ground-truth box. The class with maximum overlap is also
  recorded.
  """
  roidb = imdb.roidb
  if not (imdb.name.startswith('coco')):
    sizes = [PIL.Image.open(imdb.image_path_at(i)).size
             for i in range(imdb.num_images)]
  for i in range(len(imdb.image_index)):
    roidb[i]['image'] = imdb.image_path_at(i)
    if not (imdb.name.startswith('coco')):
      roidb[i]['width'] = sizes[i][0]
      roidb[i]['height'] = sizes[i][1]
    # need gt_overlaps as a dense array for argmax
    gt_overlaps = roidb[i]['gt_overlaps'].toarray()
    # max overlap with gt over classes (columns)
    max_overlaps = gt_overlaps.max(axis=1)
    # gt class that had the max overlap
    max_classes = gt_overlaps.argmax(axis=1)
    roidb[i]['max_classes'] = max_classes
    roidb[i]['max_overlaps'] = max_overlaps
    # sanity checks
    # max overlap of 0 => class should be zero (background)
    zero_inds = np.where(max_overlaps == 0)[0]
    assert all(max_classes[zero_inds] == 0)
    # max overlap > 0 => class should not be zero (must be a fg class)
    nonzero_inds = np.where(max_overlaps > 0)[0]
    assert all(max_classes[nonzero_inds] != 0)
The validation set is generated in the same way, so it is not repeated here.
_, valroidb = combined_roidb(args.imdbval_name)
After imdb and roidb are generated, entries without any fg or bg RoIs are filtered out.
roidb = filter_roidb(roidb)
def filter_roidb(roidb):
  """Remove roidb entries that have no usable RoIs."""

  def is_valid(entry):
    # Valid images have:
    #   (1) At least one foreground RoI OR
    #   (2) At least one background RoI
    overlaps = entry['max_overlaps']
    # find boxes with sufficient overlap
    fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # image is only valid if such boxes exist
    valid = len(fg_inds) > 0 or len(bg_inds) > 0
    return valid

  num = len(roidb)
  filtered_roidb = [entry for entry in roidb if is_valid(entry)]
  num_after = len(filtered_roidb)
  print('Filtered {} roidb entries: {} -> {}'.format(num - num_after,
                                                     num, num_after))
  return filtered_roidb
3.Graph Construction
with tf.Session(config=tfconfig) as sess:
  # create the SolverWrapper object sw, which provides the train_model method
  sw = SolverWrapper(sess, network, imdb, roidb, valroidb, output_dir, tb_dir,
                     pretrained_model=pretrained_model)
  print('Solving...')
  sw.train_model(sess, max_iters)
  print('done solving')
3.1 General training graph
def train_model(self, sess, max_iters):
  # Build data layers for both training and validation set
  self.data_layer = RoIDataLayer(self.roidb, self.imdb.num_classes)
  self.data_layer_val = RoIDataLayer(self.valroidb, self.imdb.num_classes, random=True)

  # Construct the computation graph
  lr, train_op = self.construct_graph(sess)

  # Find previous snapshots if there is any to restore from
  lsf, nfiles, sfiles = self.find_previous()

  # Initialize the variables or restore them from the last snapshot
  if lsf == 0:
    rate, last_snapshot_iter, stepsizes, np_paths, ss_paths = self.initialize(sess)
  else:
    rate, last_snapshot_iter, stepsizes, np_paths, ss_paths = self.restore(
      sess, str(sfiles[-1]), str(nfiles[-1]))

  timer = Timer()
  iter = last_snapshot_iter + 1
  last_summary_time = time.time()
  # Make sure the lists are not empty
  stepsizes.append(max_iters)
  stepsizes.reverse()
  next_stepsize = stepsizes.pop()
  while iter < max_iters + 1:
    # Learning rate
    if iter == next_stepsize + 1:
      # Add snapshot here before reducing the learning rate
      self.snapshot(sess, iter)
      rate *= cfg.TRAIN.GAMMA
      sess.run(tf.assign(lr, rate))
      next_stepsize = stepsizes.pop()

    timer.tic()
    # Get training data, one batch at a time
    blobs = self.data_layer.forward()

    now = time.time()
    if iter == 1 or now - last_summary_time > cfg.TRAIN.SUMMARY_INTERVAL:
      # Compute the graph with summary
      rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, total_loss, summary = \
        self.net.train_step_with_summary(sess, blobs, train_op)
      self.writer.add_summary(summary, float(iter))
      # Also check the summary on the validation set
      blobs_val = self.data_layer_val.forward()
      summary_val = self.net.get_summary(sess, blobs_val)
      self.valwriter.add_summary(summary_val, float(iter))
      last_summary_time = now
    else:
      # Compute the graph without summary
      rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, total_loss = \
        self.net.train_step(sess, blobs, train_op)
    timer.toc()

    # Display training information
    if iter % (cfg.TRAIN.DISPLAY) == 0:
      print('iter: %d / %d, total loss: %.6f\n >>> rpn_loss_cls: %.6f\n '
            '>>> rpn_loss_box: %.6f\n >>> loss_cls: %.6f\n >>> loss_box: %.6f\n '
            '>>> lr: %f' % (iter, max_iters, total_loss, rpn_loss_cls,
                            rpn_loss_box, loss_cls, loss_box, lr.eval()))
      print('speed: {:.3f}s / iter'.format(timer.average_time))

    # Snapshotting
    if iter % cfg.TRAIN.SNAPSHOT_ITERS == 0:
      last_snapshot_iter = iter
      ss_path, np_path = self.snapshot(sess, iter)
      np_paths.append(np_path)
      ss_paths.append(ss_path)

      # Remove the old snapshots if there are too many
      if len(np_paths) > cfg.TRAIN.SNAPSHOT_KEPT:
        self.remove_snapshot(np_paths, ss_paths)

    iter += 1

  if last_snapshot_iter != iter - 1:
    self.snapshot(sess, iter - 1)

  self.writer.close()
  self.valwriter.close()
@model/train_val.py
def construct_graph(self, sess):
  with sess.graph.as_default():
    # Set the random seed for tensorflow
    tf.set_random_seed(cfg.RNG_SEED)
    # Build the main computation graph
    layers = self.net.create_architecture('TRAIN', self.imdb.num_classes, tag='default',
                                          anchor_scales=cfg.ANCHOR_SCALES,
                                          anchor_ratios=cfg.ANCHOR_RATIOS)
    # Define the loss
    loss = layers['total_loss']
    # Set learning rate and momentum
    lr = tf.Variable(cfg.TRAIN.LEARNING_RATE, trainable=False)
    self.optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)

    # Compute the gradients with regard to the loss
    gvs = self.optimizer.compute_gradients(loss)
    # Double the gradient of the bias if set
    if cfg.TRAIN.DOUBLE_BIAS:
      final_gvs = []
      with tf.variable_scope('Gradient_Mult') as scope:
        for grad, var in gvs:
          scale = 1.
          if cfg.TRAIN.DOUBLE_BIAS and '/biases:' in var.name:
            scale *= 2.
          if not np.allclose(scale, 1.0):
            grad = tf.multiply(grad, scale)
          final_gvs.append((grad, var))
      train_op = self.optimizer.apply_gradients(final_gvs)
    else:
      train_op = self.optimizer.apply_gradients(gvs)

    # We will handle the snapshots ourselves
    self.saver = tf.train.Saver(max_to_keep=100000)
    # Write the train and validation information to tensorboard
    self.writer = tf.summary.FileWriter(self.tbdir, sess.graph)
    self.valwriter = tf.summary.FileWriter(self.tbvaldir)

  return lr, train_op
@lib/nets/network.py
def create_architecture(self, mode, num_classes, tag=None,
                        anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)):
  # input placeholders
  self._image = tf.placeholder(tf.float32, shape=[1, None, None, 3])
  # im_info = [M, N, scale_factor]
  self._im_info = tf.placeholder(tf.float32, shape=[3])
  # used by anchor_target_layer & proposal_target_layer
  # gt_boxes[:, 0:4] = [x1, y1, x2, y2]; gt_boxes[:, 4] is the class index (0-20)
  self._gt_boxes = tf.placeholder(tf.float32, shape=[None, 5])
  self._tag = tag

  self._num_classes = num_classes
  self._mode = mode
  self._anchor_scales = anchor_scales
  self._num_scales = len(anchor_scales)
  self._anchor_ratios = anchor_ratios
  self._num_ratios = len(anchor_ratios)
  self._num_anchors = self._num_scales * self._num_ratios

  training = mode == 'TRAIN'
  testing = mode == 'TEST'

  assert tag != None

  # handle most of the regularizers here
  weights_regularizer = tf.contrib.layers.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)
  if cfg.TRAIN.BIAS_DECAY:
    biases_regularizer = weights_regularizer
  else:
    biases_regularizer = tf.no_regularizer

  # list as many types of layers as possible, even if they are not used now
  with arg_scope([slim.conv2d, slim.conv2d_in_plane,
                  slim.conv2d_transpose, slim.separable_conv2d, slim.fully_connected],
                 weights_regularizer=weights_regularizer,
                 biases_regularizer=biases_regularizer,
                 biases_initializer=tf.constant_initializer(0.0)):
    rois, cls_prob, bbox_pred = self._build_network(training)

  layers_to_output = {'rois': rois}

  for var in tf.trainable_variables():
    self._train_summaries.append(var)

  if testing:
    stds = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (self._num_classes))
    means = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (self._num_classes))
    self._predictions["bbox_pred"] *= stds
    self._predictions["bbox_pred"] += means
  else:
    # compute losses!
    self._add_losses()
    layers_to_output.update(self._losses)

    val_summaries = []
    with tf.device("/cpu:0"):
      val_summaries.append(self._add_gt_image_summary())
      for key, var in self._event_summaries.items():
        val_summaries.append(tf.summary.scalar(key, var))
      for key, var in self._score_summaries.items():
        self._add_score_summary(key, var)
      for var in self._act_summaries:
        self._add_act_summary(var)
      for var in self._train_summaries:
        self._add_train_summary(var)

    self._summary_op = tf.summary.merge_all()
    self._summary_op_val = tf.summary.merge(val_summaries)

  layers_to_output.update(self._predictions)

  return layers_to_output
@nets/network.py
def _build_network(self, is_training=True):
  # select initializers
  if cfg.TRAIN.TRUNCATED:
    initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01)
    initializer_bbox = tf.truncated_normal_initializer(mean=0.0, stddev=0.001)
  else:
    initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01)
    initializer_bbox = tf.random_normal_initializer(mean=0.0, stddev=0.001)

  net_conv = self._image_to_head(is_training)
  with tf.variable_scope(self._scope, self._scope):
    # build the anchors for the image
    self._anchor_component()
    # region proposal network
    rois = self._region_proposal(net_conv, is_training, initializer)
    # region of interest pooling
    if cfg.POOLING_MODE == 'crop':
      pool5 = self._crop_pool_layer(net_conv, rois, "pool5")
    else:
      raise NotImplementedError

  fc7 = self._head_to_tail(pool5, is_training)
  with tf.variable_scope(self._scope, self._scope):
    # region classification
    cls_prob, bbox_pred = self._region_classification(fc7, is_training,
                                                      initializer, initializer_bbox)

  self._score_summaries.update(self._predictions)

  return rois, cls_prob, bbox_pred
3.2 Conv net
net_conv = self._image_to_head(is_training)
None of the convolution layers changes the height or width of its input, while every pooling layer halves them. The conv5 feature map has passed through 4 pooling layers, so feat_stride = 16.
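A quick arithmetic check of that claim. The 600x800 input size is only an example (the short side is usually rescaled to 600); with SAME padding each 2x2, stride-2 pool is a ceil-division by 2.

import math

h, w = 600, 800
for _ in range(4):
    h, w = math.ceil(h / 2), math.ceil(w / 2)
print(h, w)        # 38, 50  -> spatial size of the conv5_3 feature map
print(600 / 38)    # ~15.8, i.e. an effective stride of roughly 16 (feat_stride)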
@nets/vgg16.py
def _image_to_head(self, is_training, reuse=False):
  with tf.variable_scope(self._scope, self._scope, reuse=reuse):
    net = slim.repeat(self._image, 2, slim.conv2d, 64, [3, 3],
                      trainable=False, scope='conv1')
    net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool1')
    net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3],
                      trainable=False, scope='conv2')
    net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool2')
    net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3],
                      trainable=is_training, scope='conv3')
    net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool3')
    net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3],
                      trainable=is_training, scope='conv4')
    net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool4')
    net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3],
                      trainable=is_training, scope='conv5')

  self._act_summaries.append(net)
  self._layers['head'] = net

  return net
3.3 Generate anchors
# build the anchors for the image
self._anchor_component()
First, 9 anchors are generated from the small (0, 0, 15, 15) base box with ratios [0.5, 1, 2] and scales [8, 16, 32]: the ratio is the aspect ratio, and the scale is the factor by which the base width and height are enlarged. These 9 anchors correspond to the top-left point of the feature map produced by the conv net (a usage sketch follows the code below).
@layer_utils/generate_anchors.py
def generate_anchors(base_size=16, ratios=[0.5, 1, 2], scales=2 ** np.arange(3, 6)): """ Generate anchor (reference) windows by enumerating aspect ratios X scales wrt a reference (0, 0, 15, 15) window. """ base_anchor = np.array([1, 1, base_size, base_size]) - 1 ratio_anchors = _ratio_enum(base_anchor, ratios) anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) for i in range(ratio_anchors.shape[0])]) return anchors
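As a quick sanity check, a usage sketch; only shapes and centers are asserted here, since the exact coordinates depend on the rounding inside _ratio_enum.

import numpy as np
from layer_utils.generate_anchors import generate_anchors

anchors = generate_anchors(base_size=16,
                           ratios=[0.5, 1, 2],
                           scales=2 ** np.arange(3, 6))
print(anchors.shape)                    # (9, 4), rows are (x1, y1, x2, y2)

# all 9 anchors share the center of the 16x16 base window, (7.5, 7.5)
ctr_x = (anchors[:, 0] + anchors[:, 2]) / 2
ctr_y = (anchors[:, 1] + anchors[:, 3]) / 2
print(ctr_x, ctr_y)                     # all 7.5

# side lengths are roughly 16*8, 16*16, 16*32 for scales 8, 16, 32
ws = anchors[:, 2] - anchors[:, 0] + 1
hs = anchors[:, 3] - anchors[:, 1] + 1
print(np.sqrt(ws * hs).round())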
Then, based on these 9 anchors, the anchors corresponding to every other feature map location are generated, mapped back onto the original image.
@layer_utils/snippets.py
def generate_anchors_pre(height, width, feat_stride,
                         anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)):
  """ A wrapper function to generate anchors given different scales
    Also return the number of anchors in variable 'length'
  """
  anchors = generate_anchors(ratios=np.array(anchor_ratios),
                             scales=np.array(anchor_scales))
  A = anchors.shape[0]
  shift_x = np.arange(0, width) * feat_stride
  shift_y = np.arange(0, height) * feat_stride
  shift_x, shift_y = np.meshgrid(shift_x, shift_y)
  shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                      shift_x.ravel(), shift_y.ravel())).transpose()
  K = shifts.shape[0]
  # width changes faster, so here it is H, W, C
  anchors = anchors.reshape((1, A, 4)) + \
            shifts.reshape((1, K, 4)).transpose((1, 0, 2))
  anchors = anchors.reshape((K * A, 4)).astype(np.float32, copy=False)
  length = np.int32(anchors.shape[0])

  return anchors, length
3.4 Region proposal network
rois = self._region_proposal(net_conv, is_training, initializer)
@nets/network.py
def _region_proposal(self, net_conv, is_training, initializer):
  # RPN_CHANNELS: 512 for vgg16; 256 for ZF
  rpn = slim.conv2d(net_conv, cfg.RPN_CHANNELS, [3, 3], trainable=is_training,
                    weights_initializer=initializer, scope="rpn_conv/3x3")
  self._act_summaries.append(rpn)
  # score the previous layer's output; no activation function, spatial size unchanged
  rpn_cls_score = slim.conv2d(rpn, self._num_anchors * 2, [1, 1], trainable=is_training,
                              weights_initializer=initializer,
                              padding='VALID', activation_fn=None, scope='rpn_cls_score')
  # change it so that the score has 2 as its channel size for softmax
  # [B, H, W, 2*9] --> [B, 9*H, W, 2]
  rpn_cls_score_reshape = self._reshape_layer(rpn_cls_score, 2, 'rpn_cls_score_reshape')
  rpn_cls_prob_reshape = self._softmax_layer(rpn_cls_score_reshape, "rpn_cls_prob_reshape")
  rpn_cls_pred = tf.argmax(tf.reshape(rpn_cls_score_reshape, [-1, 2]), axis=1,
                           name="rpn_cls_pred")
  # reshape back to [B, H, W, 2*9]
  rpn_cls_prob = self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors * 2,
                                     "rpn_cls_prob")
  # compute deltas
  rpn_bbox_pred = slim.conv2d(rpn, self._num_anchors * 4, [1, 1], trainable=is_training,
                              weights_initializer=initializer,
                              padding='VALID', activation_fn=None, scope='rpn_bbox_pred')
  if is_training:
    # transform the original anchors and pick some; rois.shape = (:, 5), the first
    # column is 0 (batch index), the rest is the proposal box; roi_scores has rank 1
    # and holds the probabilities of being fg
    rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
    rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")
    # Try to have a deterministic order for the computing graph, for reproducibility
    with tf.control_dependencies([rpn_labels]):
      # produce proposal labels
      rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")
  else:
    if cfg.TEST.MODE == 'nms':
      rois, _ = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
    elif cfg.TEST.MODE == 'top':
      rois, _ = self._proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
    else:
      raise NotImplementedError

  self._predictions["rpn_cls_score"] = rpn_cls_score
  self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape
  self._predictions["rpn_cls_prob"] = rpn_cls_prob
  self._predictions["rpn_cls_pred"] = rpn_cls_pred
  self._predictions["rpn_bbox_pred"] = rpn_bbox_pred
  self._predictions["rois"] = rois

  return rois
3.4.1 proposal layer
Candidate boxes are regressed from the anchors using rpn_bbox_pred, and the objectness (fg) scores are extracted. The top RPN_PRE_NMS_TOP_N boxes are kept, NMS is applied, and finally the top RPN_POST_NMS_TOP_N survivors are returned as rois and rpn_scores (a sketch of the delta decoding follows the code below).
@layer_utils/proposal_layer.py
def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors):
  """A simplified version compared to fast/er RCNN
     For details please see the technical report
  """
  if type(cfg_key) == bytes:
    cfg_key = cfg_key.decode('utf-8')
  pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
  post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
  nms_thresh = cfg[cfg_key].RPN_NMS_THRESH

  # Get the scores and bounding boxes
  scores = rpn_cls_prob[:, :, :, num_anchors:]
  rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4))
  scores = scores.reshape((-1, 1))
  proposals = bbox_transform_inv(anchors, rpn_bbox_pred)
  proposals = clip_boxes(proposals, im_info[:2])  # proposals.shape = (:, 4)

  # Pick the top region proposals
  order = scores.ravel().argsort()[::-1]
  if pre_nms_topN > 0:
    order = order[:pre_nms_topN]
  proposals = proposals[order, :]
  scores = scores[order]

  # Non-maximal suppression
  keep = nms(np.hstack((proposals, scores)), nms_thresh)  # returns indices, keep has rank 1

  # Pick the top region proposals after NMS
  if post_nms_topN > 0:
    keep = keep[:post_nms_topN]
  proposals = proposals[keep, :]
  scores = scores[keep]

  # Only support single image as input
  batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
  blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))

  return blob, scores
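bbox_transform_inv decodes the predicted deltas back into boxes. A minimal NumPy sketch of that decoding, using the standard Faster R-CNN parameterization; decode_deltas is a hypothetical stand-in, not the repo function.

import numpy as np

def decode_deltas(anchors, deltas):
    """Sketch of the box decoding: anchors are (x1, y1, x2, y2),
    deltas are (dx, dy, dw, dh) as predicted by rpn_bbox_pred."""
    wa = anchors[:, 2] - anchors[:, 0] + 1.0
    ha = anchors[:, 3] - anchors[:, 1] + 1.0
    cxa = anchors[:, 0] + 0.5 * wa
    cya = anchors[:, 1] + 0.5 * ha

    dx, dy, dw, dh = deltas[:, 0], deltas[:, 1], deltas[:, 2], deltas[:, 3]
    cx = dx * wa + cxa        # shift the center by a fraction of the anchor size
    cy = dy * ha + cya
    w = np.exp(dw) * wa       # scale width/height exponentially
    h = np.exp(dh) * ha

    return np.stack([cx - 0.5 * w, cy - 0.5 * h,
                     cx + 0.5 * w, cy + 0.5 * h], axis=1)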
3.4.2 anchor_target_layer
An anchor is labeled as fg if it satisfies either of the following conditions:
- for some gt_box, it is the anchor with the highest IoU; or
- its max IoU with a gt_box is at least 0.7 (RPN_POSITIVE_OVERLAP)
The outputs are:
- rpn_labels: (1, 1, 9*height, width); fg: 1, bg: 0, don't care: -1
- rpn_bbox_targets: (1, height, width, 9*4); bbox regression deltas
- rpn_bbox_inside_weights: (1, height, width, 9*4); fg: (1, 1, 1, 1), others: (0, 0, 0, 0)
- rpn_bbox_outside_weights: (1, height, width, 9*4); fg & bg: 1/(num_of_fgs + num_of_bgs), don't care: 0
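The deltas stored in rpn_bbox_targets come from _compute_targets, which calls bbox_transform. A minimal sketch of that encoding follows; encode_deltas is a hypothetical stand-in with the same parameterization (it is the inverse of the decoding used by the proposal layer).

import numpy as np

def encode_deltas(ex_rois, gt_rois):
    """Sketch of the target encoding: how far, relative to its own size,
    an anchor has to move and rescale to match its assigned gt box."""
    ew = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
    eh = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
    ecx = ex_rois[:, 0] + 0.5 * ew
    ecy = ex_rois[:, 1] + 0.5 * eh

    gw = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
    gh = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
    gcx = gt_rois[:, 0] + 0.5 * gw
    gcy = gt_rois[:, 1] + 0.5 * gh

    tx = (gcx - ecx) / ew
    ty = (gcy - ecy) / eh
    tw = np.log(gw / ew)
    th = np.log(gh / eh)
    return np.stack([tx, ty, tw, th], axis=1)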
@nets/network.py
def _anchor_target_layer(self, rpn_cls_score, name):
  with tf.variable_scope(name) as scope:
    rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = tf.py_func(
      anchor_target_layer,
      [rpn_cls_score, self._gt_boxes, self._im_info, self._feat_stride, self._anchors, self._num_anchors],
      [tf.float32, tf.float32, tf.float32, tf.float32],
      name="anchor_target")

    rpn_labels.set_shape([1, 1, None, None])
    rpn_bbox_targets.set_shape([1, None, None, self._num_anchors * 4])
    rpn_bbox_inside_weights.set_shape([1, None, None, self._num_anchors * 4])
    rpn_bbox_outside_weights.set_shape([1, None, None, self._num_anchors * 4])

    rpn_labels = tf.to_int32(rpn_labels, name="to_int32")
    self._anchor_targets['rpn_labels'] = rpn_labels
    self._anchor_targets['rpn_bbox_targets'] = rpn_bbox_targets
    self._anchor_targets['rpn_bbox_inside_weights'] = rpn_bbox_inside_weights
    self._anchor_targets['rpn_bbox_outside_weights'] = rpn_bbox_outside_weights

    self._score_summaries.update(self._anchor_targets)

  return rpn_labels
@layer_utils/anchor_target_layer.py
def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride, all_anchors, num_anchors):
  """Same as the anchor target layer in original Fast/er RCNN """
  A = num_anchors
  total_anchors = all_anchors.shape[0]
  K = total_anchors / num_anchors

  # allow boxes to sit over the edge by a small amount
  _allowed_border = 0

  # map of shape (..., H, W)
  height, width = rpn_cls_score.shape[1:3]

  # only keep anchors inside the image
  inds_inside = np.where(
    (all_anchors[:, 0] >= -_allowed_border) &
    (all_anchors[:, 1] >= -_allowed_border) &
    (all_anchors[:, 2] < im_info[1] + _allowed_border) &  # width
    (all_anchors[:, 3] < im_info[0] + _allowed_border)    # height
  )[0]

  # keep only inside anchors
  anchors = all_anchors[inds_inside, :]

  # label: 1 is positive, 0 is negative, -1 is dont care
  labels = np.empty((len(inds_inside),), dtype=np.float32)
  labels.fill(-1)

  # overlaps between the anchors and the gt boxes
  # (N, K) ndarray of overlaps (IoUs)
  overlaps = bbox_overlaps(
    np.ascontiguousarray(anchors, dtype=np.float),
    np.ascontiguousarray(gt_boxes, dtype=np.float))
  argmax_overlaps = overlaps.argmax(axis=1)
  max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
  gt_argmax_overlaps = overlaps.argmax(axis=0)
  gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
  gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

  if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
    # assign bg labels first so that positive labels can clobber them
    # first set the negatives
    labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

  # fg label: for each gt, anchor with highest overlap
  labels[gt_argmax_overlaps] = 1

  # fg label: above threshold IOU
  labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

  if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
    # assign bg labels last so that negative labels can clobber positives
    labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

  # subsample positive labels if we have too many
  num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
  fg_inds = np.where(labels == 1)[0]
  if len(fg_inds) > num_fg:
    disable_inds = npr.choice(
      fg_inds, size=(len(fg_inds) - num_fg), replace=False)
    labels[disable_inds] = -1

  # subsample negative labels if we have too many
  num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
  bg_inds = np.where(labels == 0)[0]
  if len(bg_inds) > num_bg:
    disable_inds = npr.choice(
      bg_inds, size=(len(bg_inds) - num_bg), replace=False)
    labels[disable_inds] = -1

  bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32)
  bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

  bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
  # only the positive ones have regression targets
  bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)

  bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
  if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:
    # uniform weighting of examples (given non-uniform sampling)
    num_examples = np.sum(labels >= 0)
    positive_weights = np.ones((1, 4)) * 1.0 / num_examples
    negative_weights = np.ones((1, 4)) * 1.0 / num_examples
  else:
    assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
            (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
    positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT /
                        np.sum(labels == 1))
    negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) /
                        np.sum(labels == 0))
  bbox_outside_weights[labels == 1, :] = positive_weights
  bbox_outside_weights[labels == 0, :] = negative_weights

  # map up to original set of anchors
  labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
  bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
  bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
  bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)

  # labels
  labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
  labels = labels.reshape((1, 1, A * height, width))
  rpn_labels = labels

  # bbox_targets
  bbox_targets = bbox_targets.reshape((1, height, width, A * 4))
  rpn_bbox_targets = bbox_targets

  # bbox_inside_weights
  bbox_inside_weights = bbox_inside_weights.reshape((1, height, width, A * 4))
  rpn_bbox_inside_weights = bbox_inside_weights

  # bbox_outside_weights
  bbox_outside_weights = bbox_outside_weights.reshape((1, height, width, A * 4))
  rpn_bbox_outside_weights = bbox_outside_weights

  return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights
3.4.3 proposal target layer
It first samples from the extracted rois, then generates the target labels and bbox_targets for the classification network.
@nets/network.py
def _proposal_target_layer(self, rois, roi_scores, name):
  with tf.variable_scope(name) as scope:
    rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights = tf.py_func(
      proposal_target_layer,
      [rois, roi_scores, self._gt_boxes, self._num_classes],
      [tf.float32, tf.float32, tf.float32, tf.float32, tf.float32, tf.float32],
      name="proposal_target")

    rois.set_shape([cfg.TRAIN.BATCH_SIZE, 5])
    roi_scores.set_shape([cfg.TRAIN.BATCH_SIZE])
    labels.set_shape([cfg.TRAIN.BATCH_SIZE, 1])
    bbox_targets.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4])
    bbox_inside_weights.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4])
    bbox_outside_weights.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4])

    self._proposal_targets['rois'] = rois
    self._proposal_targets['labels'] = tf.to_int32(labels, name="to_int32")
    self._proposal_targets['bbox_targets'] = bbox_targets
    self._proposal_targets['bbox_inside_weights'] = bbox_inside_weights
    self._proposal_targets['bbox_outside_weights'] = bbox_outside_weights

    self._score_summaries.update(self._proposal_targets)

    return rois, roi_scores
@layer_utils/proposal_target_layer.py
def proposal_target_layer(rpn_rois, rpn_scores, gt_boxes, _num_classes):
  """
  Assign object detection proposals to ground-truth targets. Produces proposal
  classification labels and bounding-box regression targets.
  """
  # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
  # (i.e., rpn.proposal_layer.ProposalLayer), or any other source
  all_rois = rpn_rois
  all_scores = rpn_scores

  # Include ground-truth boxes in the set of candidate rois
  if cfg.TRAIN.USE_GT:
    zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
    all_rois = np.vstack(
      (all_rois, np.hstack((zeros, gt_boxes[:, :-1])))
    )
    # not sure if it a wise appending, but anyway i am not using it
    all_scores = np.vstack((all_scores, zeros))

  num_images = 1
  rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images
  fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)

  # Sample rois with classification labels and bounding box regression targets
  labels, rois, roi_scores, bbox_targets, bbox_inside_weights = _sample_rois(
    all_rois, all_scores, gt_boxes, fg_rois_per_image,
    rois_per_image, _num_classes)

  rois = rois.reshape(-1, 5)
  roi_scores = roi_scores.reshape(-1)
  labels = labels.reshape(-1, 1)
  bbox_targets = bbox_targets.reshape(-1, _num_classes * 4)
  bbox_inside_weights = bbox_inside_weights.reshape(-1, _num_classes * 4)
  bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)

  return rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights


def _get_bbox_regression_labels(bbox_target_data, num_classes):
  """Bounding-box regression targets (bbox_target_data) are stored in a
  compact form N x (class, tx, ty, tw, th)

  This function expands those targets into the 4-of-4*K representation used
  by the network (i.e. only one class has non-zero targets).

  Returns:
    (K = num_classes; in each row only the 4 entries at that row's class
     position hold the target, all other entries are 0)
    bbox_target (ndarray): N x 4K blob of regression targets
    bbox_inside_weights (ndarray): N x 4K blob of loss weights
  """
  clss = bbox_target_data[:, 0]
  bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
  bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
  inds = np.where(clss > 0)[0]
  for ind in inds:
    cls = clss[ind]
    start = int(4 * cls)
    end = start + 4
    bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
    bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS
  return bbox_targets, bbox_inside_weights


def _compute_targets(ex_rois, gt_rois, labels):
  """Compute bounding-box regression targets for an image."""
  assert ex_rois.shape[0] == gt_rois.shape[0]
  assert ex_rois.shape[1] == 4
  assert gt_rois.shape[1] == 4

  # function for computing deltas
  targets = bbox_transform(ex_rois, gt_rois)
  if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
    # Optionally normalize targets by a precomputed mean and stdev
    targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS))
               / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS))
  return np.hstack(
    (labels[:, np.newaxis], targets)).astype(np.float32, copy=False)


def _sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
  """Generate a random sample of RoIs comprising foreground and background
  examples.
  """
  # overlaps: (rois x gt_boxes)
  overlaps = bbox_overlaps(
    np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float),
    np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float))
  gt_assignment = overlaps.argmax(axis=1)
  max_overlaps = overlaps.max(axis=1)
  labels = gt_boxes[gt_assignment, 4]

  # Select foreground RoIs as those with >= FG_THRESH (0.5) overlap
  fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
  # Guard against the case when an image has fewer than fg_rois_per_image
  # Select background RoIs as those within [BG_THRESH_LO (0), BG_THRESH_HI (0.5))
  bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                     (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]

  # Small modification to the original version where we ensure a fixed number of regions are sampled
  if fg_inds.size > 0 and bg_inds.size > 0:
    fg_rois_per_image = min(fg_rois_per_image, fg_inds.size)
    fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_image), replace=False)
    bg_rois_per_image = rois_per_image - fg_rois_per_image
    to_replace = bg_inds.size < bg_rois_per_image
    bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace)
  elif fg_inds.size > 0:
    to_replace = fg_inds.size < rois_per_image
    fg_inds = npr.choice(fg_inds, size=int(rois_per_image), replace=to_replace)
    fg_rois_per_image = rois_per_image
  elif bg_inds.size > 0:
    to_replace = bg_inds.size < rois_per_image
    bg_inds = npr.choice(bg_inds, size=int(rois_per_image), replace=to_replace)
    fg_rois_per_image = 0
  else:
    import pdb
    pdb.set_trace()

  # The indices that we're selecting (both fg and bg)
  keep_inds = np.append(fg_inds, bg_inds)
  # Select sampled values from various arrays:
  labels = labels[keep_inds]
  # Clamp labels for the background RoIs to 0
  labels[int(fg_rois_per_image):] = 0
  rois = all_rois[keep_inds]
  roi_scores = all_scores[keep_inds]

  bbox_target_data = _compute_targets(
    rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)

  bbox_targets, bbox_inside_weights = \
    _get_bbox_regression_labels(bbox_target_data, num_classes)

  return labels, rois, roi_scores, bbox_targets, bbox_inside_weights
3.5 Crop
pool5 = self._crop_pool_layer(net_conv, rois, "pool5")
Each RoI is first cropped and resized to 14 x 14 on the conv feature map, then max-pooled down to 7 x 7 (a toy illustration follows the code below).
@nets/network.py
def _crop_pool_layer(self, bottom, rois, name):
  with tf.variable_scope(name) as scope:
    batch_ids = tf.squeeze(tf.slice(rois, [0, 0], [-1, 1], name="batch_id"), [1])
    # Get the normalized coordinates of bounding boxes
    bottom_shape = tf.shape(bottom)
    height = (tf.to_float(bottom_shape[1]) - 1.) * np.float32(self._feat_stride[0])
    width = (tf.to_float(bottom_shape[2]) - 1.) * np.float32(self._feat_stride[0])
    x1 = tf.slice(rois, [0, 1], [-1, 1], name="x1") / width
    y1 = tf.slice(rois, [0, 2], [-1, 1], name="y1") / height
    x2 = tf.slice(rois, [0, 3], [-1, 1], name="x2") / width
    y2 = tf.slice(rois, [0, 4], [-1, 1], name="y2") / height
    # Won't be back-propagated to rois anyway, but to save time
    bboxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], axis=1))
    pre_pool_size = cfg.POOLING_SIZE * 2
    crops = tf.image.crop_and_resize(bottom, bboxes, tf.to_int32(batch_ids),
                                     [pre_pool_size, pre_pool_size], name="crops")

  return slim.max_pool2d(crops, [2, 2], padding='SAME')
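For intuition, a toy, self-contained illustration of crop_and_resize followed by a 2 x 2 max pool. The real sizes (14 and 7) come from cfg.POOLING_SIZE; the feature map and box here are invented, and only shapes are checked.

import numpy as np
import tensorflow as tf

# one 1x6x6x1 "feature map" and one box covering its top-left quarter,
# given in normalized [y1, x1, y2, x2] coordinates (y before x)
feat = tf.constant(np.arange(36, dtype=np.float32).reshape(1, 6, 6, 1))
boxes = tf.constant([[0.0, 0.0, 0.5, 0.5]])
box_ind = tf.constant([0])          # which image in the batch each box belongs to

crops = tf.image.crop_and_resize(feat, boxes, box_ind, crop_size=[4, 4])
pooled = tf.nn.max_pool(crops, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')

with tf.Session() as sess:
    print(sess.run(tf.shape(crops)))   # [1, 4, 4, 1]
    print(sess.run(tf.shape(pooled)))  # [1, 2, 2, 1]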
3.6 Classification
The pooled features are first passed through two fully-connected layers, fc6 and fc7;
fc7 = self._head_to_tail(pool5, is_training)
@nets/vgg16.py
def _head_to_tail(self, pool5, is_training, reuse=False):
  with tf.variable_scope(self._scope, self._scope, reuse=reuse):
    pool5_flat = slim.flatten(pool5, scope='flatten')
    fc6 = slim.fully_connected(pool5_flat, 4096, scope='fc6')
    if is_training:
      fc6 = slim.dropout(fc6, keep_prob=0.5, is_training=True, scope='dropout6')
    fc7 = slim.fully_connected(fc6, 4096, scope='fc7')
    if is_training:
      fc7 = slim.dropout(fc7, keep_prob=0.5, is_training=True, scope='dropout7')

  return fc7
Then classification (into 21 classes) and bounding-box refinement are performed separately.
cls_prob, bbox_pred = self._region_classification(fc7, is_training, initializer, initializer_bbox)
def _region_classification(self, fc7, is_training, initializer, initializer_bbox):
  cls_score = slim.fully_connected(fc7, self._num_classes,
                                   weights_initializer=initializer,
                                   trainable=is_training,
                                   activation_fn=None, scope='cls_score')
  cls_prob = self._softmax_layer(cls_score, "cls_prob")
  cls_pred = tf.argmax(cls_score, axis=1, name="cls_pred")
  bbox_pred = slim.fully_connected(fc7, self._num_classes * 4,
                                   weights_initializer=initializer_bbox,
                                   trainable=is_training,
                                   activation_fn=None, scope='bbox_pred')

  self._predictions["cls_score"] = cls_score
  self._predictions["cls_pred"] = cls_pred
  self._predictions["cls_prob"] = cls_prob
  self._predictions["bbox_pred"] = bbox_pred

  return cls_prob, bbox_pred
3.7 Losses
@nets/network.py
def create_architecture
self._add_losses()
def _add_losses(self, sigma_rpn=3.0):
  # sigma corresponds to the lambda in the paper
  with tf.variable_scope('LOSS_' + self._tag) as scope:
    # RPN, class loss
    rpn_cls_score = tf.reshape(self._predictions['rpn_cls_score_reshape'], [-1, 2])
    rpn_label = tf.reshape(self._anchor_targets['rpn_labels'], [-1])
    rpn_select = tf.where(tf.not_equal(rpn_label, -1))
    rpn_cls_score = tf.reshape(tf.gather(rpn_cls_score, rpn_select), [-1, 2])
    rpn_label = tf.reshape(tf.gather(rpn_label, rpn_select), [-1])
    rpn_cross_entropy = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(logits=rpn_cls_score, labels=rpn_label))

    # RPN, bbox loss
    rpn_bbox_pred = self._predictions['rpn_bbox_pred']
    rpn_bbox_targets = self._anchor_targets['rpn_bbox_targets']
    rpn_bbox_inside_weights = self._anchor_targets['rpn_bbox_inside_weights']
    rpn_bbox_outside_weights = self._anchor_targets['rpn_bbox_outside_weights']
    rpn_loss_box = self._smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets,
                                        rpn_bbox_inside_weights,
                                        rpn_bbox_outside_weights,
                                        sigma=sigma_rpn, dim=[1, 2, 3])

    # RCNN, class loss
    cls_score = self._predictions["cls_score"]
    label = tf.reshape(self._proposal_targets["labels"], [-1])
    cross_entropy = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(logits=cls_score, labels=label))

    # RCNN, bbox loss
    bbox_pred = self._predictions['bbox_pred']
    bbox_targets = self._proposal_targets['bbox_targets']
    bbox_inside_weights = self._proposal_targets['bbox_inside_weights']
    bbox_outside_weights = self._proposal_targets['bbox_outside_weights']
    loss_box = self._smooth_l1_loss(bbox_pred, bbox_targets,
                                    bbox_inside_weights, bbox_outside_weights)

    self._losses['cross_entropy'] = cross_entropy
    self._losses['loss_box'] = loss_box
    self._losses['rpn_cross_entropy'] = rpn_cross_entropy
    self._losses['rpn_loss_box'] = rpn_loss_box

    loss = cross_entropy + loss_box + rpn_cross_entropy + rpn_loss_box
    self._losses['total_loss'] = loss

    self._event_summaries.update(self._losses)

  return loss
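_smooth_l1_loss itself is not shown above. Below is a minimal NumPy sketch of the smooth L1 form it implements; smooth_l1 is a simplified stand-in (the real op works on tensors and reduces differently), but the piecewise form and the roles of sigma and the inside/outside weights are the same as described in section 3.4.2.

import numpy as np

def smooth_l1(box_pred, box_targets, inside_w, outside_w, sigma=1.0):
    """Sketch of the smooth L1 bbox loss used for both the RPN and the
    RCNN head. inside_w zeroes out non-foreground entries; outside_w
    rescales each entry before the final reduction."""
    sigma2 = sigma ** 2
    diff = inside_w * (box_pred - box_targets)
    abs_diff = np.abs(diff)
    # quadratic for |x| < 1/sigma^2, linear beyond that
    loss = np.where(abs_diff < 1.0 / sigma2,
                    0.5 * sigma2 * diff ** 2,
                    abs_diff - 0.5 / sigma2)
    return np.sum(outside_w * loss)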
4.Computing graph
@train_val.py
def train_model
rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, total_loss, summary = self.net.train_step_with_summary(sess, blobs, train_op)
@nets/network.py
def train_step_with_summary(self, sess, blobs, train_op):
  feed_dict = {self._image: blobs['data'], self._im_info: blobs['im_info'],
               self._gt_boxes: blobs['gt_boxes']}
  rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, summary, _ = \
    sess.run([self._losses["rpn_cross_entropy"],
              self._losses['rpn_loss_box'],
              self._losses['cross_entropy'],
              self._losses['loss_box'],
              self._losses['total_loss'],
              self._summary_op,
              train_op],
             feed_dict=feed_dict)
  return rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, summary
5.Results
Referenced from https://arxiv.org/pdf/1702.02138.pdf