Showing
6 changed files
with
2019 additions
and
29 deletions
... | @@ -24,10 +24,10 @@ anchor_path = data_path + 'yolo_anchors.txt' # The path of the anchor txt file. | ... | @@ -24,10 +24,10 @@ anchor_path = data_path + 'yolo_anchors.txt' # The path of the anchor txt file. |
24 | class_name_path = data_path + 'classes.txt' # The path of the class names. | 24 | class_name_path = data_path + 'classes.txt' # The path of the class names. |
25 | 25 | ||
26 | ### Training related numbers | 26 | ### Training related numbers |
27 | -batch_size = 6 | 27 | +batch_size = 10 |
28 | img_size = [416, 416] # Images will be resized to `img_size` and fed to the network, size format: [width, height] | 28 | img_size = [416, 416] # Images will be resized to `img_size` and fed to the network, size format: [width, height] |
29 | letterbox_resize = True # Whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized image. | 29 | letterbox_resize = True # Whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized image. |
30 | -total_epoches = 50 | 30 | +total_epoches = 20 |
31 | train_evaluation_step = 10 # Evaluate on the training batch after some steps. | 31 | train_evaluation_step = 10 # Evaluate on the training batch after some steps. |
32 | val_evaluation_epoch = 2 # Evaluate on the whole validation dataset after some epochs. Set to None to evaluate every epoch. | 32 | val_evaluation_epoch = 2 # Evaluate on the whole validation dataset after some epochs. Set to None to evaluate every epoch. |
33 | save_epoch = 5 # Save the model after some epochs. | 33 | save_epoch = 5 # Save the model after some epochs. |
... | @@ -73,7 +73,7 @@ use_label_smooth = True # Whether to use class label smoothing strategy. | ... | @@ -73,7 +73,7 @@ use_label_smooth = True # Whether to use class label smoothing strategy. |
73 | use_focal_loss = True # Whether to apply focal loss on the conf loss. | 73 | use_focal_loss = True # Whether to apply focal loss on the conf loss. |
74 | use_mix_up = True # Whether to use mix up data augmentation strategy. | 74 | use_mix_up = True # Whether to use mix up data augmentation strategy. |
75 | use_warm_up = True # whether to use warm up strategy to prevent from gradient exploding. | 75 | use_warm_up = True # whether to use warm up strategy to prevent from gradient exploding. |
76 | -warm_up_epoch = 3 # Warm up training epoches. Set to a larger value if gradient explodes. | 76 | +warm_up_epoch = 2 # Warm up training epoches. Set to a larger value if gradient explodes. |
77 | 77 | ||
78 | ### some constants in validation | 78 | ### some constants in validation |
79 | # nms | 79 | # nms | ... | ... |
... | @@ -2,8 +2,13 @@ changes from https://github.com/wizyoung/YOLOv3_TensorFlow | ... | @@ -2,8 +2,13 @@ changes from https://github.com/wizyoung/YOLOv3_TensorFlow |
2 | 2 | ||
3 | by Seongju Kim, kareus1@khu.ac.kr | 3 | by Seongju Kim, kareus1@khu.ac.kr |
4 | 4 | ||
5 | +I have only tested this in the Colab environment so far (2020.05.16), | ||
6 | +so please let me know if you find any errors/problems in the python code version | ||
7 | +(##last changed: 2020.05.16) | ||
8 | + | ||
5 | 1] changed TextLineDataset to TFRecordDataset. (also changed data parsing in data utils and eval utils) | 9 | 1] changed TextLineDataset to TFRecordDataset. (also changed data parsing in data utils and eval utils) |
6 | 2] fixed restore-does-not-exist problem in train/eval mode | 10 | 2] fixed restore-does-not-exist problem in train/eval mode |
7 | 3] fixed saver to save the parameter only when save-optimizer option is true | 11 | 3] fixed saver to save the parameter only when save-optimizer option is true |
8 | 4] changed the parameter 'mode' to the bool value 'is_training' in the data util functions (the string value 'mode' is passed as a byte string, so the functions do not evaluate their if-clauses as expected, e.g. 'train' != b'train') | 12 | 4] changed the parameter 'mode' to the bool value 'is_training' in the data util functions (the string value 'mode' is passed as a byte string, so the functions do not evaluate their if-clauses as expected, e.g. 'train' != b'train') |
9 | -5] wrote TFRecord binary iterator, which runs without tf session (references: https://github.com/pgmmpk/tfrecord ) | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
13 | +5] wrote TFRecord binary iterator, which runs without tf session (references: https://github.com/pgmmpk/tfrecord ) | ||
14 | +6] removed logging/TensorBoard summary code. (I will add it back later if necessary) | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
... | @@ -9,22 +9,20 @@ import random | ... | @@ -9,22 +9,20 @@ import random |
9 | PY_VERSION = sys.version_info[0] | 9 | PY_VERSION = sys.version_info[0] |
10 | iter_cnt = 0 | 10 | iter_cnt = 0 |
11 | 11 | ||
12 | -FEATURE_DESCRIPTION = { | 12 | +def _parse_tfrecord(data): |
13 | - 'index': tf.FixedLenFeature([], tf.int64), | 13 | + example = tf.train.Example() |
14 | - 'image': tf.FixedLenFeature([], tf.string), | 14 | + example.ParseFromString(data) |
15 | - 'width': tf.FixedLenFeature([], tf.int64), | 15 | + features = example.features.feature |
16 | - 'height': tf.FixedLenFeature([], tf.int64), | 16 | + return features |
17 | - 'boxes': tf.VarLenFeature(tf.int64) | ||
18 | -} | ||
19 | 17 | ||
20 | def parse_tfrecord(data): | 18 | def parse_tfrecord(data): |
21 | # tfrecord parser for TFRecordDataset (raw data) | 19 | # tfrecord parser for TFRecordDataset (raw data) |
22 | - features = tf.parse_single_example(data, FEATURE_DESCRIPTION) | 20 | + features = _parse_tfrecord(data) |
23 | - index = int(features['index']) | 21 | + index = features['index'].int64_list.value[0] |
24 | - encoded_image = np.frombuffer(features['image'], dtype = np.uint8) | 22 | + encoded_image = np.frombuffer(features['image'].bytes_list.value[0], dtype = np.uint8) |
25 | - width = int(features['width']) | 23 | + width = features['width'].int64_list.value[0] |
26 | - height = int(features['height']) | 24 | + height = features['height'].int64_list.value[0] |
27 | - boxes = features['boxes'].eval() | 25 | + boxes = features['boxes'].int64_list.value |
28 | 26 | ||
29 | assert len(boxes) % 5 == 0, 'Annotation error occured in box array.' | 27 | assert len(boxes) % 5 == 0, 'Annotation error occured in box array.' |
30 | box_cnt = len(boxes) // 5 | 28 | box_cnt = len(boxes) // 5 |
... | @@ -33,7 +31,7 @@ def parse_tfrecord(data): | ... | @@ -33,7 +31,7 @@ def parse_tfrecord(data): |
33 | labels = [] | 31 | labels = [] |
34 | 32 | ||
35 | for i in range(box_cnt): | 33 | for i in range(box_cnt): |
36 | - label, x_min, y_min, x_max, y_max = int(boxes[i * 5]), float(boxes[i * 5 + 1]), float(boxes[i * 5 + 2]), float(boxes[i * 5 + 3]) ## do we need to change int to float? is there float rectangle sample? | 34 | + label, x_min, y_min, x_max, y_max = int(boxes[i * 5]), float(boxes[i * 5 + 1]), float(boxes[i * 5 + 2]), float(boxes[i * 5 + 3]), float(boxes[i * 5 + 4]) ## do we need to change int to float? is there float rectangle sample? |
37 | aligned_boxes.append([x_min, y_min, x_max, y_max]) | 35 | aligned_boxes.append([x_min, y_min, x_max, y_max]) |
38 | labels.append(label) | 36 | labels.append(label) |
39 | 37 | ... | ... |
... | @@ -99,6 +99,8 @@ with tf.Session() as sess: | ... | @@ -99,6 +99,8 @@ with tf.Session() as sess: |
99 | sess.run([tf.global_variables_initializer()]) | 99 | sess.run([tf.global_variables_initializer()]) |
100 | if os.path.exists(args.restore_path): | 100 | if os.path.exists(args.restore_path): |
101 | saver_to_restore.restore(sess, args.restore_path) | 101 | saver_to_restore.restore(sess, args.restore_path) |
102 | + else: | ||
103 | + raise ValueError('there is no model to evaluate. You should move/create the checkpoint file to restore path') | ||
102 | 104 | ||
103 | print('\nStart evaluation...\n') | 105 | print('\nStart evaluation...\n') |
104 | 106 | ... | ... |
... | @@ -22,18 +22,18 @@ pred_scores_flag = tf.placeholder(tf.float32, [1, None, None]) | ... | @@ -22,18 +22,18 @@ pred_scores_flag = tf.placeholder(tf.float32, [1, None, None]) |
22 | gpu_nms_op = gpu_nms(pred_boxes_flag, pred_scores_flag, args.class_num, args.nms_topk, args.score_threshold, args.nms_threshold) | 22 | gpu_nms_op = gpu_nms(pred_boxes_flag, pred_scores_flag, args.class_num, args.nms_topk, args.score_threshold, args.nms_threshold) |
23 | 23 | ||
24 | ### tf.data pipeline | 24 | ### tf.data pipeline |
25 | -train_dataset = tf.data.TFRecordDataset(filenames=train_file, compression_type='GZIP') | 25 | +train_dataset = tf.data.TFRecordDataset(filenames=args.train_file, compression_type='GZIP') |
26 | -train_dataset = train_dataset.shuffle(train_img_cnt) | 26 | +train_dataset = train_dataset.shuffle(args.train_img_cnt) |
27 | -train_dataset = train_dataset.batch(batch_size) | 27 | +train_dataset = train_dataset.batch(args.batch_size) |
28 | train_dataset = train_dataset.map( | 28 | train_dataset = train_dataset.map( |
29 | lambda x: tf.py_func(get_batch_data, | 29 | lambda x: tf.py_func(get_batch_data, |
30 | inp=[x, args.class_num, args.img_size, args.anchors, True, args.multi_scale_train, args.use_mix_up, args.letterbox_resize], | 30 | inp=[x, args.class_num, args.img_size, args.anchors, True, args.multi_scale_train, args.use_mix_up, args.letterbox_resize], |
31 | Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]), | 31 | Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]), |
32 | num_parallel_calls=args.num_threads | 32 | num_parallel_calls=args.num_threads |
33 | ) | 33 | ) |
34 | -train_dataset = train_dataset.prefetch(prefetech_buffer) | 34 | +train_dataset = train_dataset.prefetch(args.prefetech_buffer) |
35 | 35 | ||
36 | -val_dataset = tf.data.TFRecordDataset(filenames=val_file, compression_type='GZIP') | 36 | +val_dataset = tf.data.TFRecordDataset(filenames=args.val_file, compression_type='GZIP') |
37 | val_dataset = val_dataset.batch(1) | 37 | val_dataset = val_dataset.batch(1) |
38 | val_dataset = val_dataset.map( | 38 | val_dataset = val_dataset.map( |
39 | lambda x: tf.py_func(get_batch_data, | 39 | lambda x: tf.py_func(get_batch_data, |
... | @@ -41,7 +41,7 @@ val_dataset = val_dataset.map( | ... | @@ -41,7 +41,7 @@ val_dataset = val_dataset.map( |
41 | Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]), | 41 | Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]), |
42 | num_parallel_calls=args.num_threads | 42 | num_parallel_calls=args.num_threads |
43 | ) | 43 | ) |
44 | -val_dataset.prefetch(prefetech_buffer) | 44 | +val_dataset.prefetch(args.prefetech_buffer) |
45 | 45 | ||
46 | iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes) | 46 | iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes) |
47 | train_init_op = iterator.make_initializer(train_dataset) | 47 | train_init_op = iterator.make_initializer(train_dataset) |
... | @@ -71,13 +71,13 @@ saver_to_restore = tf.train.Saver(var_list=tf.contrib.framework.get_variables_to | ... | @@ -71,13 +71,13 @@ saver_to_restore = tf.train.Saver(var_list=tf.contrib.framework.get_variables_to |
71 | update_vars = tf.contrib.framework.get_variables_to_restore(include=update_part) | 71 | update_vars = tf.contrib.framework.get_variables_to_restore(include=update_part) |
72 | 72 | ||
73 | 73 | ||
74 | -global_step = tf.Variable(float(global_step), trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES]) | 74 | +global_step = tf.Variable(float(args.global_step), trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES]) |
75 | if use_warm_up: | 75 | if use_warm_up: |
76 | learning_rate = tf.cond(tf.less(global_step, train_batch_num * warm_up_epoch), | 76 | learning_rate = tf.cond(tf.less(global_step, train_batch_num * warm_up_epoch), |
77 | lambda: learning_rate_init * global_step / (train_batch_num * warm_up_epoch), | 77 | lambda: learning_rate_init * global_step / (train_batch_num * warm_up_epoch), |
78 | - lambda: config_learning_rate(global_step - args.train_batch_num * args.warm_up_epoch)) | 78 | + lambda: config_learning_rate(args, global_step - args.train_batch_num * args.warm_up_epoch)) |
79 | else: | 79 | else: |
80 | - learning_rate = config_learning_rate(global_step) | 80 | + learning_rate = config_learning_rate(args, global_step) |
81 | 81 | ||
82 | optimizer = config_optimizer(args.optimizer_name, learning_rate) | 82 | optimizer = config_optimizer(args.optimizer_name, learning_rate) |
83 | 83 | ||
... | @@ -105,7 +105,7 @@ with tf.Session() as sess: | ... | @@ -105,7 +105,7 @@ with tf.Session() as sess: |
105 | if os.path.exists(args.restore_path): | 105 | if os.path.exists(args.restore_path): |
106 | saver_to_restore.restore(sess, args.restore_path) | 106 | saver_to_restore.restore(sess, args.restore_path) |
107 | 107 | ||
108 | - print('\nStart training...\n') | 108 | + print('\nStart training...: Total epoches =', args.total_epoches, '\n') |
109 | 109 | ||
110 | best_mAP = -np.Inf | 110 | best_mAP = -np.Inf |
111 | 111 | ||
... | @@ -163,7 +163,7 @@ with tf.Session() as sess: | ... | @@ -163,7 +163,7 @@ with tf.Session() as sess: |
163 | 163 | ||
164 | # calc mAP | 164 | # calc mAP |
165 | rec_total, prec_total, ap_total = AverageMeter(), AverageMeter(), AverageMeter() | 165 | rec_total, prec_total, ap_total = AverageMeter(), AverageMeter(), AverageMeter() |
166 | - gt_dict = parse_gt_rec(args.val_file, args.img_size, args.letterbox_resize) | 166 | + gt_dict = parse_gt_rec(args.val_file, 'GZIP', args.img_size, args.letterbox_resize) |
167 | 167 | ||
168 | info = '======> Epoch: {}, global_step: {}, lr: {:.6g} <======\n'.format(epoch, __global_step, __lr) | 168 | info = '======> Epoch: {}, global_step: {}, lr: {:.6g} <======\n'.format(epoch, __global_step, __lr) |
169 | 169 | ... | ... |
code/yolov3/yolov3.ipynb
0 → 100644
1 | +{ | ||
2 | + "nbformat": 4, | ||
3 | + "nbformat_minor": 0, | ||
4 | + "metadata": { | ||
5 | + "colab": { | ||
6 | + "name": "yolov3.ipynb", | ||
7 | + "provenance": [], | ||
8 | + "collapsed_sections": [] | ||
9 | + }, | ||
10 | + "kernelspec": { | ||
11 | + "name": "python3", | ||
12 | + "display_name": "Python 3" | ||
13 | + } | ||
14 | + }, | ||
15 | + "cells": [ | ||
16 | + { | ||
17 | + "cell_type": "code", | ||
18 | + "metadata": { | ||
19 | + "id": "p0y3wIkfSuIT", | ||
20 | + "colab_type": "code", | ||
21 | + "outputId": "eeedd664-406a-43ff-aa5e-bd48963494c4", | ||
22 | + "colab": { | ||
23 | + "base_uri": "https://localhost:8080/", | ||
24 | + "height": 53 | ||
25 | + } | ||
26 | + }, | ||
27 | + "source": [ | ||
28 | + "%tensorflow_version 1.x\n", | ||
29 | + "## Check your google colab/drive settings!!! (libraries, argument paths, ...)\n", | ||
30 | + "from google.colab import drive\n", | ||
31 | + "drive.mount('/content/gdrive')\n", | ||
32 | + "\n", | ||
33 | + "## variables for notebook\n", | ||
34 | + "training = True\n", | ||
35 | + "\n", | ||
36 | + "##### changes\n", | ||
37 | + "### changed some variable names because of argument conflicts\n", | ||
38 | + "### the last two cells are the train and test mode code. you can switch the mode with the 'training' variable above\n", | ||
39 | + "### separating the train/eval code into functions is difficult because of variable dependencies" | ||
40 | + ], | ||
41 | + "execution_count": 1, | ||
42 | + "outputs": [ | ||
43 | + { | ||
44 | + "output_type": "stream", | ||
45 | + "text": [ | ||
46 | + "TensorFlow 1.x selected.\n", | ||
47 | + "Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount(\"/content/gdrive\", force_remount=True).\n" | ||
48 | + ], | ||
49 | + "name": "stdout" | ||
50 | + } | ||
51 | + ] | ||
52 | + }, | ||
53 | + { | ||
54 | + "cell_type": "code", | ||
55 | + "metadata": { | ||
56 | + "id": "Yh3RWBkgAjZx", | ||
57 | + "colab_type": "code", | ||
58 | + "colab": {} | ||
59 | + }, | ||
60 | + "source": [ | ||
61 | + "## TFRecord utils here\n", | ||
62 | + "import tensorflow as tf\n", | ||
63 | + "from itertools import tee\n", | ||
64 | + "\n", | ||
65 | + "class TFRecordIterator:\n", | ||
66 | + " def __init__(self, path, compression=None):\n", | ||
67 | + " self._core = tf.python_io.tf_record_iterator(path, tf.python_io.TFRecordOptions(compression))\n", | ||
68 | + " self._iterator = iter(self._core)\n", | ||
69 | + " self._iterator, self._iterator_temp = tee(self._iterator)\n", | ||
70 | + " self._total_cnt = sum(1 for _ in self._iterator_temp)\n", | ||
71 | + "\n", | ||
72 | + " def _read_value(self, feature):\n", | ||
73 | + " if len(feature.int64_list.value) > 0:\n", | ||
74 | + " return feature.int64_list.value\n", | ||
75 | + "\n", | ||
76 | + " if len(feature.bytes_list.value) > 0:\n", | ||
77 | + " return feature.bytes_list.value\n", | ||
78 | + "\n", | ||
79 | + " if len(feature.float_list.value) > 0:\n", | ||
80 | + " return feature.float_list.value\n", | ||
81 | + "\n", | ||
82 | + " return None\n", | ||
83 | + "\n", | ||
84 | + " def _read_features(self, features):\n", | ||
85 | + " d = dict()\n", | ||
86 | + " for data in features:\n", | ||
87 | + " d[data] = self._read_value(features[data])\n", | ||
88 | + " return d\n", | ||
89 | + "\n", | ||
90 | + " def __enter__(self):\n", | ||
91 | + " return self\n", | ||
92 | + "\n", | ||
93 | + " def __exit__(self, exception_type, exception_value, traceback):\n", | ||
94 | + " pass\n", | ||
95 | + "\n", | ||
96 | + " def __iter__(self):\n", | ||
97 | + " return self\n", | ||
98 | + "\n", | ||
99 | + " def __next__(self):\n", | ||
100 | + " record = next(self._iterator)\n", | ||
101 | + " example = tf.train.Example()\n", | ||
102 | + " example.ParseFromString(record)\n", | ||
103 | + " return self._read_features(example.features.feature)\n", | ||
104 | + "\n", | ||
105 | + " def count(self):\n", | ||
106 | + " return self._total_cnt\n" | ||
107 | + ], | ||
108 | + "execution_count": 0, | ||
109 | + "outputs": [] | ||
110 | + }, | ||
111 | + { | ||
112 | + "cell_type": "code", | ||
113 | + "metadata": { | ||
114 | + "id": "oCVOPE2XC3qE", | ||
115 | + "colab_type": "code", | ||
116 | + "colab": {} | ||
117 | + }, | ||
118 | + "source": [ | ||
119 | + "## plot utils\n", | ||
120 | + "from __future__ import division, print_function\n", | ||
121 | + "\n", | ||
122 | + "import cv2\n", | ||
123 | + "import random\n", | ||
124 | + "\n", | ||
125 | + "def get_color_table(class_num, seed=2):\n", | ||
126 | + " random.seed(seed)\n", | ||
127 | + " color_table = {}\n", | ||
128 | + " for i in range(class_num):\n", | ||
129 | + " color_table[i] = [random.randint(0, 255) for _ in range(3)]\n", | ||
130 | + " return color_table\n", | ||
131 | + "\n", | ||
132 | + "\n", | ||
133 | + "def plot_one_box(img, coord, label=None, color=None, line_thickness=None):\n", | ||
134 | + " tl = line_thickness or int(round(0.002 * max(img.shape[0:2]))) # line thickness\n", | ||
135 | + " color = color or [random.randint(0, 255) for _ in range(3)]\n", | ||
136 | + " c1, c2 = (int(coord[0]), int(coord[1])), (int(coord[2]), int(coord[3]))\n", | ||
137 | + " cv2.rectangle(img, c1, c2, color, thickness=tl)\n", | ||
138 | + " if label:\n", | ||
139 | + " tf = max(tl - 1, 1) # font thickness\n", | ||
140 | + " t_size = cv2.getTextSize(label, 0, fontScale=float(tl) / 3, thickness=tf)[0]\n", | ||
141 | + " c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3\n", | ||
142 | + " cv2.rectangle(img, c1, c2, color, -1) # filled\n", | ||
143 | + " cv2.putText(img, label, (c1[0], c1[1] - 2), 0, float(tl) / 3, [0, 0, 0], thickness=tf, lineType=cv2.LINE_AA)" | ||
144 | + ], | ||
145 | + "execution_count": 0, | ||
146 | + "outputs": [] | ||
147 | + }, | ||
148 | + { | ||
149 | + "cell_type": "code", | ||
150 | + "metadata": { | ||
151 | + "id": "SY10K9LoDJOZ", | ||
152 | + "colab_type": "code", | ||
153 | + "colab": {} | ||
154 | + }, | ||
155 | + "source": [ | ||
156 | + "## nms utils\n", | ||
157 | + "import numpy as np\n", | ||
158 | + "\n", | ||
159 | + "def gpu_nms(boxes, scores, num_classes, max_boxes=50, score_thresh=0.5, nms_thresh=0.5):\n", | ||
160 | + " boxes_list, label_list, score_list = [], [], []\n", | ||
161 | + " max_boxes = tf.constant(max_boxes, dtype='int32')\n", | ||
162 | + "\n", | ||
163 | + " boxes = tf.reshape(boxes, [-1, 4]) # '-1' means we don't konw the exact number of boxes\n", | ||
164 | + " score = tf.reshape(scores, [-1, num_classes])\n", | ||
165 | + "\n", | ||
166 | + " # Step 1: Create a filtering mask based on \"box_class_scores\" by using \"threshold\".\n", | ||
167 | + " mask = tf.greater_equal(score, tf.constant(score_thresh))\n", | ||
168 | + " # Step 2: Do non_max_suppression for each class\n", | ||
169 | + " for i in range(num_classes):\n", | ||
170 | + " # Step 3: Apply the mask to scores, boxes and pick them out\n", | ||
171 | + " filter_boxes = tf.boolean_mask(boxes, mask[:,i])\n", | ||
172 | + " filter_score = tf.boolean_mask(score[:,i], mask[:,i])\n", | ||
173 | + " nms_indices = tf.image.non_max_suppression(boxes=filter_boxes,\n", | ||
174 | + " scores=filter_score,\n", | ||
175 | + " max_output_size=max_boxes,\n", | ||
176 | + " iou_threshold=nms_thresh, name='nms_indices')\n", | ||
177 | + " label_list.append(tf.ones_like(tf.gather(filter_score, nms_indices), 'int32')*i)\n", | ||
178 | + " boxes_list.append(tf.gather(filter_boxes, nms_indices))\n", | ||
179 | + " score_list.append(tf.gather(filter_score, nms_indices))\n", | ||
180 | + "\n", | ||
181 | + " boxes = tf.concat(boxes_list, axis=0)\n", | ||
182 | + " score = tf.concat(score_list, axis=0)\n", | ||
183 | + " label = tf.concat(label_list, axis=0)\n", | ||
184 | + "\n", | ||
185 | + " return boxes, score, label\n", | ||
186 | + "\n", | ||
187 | + "\n", | ||
188 | + "def py_nms(boxes, scores, max_boxes=50, iou_thresh=0.5):\n", | ||
189 | + " assert boxes.shape[1] == 4 and len(scores.shape) == 1\n", | ||
190 | + "\n", | ||
191 | + " x1 = boxes[:, 0]\n", | ||
192 | + " y1 = boxes[:, 1]\n", | ||
193 | + " x2 = boxes[:, 2]\n", | ||
194 | + " y2 = boxes[:, 3]\n", | ||
195 | + "\n", | ||
196 | + " areas = (x2 - x1) * (y2 - y1)\n", | ||
197 | + " order = scores.argsort()[::-1]\n", | ||
198 | + "\n", | ||
199 | + " keep = []\n", | ||
200 | + " while order.size > 0:\n", | ||
201 | + " i = order[0]\n", | ||
202 | + " keep.append(i)\n", | ||
203 | + " xx1 = np.maximum(x1[i], x1[order[1:]])\n", | ||
204 | + " yy1 = np.maximum(y1[i], y1[order[1:]])\n", | ||
205 | + " xx2 = np.minimum(x2[i], x2[order[1:]])\n", | ||
206 | + " yy2 = np.minimum(y2[i], y2[order[1:]])\n", | ||
207 | + "\n", | ||
208 | + " w = np.maximum(0.0, xx2 - xx1 + 1)\n", | ||
209 | + " h = np.maximum(0.0, yy2 - yy1 + 1)\n", | ||
210 | + " inter = w * h\n", | ||
211 | + " ovr = inter / (areas[i] + areas[order[1:]] - inter)\n", | ||
212 | + "\n", | ||
213 | + " inds = np.where(ovr <= iou_thresh)[0]\n", | ||
214 | + " order = order[inds + 1]\n", | ||
215 | + "\n", | ||
216 | + " return keep[:max_boxes]\n", | ||
217 | + "\n", | ||
218 | + "\n", | ||
219 | + "def cpu_nms(boxes, scores, num_classes, max_boxes=50, score_thresh=0.5, iou_thresh=0.5):\n", | ||
220 | + " boxes = boxes.reshape(-1, 4)\n", | ||
221 | + " scores = scores.reshape(-1, num_classes)\n", | ||
222 | + " picked_boxes, picked_score, picked_label = [], [], []\n", | ||
223 | + "\n", | ||
224 | + " for i in range(num_classes):\n", | ||
225 | + " indices = np.where(scores[:,i] >= score_thresh)\n", | ||
226 | + " filter_boxes = boxes[indices]\n", | ||
227 | + " filter_scores = scores[:,i][indices]\n", | ||
228 | + " if len(filter_boxes) == 0: \n", | ||
229 | + " continue\n", | ||
230 | + "\n", | ||
231 | + " indices = py_nms(filter_boxes, filter_scores,\n", | ||
232 | + " max_boxes=max_boxes, iou_thresh=iou_thresh)\n", | ||
233 | + " picked_boxes.append(filter_boxes[indices])\n", | ||
234 | + " picked_score.append(filter_scores[indices])\n", | ||
235 | + " picked_label.append(np.ones(len(indices), dtype='int32')*i)\n", | ||
236 | + " if len(picked_boxes) == 0: \n", | ||
237 | + " return None, None, None\n", | ||
238 | + "\n", | ||
239 | + " boxes = np.concatenate(picked_boxes, axis=0)\n", | ||
240 | + " score = np.concatenate(picked_score, axis=0)\n", | ||
241 | + " label = np.concatenate(picked_label, axis=0)\n", | ||
242 | + "\n", | ||
243 | + " return boxes, score, label" | ||
244 | + ], | ||
245 | + "execution_count": 0, | ||
246 | + "outputs": [] | ||
247 | + }, | ||
248 | + { | ||
249 | + "cell_type": "code", | ||
250 | + "metadata": { | ||
251 | + "id": "Dg-ZKHmRDlPp", | ||
252 | + "colab_type": "code", | ||
253 | + "colab": {} | ||
254 | + }, | ||
255 | + "source": [ | ||
256 | + "## misc utils\n", | ||
257 | + "class AverageMeter(object):\n", | ||
258 | + " def __init__(self):\n", | ||
259 | + " self.reset()\n", | ||
260 | + "\n", | ||
261 | + " def reset(self):\n", | ||
262 | + " self.val = 0\n", | ||
263 | + " self.average = 0\n", | ||
264 | + " self.sum = 0\n", | ||
265 | + " self.count = 0\n", | ||
266 | + "\n", | ||
267 | + " def update(self, val, n=1):\n", | ||
268 | + " self.val = val\n", | ||
269 | + " self.sum += val * n\n", | ||
270 | + " self.count += n\n", | ||
271 | + " self.average = self.sum / float(self.count)\n", | ||
272 | + "\n", | ||
273 | + "\n", | ||
274 | + "def parse_anchors(anchor_path):\n", | ||
275 | + " anchors = np.reshape(np.asarray(open(anchor_path, 'r').read().split(','), np.float32), [-1, 2])\n", | ||
276 | + " return anchors\n", | ||
277 | + "\n", | ||
278 | + "\n", | ||
279 | + "def read_class_names(class_name_path):\n", | ||
280 | + " names = {}\n", | ||
281 | + " with open(class_name_path, 'r') as data:\n", | ||
282 | + " for ID, name in enumerate(data):\n", | ||
283 | + " names[ID] = name.strip('\\n')\n", | ||
284 | + " return names\n", | ||
285 | + "\n", | ||
286 | + "\n", | ||
287 | + "def shuffle_and_overwrite(file_name):\n", | ||
288 | + " content = open(file_name, 'r').readlines()\n", | ||
289 | + " random.shuffle(content)\n", | ||
290 | + " with open(file_name, 'w') as f:\n", | ||
291 | + " for line in content:\n", | ||
292 | + " f.write(line)\n", | ||
293 | + "\n", | ||
294 | + "\n", | ||
295 | + "def update_dict(ori_dict, new_dict):\n", | ||
296 | + " if not ori_dict:\n", | ||
297 | + " return new_dict\n", | ||
298 | + " for key in ori_dict:\n", | ||
299 | + " ori_dict[key] += new_dict[key]\n", | ||
300 | + " return ori_dict\n", | ||
301 | + "\n", | ||
302 | + "\n", | ||
303 | + "def list_add(ori_list, new_list):\n", | ||
304 | + " for i in range(len(ori_list)):\n", | ||
305 | + " ori_list[i] += new_list[i]\n", | ||
306 | + " return ori_list\n", | ||
307 | + "\n", | ||
308 | + "\n", | ||
309 | + "def load_weights(var_list, weights_file):\n", | ||
310 | + " with open(weights_file, \"rb\") as fp:\n", | ||
311 | + " np.fromfile(fp, dtype=np.int32, count=5)\n", | ||
312 | + " weights = np.fromfile(fp, dtype=np.float32)\n", | ||
313 | + "\n", | ||
314 | + " ptr = 0\n", | ||
315 | + " i = 0\n", | ||
316 | + " assign_ops = []\n", | ||
317 | + " while i < len(var_list) - 1:\n", | ||
318 | + " var1 = var_list[i]\n", | ||
319 | + " var2 = var_list[i + 1]\n", | ||
320 | + " if 'Conv' in var1.name.split('/')[-2]:\n", | ||
321 | + " if 'BatchNorm' in var2.name.split('/')[-2]:\n", | ||
322 | + " gamma, beta, mean, var = var_list[i + 1:i + 5]\n", | ||
323 | + " batch_norm_vars = [beta, gamma, mean, var]\n", | ||
324 | + " for var in batch_norm_vars:\n", | ||
325 | + " shape = var.shape.as_list()\n", | ||
326 | + " num_params = np.prod(shape)\n", | ||
327 | + " var_weights = weights[ptr:ptr + num_params].reshape(shape)\n", | ||
328 | + " ptr += num_params\n", | ||
329 | + " assign_ops.append(tf.assign(var, var_weights, validate_shape=True))\n", | ||
330 | + " i += 4\n", | ||
331 | + " elif 'Conv' in var2.name.split('/')[-2]:\n", | ||
332 | + " # load biases\n", | ||
333 | + " bias = var2\n", | ||
334 | + " bias_shape = bias.shape.as_list()\n", | ||
335 | + " bias_params = np.prod(bias_shape)\n", | ||
336 | + " bias_weights = weights[ptr:ptr +\n", | ||
337 | + " bias_params].reshape(bias_shape)\n", | ||
338 | + " ptr += bias_params\n", | ||
339 | + " assign_ops.append(tf.assign(bias, bias_weights, validate_shape=True))\n", | ||
340 | + " i += 1\n", | ||
341 | + "\n", | ||
342 | + " shape = var1.shape.as_list()\n", | ||
343 | + " num_params = np.prod(shape)\n", | ||
344 | + "\n", | ||
345 | + " var_weights = weights[ptr:ptr + num_params].reshape(\n", | ||
346 | + " (shape[3], shape[2], shape[0], shape[1]))\n", | ||
347 | + "\n", | ||
348 | + " var_weights = np.transpose(var_weights, (2, 3, 1, 0))\n", | ||
349 | + " ptr += num_params\n", | ||
350 | + " assign_ops.append(\n", | ||
351 | + " tf.assign(var1, var_weights, validate_shape=True))\n", | ||
352 | + " i += 1\n", | ||
353 | + "\n", | ||
354 | + " return assign_ops\n", | ||
355 | + "\n", | ||
356 | + "\n", | ||
357 | + "def config_learning_rate(global_step):\n", | ||
358 | + " ## fixes for removing arg paramter\n", | ||
359 | + " global lr_type, learning_rate_init, lr_decay_freq, lr_decay_factor, lr_lower_bound, total_epoches, use_warm_up, warm_up_epoch, train_batch_num, lr_lower_bound, pw_boundaries, pw_values\n", | ||
360 | + "\n", | ||
361 | + " if lr_type == 'exponential':\n", | ||
362 | + " lr_tmp = tf.train.exponential_decay(learning_rate_init, global_step, lr_decay_freq,\n", | ||
363 | + " lr_decay_factor, staircase=True, name='exponential_learning_rate')\n", | ||
364 | + " return tf.maximum(lr_tmp, lr_lower_bound)\n", | ||
365 | + " elif lr_type == 'cosine_decay':\n", | ||
366 | + " train_steps = (total_epoches - float(use_warm_up) * warm_up_epoch) * train_batch_num\n", | ||
367 | + " return lr_lower_bound + 0.5 * (learning_rate_init - lr_lower_bound) * \\\n", | ||
368 | + " (1 + tf.cos(global_step / train_steps * np.pi))\n", | ||
369 | + " elif lr_type == 'cosine_decay_restart':\n", | ||
370 | + " return tf.train.cosine_decay_restarts(learning_rate_init, global_step, \n", | ||
371 | + " lr_decay_freq, t_mul=2.0, m_mul=1.0, \n", | ||
372 | + " name='cosine_decay_learning_rate_restart')\n", | ||
373 | + " elif lr_type == 'fixed':\n", | ||
374 | + " return tf.convert_to_tensor(learning_rate_init, name='fixed_learning_rate')\n", | ||
375 | + " elif lr_type == 'piecewise':\n", | ||
376 | + " return tf.train.piecewise_constant(global_step, boundaries=pw_boundaries, values=pw_values,\n", | ||
377 | + " name='piecewise_learning_rate')\n", | ||
378 | + " else:\n", | ||
379 | + " raise ValueError('Unsupported learning rate type!')\n", | ||
380 | + "\n", | ||
381 | + "\n", | ||
382 | + "def config_optimizer(optimizer_name, learning_rate, decay=0.9, momentum=0.9):\n", | ||
383 | + " if optimizer_name == 'momentum':\n", | ||
384 | + " return tf.train.MomentumOptimizer(learning_rate, momentum=momentum)\n", | ||
385 | + " elif optimizer_name == 'rmsprop':\n", | ||
386 | + " return tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=momentum)\n", | ||
387 | + " elif optimizer_name == 'adam':\n", | ||
388 | + " return tf.train.AdamOptimizer(learning_rate)\n", | ||
389 | + " elif optimizer_name == 'sgd':\n", | ||
390 | + " return tf.train.GradientDescentOptimizer(learning_rate)\n", | ||
391 | + " else:\n", | ||
392 | + " raise ValueError('Unsupported optimizer type!')" | ||
393 | + ], | ||
394 | + "execution_count": 0, | ||
395 | + "outputs": [] | ||
396 | + }, | ||
397 | + { | ||
398 | + "cell_type": "code", | ||
399 | + "metadata": { | ||
400 | + "id": "YIlZhFLYD0d8", | ||
401 | + "colab_type": "code", | ||
402 | + "colab": {} | ||
403 | + }, | ||
404 | + "source": [ | ||
405 | + "## data utils\n", | ||
406 | + "\n", | ||
407 | + "import sys\n", | ||
408 | + "\n", | ||
409 | + "PY_VERSION = sys.version_info[0]\n", | ||
410 | + "iter_cnt = 0\n", | ||
411 | + "\n", | ||
def _parse_tfrecord(data):
    """Deserialize one serialized ``tf.train.Example`` and return its feature map."""
    example = tf.train.Example()
    example.ParseFromString(data)
    return example.features.feature


def _split_box_array(boxes):
    """Split a flat ``[label, x_min, y_min, x_max, y_max, ...]`` sequence.

    Args:
        boxes: flat sequence whose length is a multiple of 5.

    Returns:
        ``(aligned_boxes, labels)``: a float32 array of shape [N, 4] holding
        (x_min, y_min, x_max, y_max) and an int64 array of shape [N].
        With no boxes the shapes are [0, 4] and [0], so the box array keeps
        its second axis and can still be concatenated column-wise.

    Raises:
        AssertionError: if the sequence length is not a multiple of 5.
    """
    flat = np.asarray(list(boxes), dtype=np.float64)
    # Explicit raise (same exception type as the former `assert`) so the
    # check is not stripped when Python runs with -O.
    if flat.size % 5 != 0:
        raise AssertionError('Annotation error occured in box array.')

    records = flat.reshape(-1, 5)
    aligned_boxes = records[:, 1:5].astype(np.float32)
    labels = records[:, 0].astype(np.int64)
    return aligned_boxes, labels


def parse_tfrecord(data):
    """Parse one raw serialized record (as read by TFRecordDataset).

    Returns:
        ``(index, encoded_image, aligned_boxes, labels, width, height)`` where
        ``encoded_image`` is a uint8 view of the stored image bytes and the
        box/label arrays are as described in ``_split_box_array``.

    Raises:
        AssertionError: if the stored box array length is not a multiple of 5.
    """
    features = _parse_tfrecord(data)
    index = features['index'].int64_list.value[0]
    encoded_image = np.frombuffer(features['image'].bytes_list.value[0], dtype=np.uint8)
    width = features['width'].int64_list.value[0]
    height = features['height'].int64_list.value[0]
    aligned_boxes, labels = _split_box_array(features['boxes'].int64_list.value)

    return index, encoded_image, aligned_boxes, labels, width, height


def parse_record(features):
    """Parse one already-decoded record (as yielded by TFRecordIterator).

    Args:
        features: mapping with keys 'index', 'image', 'width', 'height'
            (each a sequence whose first element is used) and 'boxes'
            (flat sequence of 5 values per box).

    Returns:
        ``(index, encoded_image, aligned_boxes, labels, width, height)`` with
        the same layout as ``parse_tfrecord``.

    Raises:
        AssertionError: if the box array length is not a multiple of 5.
    """
    index = int(features['index'][0])
    encoded_image = np.frombuffer(features['image'][0], dtype=np.uint8)
    width = int(features['width'][0])
    height = int(features['height'][0])
    aligned_boxes, labels = _split_box_array(features['boxes'])

    return index, encoded_image, aligned_boxes, labels, width, height
467 | + "\n", | ||
def bbox_crop(bbox, crop_box=None, allow_outside_center=True):
    """Clip boxes to a crop window and shift them into window coordinates.

    Args:
        bbox: array whose first four columns are (x_min, y_min, x_max, y_max).
            The input is copied, never modified.
        crop_box: ``(left, top, width, height)``; falsy entries mean
            "0" for left/top and "unbounded" for width/height. ``None`` (or
            four ``None`` entries) returns the copied boxes unchanged.
        allow_outside_center: when False, boxes whose center is outside the
            window are dropped.

    Returns:
        The surviving boxes; boxes that end up with no positive extent are removed.

    Raises:
        ValueError: if ``crop_box`` does not have exactly four entries.
    """
    cropped = bbox.copy()
    if crop_box is None:
        return cropped
    if len(crop_box) != 4:
        raise ValueError(
            "Invalid crop_box parameter, requires length 4, given {}".format(str(crop_box)))
    if all(entry is None for entry in crop_box):
        return cropped

    l, t, w, h = crop_box
    x0 = l or 0
    y0 = t or 0
    window = np.array((x0, y0, x0 + (w or np.inf), y0 + (h or np.inf)))
    top_left, bottom_right = window[:2], window[2:4]

    if allow_outside_center:
        keep = np.ones(cropped.shape[0], dtype=bool)
    else:
        centers = (cropped[:, :2] + cropped[:, 2:4]) / 2
        keep = np.logical_and(top_left <= centers, centers < window[2:]).all(axis=1)

    # Clip to the window, then express the corners relative to its origin.
    cropped[:, :2] = np.maximum(cropped[:, :2], top_left)
    cropped[:, 2:4] = np.minimum(cropped[:, 2:4], bottom_right)
    cropped[:, :2] -= top_left
    cropped[:, 2:4] -= top_left

    has_extent = (cropped[:, :2] < cropped[:, 2:4]).all(axis=1)
    return cropped[np.logical_and(keep, has_extent)]
501 | + "\n", | ||
def bbox_iou(bbox_a, bbox_b, offset=0):
    """Pairwise intersection-over-union of two sets of corner boxes.

    Args:
        bbox_a: array [N, >=4], columns 0-3 are (x_min, y_min, x_max, y_max).
        bbox_b: array [M, >=4], same layout.
        offset: value added to every width/height before taking areas.

    Returns:
        Array [N, M] with the IoU of each (a, b) pair.

    Raises:
        IndexError: if either input has fewer than four columns.
    """
    if bbox_a.shape[1] < 4 or bbox_b.shape[1] < 4:
        raise IndexError("Bounding boxes axis 1 must have at least length 4")

    def box_area(boxes):
        return np.prod(boxes[:, 2:4] - boxes[:, :2] + offset, axis=1)

    # Broadcast [N, 1, 2] against [M, 2] to get every pair's overlap corners.
    inter_tl = np.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
    inter_br = np.minimum(bbox_a[:, None, 2:4], bbox_b[:, 2:4])

    # Pairs without a positive overlap on both axes contribute zero area.
    overlaps = (inter_tl < inter_br).all(axis=2)
    inter_area = np.prod(inter_br - inter_tl + offset, axis=2) * overlaps

    union_area = box_area(bbox_a)[:, None] + box_area(bbox_b) - inter_area
    return inter_area / union_area
513 | + "\n", | ||
514 | + "\n", | ||
def random_crop_with_constraints(bbox, size, min_scale=0.3, max_scale=1,
                                 max_aspect_ratio=2, constraints=None,
                                 max_trial=50):
    """Pick a random crop window subject to IoU constraints with the boxes.

    For every ``(min_iou, max_iou)`` constraint up to ``max_trial`` random
    windows are sampled; the first one whose IoU with all boxes lies inside
    the constraint becomes a candidate. One candidate (the full image is
    always among them) is then chosen at random and the boxes are cropped
    to it with ``bbox_crop(..., allow_outside_center=False)``.

    Args:
        bbox: array of boxes, columns 0-3 are (x_min, y_min, x_max, y_max).
        size: ``(width, height)`` of the image.
        min_scale, max_scale: range of the crop scale relative to the image.
        max_aspect_ratio: bound on the aspect-ratio change of the crop.
        constraints: iterable of ``(min_iou, max_iou)`` pairs, ``None`` meaning
            unbounded on that side. Defaults to the six constraints below.
        max_trial: sampling attempts per constraint.

    Returns:
        ``(new_bbox, (left, top, width, height))``. If ``bbox`` is empty the
        first sampled window is returned directly. If every candidate
        removes all boxes, the original boxes and the full image are returned.
    """
    # default params in paper
    if constraints is None:
        constraints = (
            (0.1, None),
            (0.3, None),
            (0.5, None),
            (0.7, None),
            (0.9, None),
            (None, 1),
        )

    w, h = size

    candidates = [(0, 0, w, h)]
    for min_iou, max_iou in constraints:
        min_iou = -np.inf if min_iou is None else min_iou
        max_iou = np.inf if max_iou is None else max_iou

        for _ in range(max_trial):
            scale = random.uniform(min_scale, max_scale)
            aspect_ratio = random.uniform(
                max(1 / max_aspect_ratio, scale * scale),
                min(max_aspect_ratio, 1 / (scale * scale)))
            crop_h = int(h * scale / np.sqrt(aspect_ratio))
            crop_w = int(w * scale * np.sqrt(aspect_ratio))

            # Valid offsets are 0..(h - crop_h) inclusive. Without the +1 a
            # crop spanning the full height/width (e.g. scale == 1) makes
            # randrange(0) raise ValueError, and the last offset is never used.
            crop_t = random.randrange(h - crop_h + 1)
            crop_l = random.randrange(w - crop_w + 1)
            crop = (crop_l, crop_t, crop_w, crop_h)

            if len(bbox) == 0:
                # Nothing to constrain against: accept the first window.
                return bbox, crop

            crop_bb = np.array((crop_l, crop_t, crop_l + crop_w, crop_t + crop_h))
            iou = bbox_iou(bbox, crop_bb[np.newaxis])
            if min_iou <= iou.min() and iou.max() <= max_iou:
                candidates.append(crop)
                break

    # random select one
    while candidates:
        crop = candidates.pop(np.random.randint(0, len(candidates)))
        new_bbox = bbox_crop(bbox, crop, allow_outside_center=False)
        if new_bbox.size < 1:
            continue
        return new_bbox, (crop[0], crop[1], crop[2], crop[3])
    return bbox, (0, 0, w, h)
569 | + "\n", | ||
570 | + "\n", | ||
def random_color_distort(img, brightness_delta=32, hue_vari=18, sat_vari=0.5, val_vari=0.5):
    """Randomly jitter brightness, hue, saturation and value of an image.

    Each of the four distortions is applied only when a fresh
    ``np.random.uniform(0, 1)`` draw exceeds 0.5. The image is converted with
    ``cv2.COLOR_BGR2HSV`` / ``cv2.COLOR_HSV2BGR`` around the HSV jitter.

    Args:
        img: image array passed to ``cv2.cvtColor`` as BGR.
        brightness_delta: bound of the additive brightness shift.
        hue_vari: bound of the additive hue shift (applied modulo 180).
        sat_vari: bound of the relative saturation change.
        val_vari: bound of the relative value change.

    Returns:
        The distorted image as returned by ``cv2.cvtColor`` (uint8 input).
    """
    def should_apply(p=0.5):
        return np.random.uniform(0, 1) > p

    def jitter_hue(hsv):
        if should_apply():
            shift = np.random.randint(-hue_vari, hue_vari)
            hsv[:, :, 0] = (hsv[:, :, 0] + shift) % 180
        return hsv

    def scale_channel(hsv, channel, vari):
        if should_apply():
            factor = 1 + np.random.uniform(-vari, vari)
            hsv[:, :, channel] *= factor
        return hsv

    def jitter_saturation(hsv):
        return scale_channel(hsv, 1, sat_vari)

    def jitter_value(hsv):
        return scale_channel(hsv, 2, val_vari)

    # brightness
    if should_apply():
        img = img.astype(np.float32)
        shift = int(np.random.uniform(-brightness_delta, brightness_delta))
        img = img + shift
    img = np.clip(img, 0, 255).astype(np.uint8)

    # color jitter
    img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.float32)

    # The order of the HSV steps is itself randomized.
    if np.random.randint(0, 2):
        steps = (jitter_value, jitter_saturation, jitter_hue)
    else:
        steps = (jitter_saturation, jitter_hue, jitter_value)
    for step in steps:
        img_hsv = step(img_hsv)

    img_hsv = np.clip(img_hsv, 0, 255)
    return cv2.cvtColor(img_hsv.astype(np.uint8), cv2.COLOR_HSV2BGR)
617 | + "\n", | ||
618 | + "\n", | ||
def letterbox_resize(img, new_width, new_height, interp=0):
    """Resize an image with its aspect ratio kept, padding the rest with 128.

    Args:
        img: image array; its first two dimensions are (height, width).
        new_width, new_height: size of the output canvas.
        interp: interpolation flag forwarded to ``cv2.resize``.

    Returns:
        ``(image_padded, resize_ratio, dw, dh)``: the uint8 canvas of shape
        (new_height, new_width, 3), the scale factor applied to the image,
        and the left/top padding in pixels.
    """
    src_h, src_w = img.shape[:2]

    # The tighter of the two axis ratios makes the whole image fit.
    scale = min(new_width / src_w, new_height / src_h)
    scaled_w = int(scale * src_w)
    scaled_h = int(scale * src_h)
    resized = cv2.resize(img, (scaled_w, scaled_h), interpolation=interp)

    pad_x = int((new_width - scaled_w) / 2)
    pad_y = int((new_height - scaled_h) / 2)

    canvas = np.full((new_height, new_width, 3), 128, np.uint8)
    canvas[pad_y: scaled_h + pad_y, pad_x: scaled_w + pad_x, :] = resized

    return canvas, scale, pad_x, pad_y
636 | + "\n", | ||
637 | + "\n", | ||
def resize_with_bbox(img, bbox, new_width, new_height, interp=0, letterbox=False):
    """Resize an image and rescale its boxes to match.

    Args:
        img: image array; its first two dimensions are (height, width).
        bbox: array whose columns 0-3 are (x_min, y_min, x_max, y_max).
            It is updated in place and also returned.
        new_width, new_height: target image size.
        interp: interpolation flag forwarded to ``cv2.resize``.
        letterbox: when True use ``letterbox_resize`` (aspect ratio kept,
            padded); otherwise stretch to the target size.

    Returns:
        ``(resized_image, bbox)``.
    """
    x_cols, y_cols = [0, 2], [1, 3]

    if letterbox:
        padded, ratio, pad_x, pad_y = letterbox_resize(img, new_width, new_height, interp)
        # Same scale on both axes, then shift by the padding.
        bbox[:, x_cols] = bbox[:, x_cols] * ratio + pad_x
        bbox[:, y_cols] = bbox[:, y_cols] * ratio + pad_y
        return padded, bbox

    src_h, src_w = img.shape[:2]
    stretched = cv2.resize(img, (new_width, new_height), interpolation=interp)
    # Independent scale per axis.
    bbox[:, x_cols] = bbox[:, x_cols] / src_w * new_width
    bbox[:, y_cols] = bbox[:, y_cols] / src_h * new_height
    return stretched, bbox
659 | + "\n", | ||
660 | + "\n", | ||
def random_flip(img, bbox, px=0, py=0):
    """Randomly mirror an image and its boxes.

    Args:
        img: image array; its first two dimensions are (height, width).
        bbox: array whose columns 0-3 are (x_min, y_min, x_max, y_max).
            It is updated in place when a flip happens.
        px: probability of a horizontal flip (``cv2.flip(img, 1)``).
        py: probability of a vertical flip (``cv2.flip(img, 0)``).

    Returns:
        ``(img, bbox)`` after the flips that were drawn.
    """
    img_h, img_w = img.shape[:2]

    if np.random.uniform(0, 1) < px:
        img = cv2.flip(img, 1)
        # Mirrored x_min comes from the old x_max and vice versa.
        bbox[:, [0, 2]] = img_w - bbox[:, [2, 0]]

    if np.random.uniform(0, 1) < py:
        img = cv2.flip(img, 0)
        bbox[:, [1, 3]] = img_h - bbox[:, [3, 1]]

    return img, bbox
677 | + "\n", | ||
678 | + "\n", | ||
def random_expand(img, bbox, max_ratio=4, fill=0, keep_ratio=True):
    """Place the image at a random position on a larger, filled canvas.

    Args:
        img: image array of shape (height, width, channels).
        bbox: array whose columns 0-3 are (x_min, y_min, x_max, y_max).
            It is shifted in place by the chosen offset.
        max_ratio: upper bound of the random enlargement factor (lower bound 1).
        fill: value used for the canvas outside the image.
        keep_ratio: when True the same factor is used for both axes.

    Returns:
        ``(canvas, bbox)``; the canvas has the dtype of ``img``.
    """
    src_h, src_w, channels = img.shape

    scale_x = random.uniform(1, max_ratio)
    scale_y = scale_x if keep_ratio else random.uniform(1, max_ratio)

    out_h, out_w = int(src_h * scale_y), int(src_w * scale_x)
    top = random.randint(0, out_h - src_h)
    left = random.randint(0, out_w - src_w)

    canvas = np.full(shape=(out_h, out_w, channels), fill_value=fill, dtype=img.dtype)
    canvas[top:top + src_h, left:left + src_w, :] = img

    # correct bbox
    shift = (left, top)
    bbox[:, :2] += shift
    bbox[:, 2:4] += shift

    return canvas, bbox
700 | + "\n", | ||
def process_box(boxes, labels, img_size, class_num, anchors):
    """Encode ground-truth boxes into the three per-scale target tensors.

    Each box is assigned to the anchor with the highest width/height IoU and
    written into the grid cell (at that anchor's scale) containing its center.

    Args:
        boxes: array [N, >=4]; columns 0-3 are (x_min, y_min, x_max, y_max)
            and the last column is copied into the last target channel.
        labels: sequence of N class indices.
        img_size: ``[width, height]`` of the network input.
        class_num: number of classes.
        anchors: array [9, 2] of anchor (width, height).

    Returns:
        ``(y_true_13, y_true_26, y_true_52)``: float32 arrays with grid
        strides 32, 16 and 8, each of shape [grid_h, grid_w, 3, 6 + class_num]
        laid out as (x_center, y_center, w, h, objectness, classes..., weight).
    """
    anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    # Grid stride of each feature map group, in the same order as anchors_mask.
    strides = (32., 16., 8.)

    # (x_center, y_center) and (width, height), both [N, 2]
    box_centers = (boxes[:, 0:2] + boxes[:, 2:4]) / 2
    box_sizes = boxes[:, 2:4] - boxes[:, 0:2]

    # One target tensor per stride; the trailing weight channel defaults to 1.
    y_true = []
    for stride in (32, 16, 8):
        target = np.zeros((img_size[1] // stride, img_size[0] // stride, 3, 6 + class_num), np.float32)
        target[..., -1] = 1.
        y_true.append(target)

    # IoU of boxes and anchors as if both were centered on the origin:
    # [N, 1, 2] against [9, 2] broadcasts to [N, 9, 2].
    half_sizes = np.expand_dims(box_sizes, 1) / 2
    half_anchors = anchors / 2
    mins = np.maximum(-half_sizes, -half_anchors)
    maxs = np.minimum(half_sizes, half_anchors)
    whs = maxs - mins

    inter_area = whs[:, :, 0] * whs[:, :, 1]
    box_area = np.expand_dims(box_sizes[:, 0] * box_sizes[:, 1], 1)
    anchor_area = anchors[:, 0] * anchors[:, 1]
    # [N, 9]
    iou = inter_area / (box_area + anchor_area - inter_area + 1e-10)
    # [N]
    best_match_idx = np.argmax(iou, axis=1)

    for i, idx in enumerate(best_match_idx):
        # idx: 0,1,2 ==> group 2 (stride 8); 3,4,5 ==> 1 (16); 6,7,8 ==> 0 (32)
        feature_map_group = 2 - idx // 3
        ratio = strides[feature_map_group]
        x = int(np.floor(box_centers[i, 0] / ratio))
        y = int(np.floor(box_centers[i, 1] / ratio))
        k = anchors_mask[feature_map_group].index(idx)
        c = labels[i]

        # View into the selected cell/anchor slot; writes go to the target tensor.
        slot = y_true[feature_map_group][y, x, k]
        slot[:2] = box_centers[i]
        slot[2:4] = box_sizes[i]
        slot[4] = 1.
        slot[5 + c] = 1.
        slot[-1] = boxes[i, -1]

    return y_true[0], y_true[1], y_true[2]
758 | + "\n", | ||
759 | + "\n", | ||
def parse_data(data, class_num, img_size, anchors, is_training, letterbox_resize):
    """Turn one serialized record into a network input and its targets.

    Args:
        data: serialized record accepted by ``parse_tfrecord``.
        class_num: number of classes.
        img_size: ``[width, height]`` the image is resized to.
        anchors: anchor sizes forwarded to ``process_box``.
        is_training: when True, random color distortion, expansion (with
            probability 0.5), constrained cropping, random-interpolation
            resizing and horizontal flipping (probability 0.5) are applied.
        letterbox_resize: whether resizing keeps the aspect ratio.

    Returns:
        ``(img_idx, img, y_true_13, y_true_26, y_true_52)`` where ``img`` is a
        float32 RGB image scaled to the 0~1 range.
    """
    img_idx, encoded_img, boxes, labels, _, _ = parse_tfrecord(data)
    img = cv2.imdecode(encoded_img, cv2.IMREAD_COLOR)

    # Extra trailing column of ones; process_box copies it into the last
    # target channel (the mix-up weight slot, mix-up itself is not used here).
    weight_column = np.full(shape=(boxes.shape[0], 1), fill_value=1., dtype=np.float32)
    boxes = np.concatenate((boxes, weight_column), axis=-1)

    target_w, target_h = img_size[0], img_size[1]

    if not is_training:
        img, boxes = resize_with_bbox(img, boxes, target_w, target_h, interp=1, letterbox=letterbox_resize)
    else:
        # random color distortion
        img = random_color_distort(img)

        # random expansion with prob 0.5
        if np.random.uniform(0, 1) > 0.5:
            img, boxes = random_expand(img, boxes, 4)

        # random cropping
        img_h, img_w, _ = img.shape
        boxes, (x0, y0, crop_w, crop_h) = random_crop_with_constraints(boxes, (img_w, img_h))
        img = img[y0: y0 + crop_h, x0: x0 + crop_w]

        # resize with random interpolation
        interp = np.random.randint(0, 5)
        img, boxes = resize_with_bbox(img, boxes, target_w, target_h, interp=interp, letterbox=letterbox_resize)

        # random horizontal flip
        img, boxes = random_flip(img, boxes, px=0.5)

    # the input of yolo_v3 should be RGB in range 0~1
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
    img = img / 255.

    y_true_13, y_true_26, y_true_52 = process_box(boxes, labels, img_size, class_num, anchors)

    return img_idx, img, y_true_13, y_true_26, y_true_52
801 | + "\n", | ||
802 | + "\n", | ||
def get_batch_data(records, class_num, img_size, anchors, is_training, multi_scale=False, mix_up=False, letterbox_resize=True, interval=10):
    """Parse a list of serialized records into batched numpy arrays.

    Args:
        records: iterable of serialized records, each passed to ``parse_data``.
        class_num: number of classes.
        img_size: ``[width, height]`` of the network input; replaced by a
            random multiple of 32 (320..608) when multi-scale training is on.
        anchors: anchor sizes forwarded to ``parse_data``.
        is_training: whether training-time augmentation is applied.
        multi_scale: enable the random input size (training only).
        mix_up: accepted for interface compatibility; not used here.
        letterbox_resize: whether resizing keeps the aspect ratio.
        interval: number of calls that share the same random input size.

    Returns:
        ``(img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch,
        y_true_52_batch)`` as numpy arrays; ``img_idx_batch`` is int64.

    Note:
        Multi-scale mode reseeds Python's global ``random`` module with
        ``iter_cnt // interval``.
    """
    global iter_cnt

    # multi_scale training
    if multi_scale and is_training:
        random.seed(iter_cnt // interval)
        size_choices = [[x * 32, x * 32] for x in range(10, 20)]
        img_size = random.sample(size_choices, 1)[0]
    iter_cnt += 1

    samples = [
        parse_data(data, class_num, img_size, anchors, is_training, letterbox_resize)
        for data in records
    ]

    # Transpose the list of 5-tuples into five per-field lists.
    img_idx_list, img_list, y13_list, y26_list, y52_list = (
        [sample[field] for sample in samples] for field in range(5)
    )

    return (np.asarray(img_idx_list, np.int64), np.asarray(img_list),
            np.asarray(y13_list), np.asarray(y26_list), np.asarray(y52_list))
829 | + ], | ||
830 | + "execution_count": 0, | ||
831 | + "outputs": [] | ||
832 | + }, | ||
833 | + { | ||
834 | + "cell_type": "code", | ||
835 | + "metadata": { | ||
836 | + "id": "sd9Pk3XgDqxt", | ||
837 | + "colab_type": "code", | ||
838 | + "colab": {} | ||
839 | + }, | ||
840 | + "source": [ | ||
841 | + "## evaluation utils\n", | ||
842 | + "\n", | ||
843 | + "from collections import Counter\n", | ||
844 | + "\n", | ||
def calc_iou(pred_boxes, true_boxes):
    """Pairwise IoU between predicted and ground-truth corner boxes.

    Args:
        pred_boxes: array [N, 4] of (x_min, y_min, x_max, y_max).
        true_boxes: array [V, 4] in the same layout.

    Returns:
        Array [N, V] of IoU values (a 1e-10 term guards the division).
    """
    # [N, 1, 4] against [1, V, 4] broadcasts to every (pred, true) pair.
    pred_boxes = np.expand_dims(pred_boxes, -2)
    true_boxes = np.expand_dims(true_boxes, 0)

    intersect_mins = np.maximum(pred_boxes[..., :2], true_boxes[..., :2])
    intersect_maxs = np.minimum(pred_boxes[..., 2:], true_boxes[..., 2:])
    intersect_wh = np.maximum(intersect_maxs - intersect_mins, 0.)

    intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
    pred_box_wh = pred_boxes[..., 2:] - pred_boxes[..., :2]
    pred_box_area = pred_box_wh[..., 0] * pred_box_wh[..., 1]
    true_boxes_wh = true_boxes[..., 2:] - true_boxes[..., :2]
    true_boxes_area = true_boxes_wh[..., 0] * true_boxes_wh[..., 1]

    return intersect_area / (pred_box_area + true_boxes_area - intersect_area + 1e-10)


def _ground_truth_for_image(y_true, image_idx):
    """Collect the labels and corner boxes of one image from the three target tensors.

    Returns:
        ``(labels, boxes)``: list of V class indices and a [V, 4] array of
        (x_min, y_min, x_max, y_max); the array is [0, 4] when V == 0.
    """
    labels, centers_sizes = [], []
    for j in range(3):
        targets = y_true[j][image_idx]
        class_probs = targets[..., 5:-1]
        # A slot holds an object when any class channel is set.
        object_mask = class_probs.sum(axis=-1) > 0

        labels += np.argmax(class_probs[object_mask], axis=-1).tolist()
        centers_sizes += targets[..., 0:4][object_mask].tolist()

    # reshape keeps the second axis even when the image has no objects.
    centers_sizes = np.array(centers_sizes, dtype=np.float64).reshape(-1, 4)
    # (x_center, y_center, w, h) -> (x_min, y_min, x_max, y_max)
    box_sizes = centers_sizes[:, 2:4]
    box_mins = centers_sizes[:, 0:2] - box_sizes / 2.
    return labels, np.concatenate([box_mins, box_mins + box_sizes], axis=-1)


def _matched_truth_indices(pred_boxes, pred_labels_list, true_boxes, true_labels_list, iou_thresh):
    """Return the ground-truth indices hit by at least one correct prediction.

    A prediction is correct when its best-overlapping ground-truth box has
    IoU above ``iou_thresh`` and the same label. Each ground truth counts once.
    """
    iou_matrix = calc_iou(pred_boxes, true_boxes)
    best_truth = np.argmax(iou_matrix, axis=-1)

    matched = set()
    for k, match_idx in enumerate(best_truth):
        if iou_matrix[k, match_idx] > iou_thresh and true_labels_list[match_idx] == pred_labels_list[k]:
            matched.add(int(match_idx))
    return matched


def _evaluate(y_pred, y_true, num_classes, nms_fn, iou_thresh, calc_now):
    """Shared evaluation loop; ``nms_fn(boxes, scores)`` returns (boxes, confs, labels)."""
    num_images = y_true[0].shape[0]
    true_labels_dict = {i: 0 for i in range(num_classes)}
    pred_labels_dict = {i: 0 for i in range(num_classes)}
    true_positive_dict = {i: 0 for i in range(num_classes)}

    for i in range(num_images):
        true_labels_list, true_boxes = _ground_truth_for_image(y_true, i)
        for cls, count in Counter(true_labels_list).items():
            true_labels_dict[cls] += count

        pred_boxes = y_pred[0][i:i + 1]
        pred_confs = y_pred[1][i:i + 1]
        pred_probs = y_pred[2][i:i + 1]
        pred_boxes, pred_confs, pred_labels = nms_fn(pred_boxes, pred_confs * pred_probs)

        pred_labels_list = [] if pred_labels is None else pred_labels.tolist()
        for label in pred_labels_list:
            pred_labels_dict[label] += 1

        # No predictions: nothing to match. No ground truth: every prediction
        # is a false positive (already counted above) and there is no IoU
        # matrix to build, which previously crashed.
        if not pred_labels_list or not true_labels_list:
            continue

        for t in _matched_truth_indices(pred_boxes, pred_labels_list, true_boxes, true_labels_list, iou_thresh):
            true_positive_dict[true_labels_list[t]] += 1

    if not calc_now:
        return true_positive_dict, true_labels_dict, pred_labels_dict

    # avoid divided by 0
    true_positives = sum(true_positive_dict.values())
    recall = true_positives / (sum(true_labels_dict.values()) + 1e-6)
    precision = true_positives / (sum(pred_labels_dict.values()) + 1e-6)
    return recall, precision


def evaluate_on_cpu(y_pred, y_true, num_classes, calc_now=True, max_boxes=50, score_thresh=0.5, iou_thresh=0.5):
    """Evaluate a batch of predictions using ``cpu_nms`` for suppression.

    Args:
        y_pred: ``[boxes, confs, probs]`` indexed per image along axis 0.
        y_true: three target tensors as produced by ``process_box``, batched
            along axis 0.
        num_classes: number of classes.
        calc_now: return ``(recall, precision)`` when True, otherwise the raw
            ``(true_positive_dict, true_labels_dict, pred_labels_dict)`` counts.
        max_boxes, score_thresh, iou_thresh: forwarded to ``cpu_nms``;
            ``iou_thresh`` is also the matching threshold.

    Returns:
        See ``calc_now``. Images without ground truth contribute their
        predictions as false positives.
    """
    def nms_fn(boxes, scores):
        return cpu_nms(boxes, scores, num_classes, max_boxes=max_boxes,
                       score_thresh=score_thresh, iou_thresh=iou_thresh)

    return _evaluate(y_pred, y_true, num_classes, nms_fn, iou_thresh, calc_now)


def evaluate_on_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, y_pred, y_true, num_classes, iou_thresh=0.5, calc_now=True):
    """Evaluate a batch of predictions, running suppression through ``sess``.

    Args:
        sess: object whose ``run(gpu_nms_op, feed_dict=...)`` returns
            ``(boxes, confs, labels)`` for one image.
        gpu_nms_op: the op passed to ``sess.run``.
        pred_boxes_flag, pred_scores_flag: feed-dict keys for the boxes and
            for ``confs * probs``.
        y_pred, y_true, num_classes, calc_now: as in ``evaluate_on_cpu``.
        iou_thresh: IoU threshold for counting a prediction as correct.

    Returns:
        ``(recall, precision)`` or the three count dicts, see ``calc_now``.
    """
    def nms_fn(boxes, scores):
        return sess.run(gpu_nms_op, feed_dict={pred_boxes_flag: boxes, pred_scores_flag: scores})

    return _evaluate(y_pred, y_true, num_classes, nms_fn, iou_thresh, calc_now)
1009 | + "\n", | ||
1010 | + "\n", | ||
def get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, image_ids, y_pred):
    """Run suppression for the first image of a batch and list its detections.

    Args:
        sess: object whose ``run(gpu_nms_op, feed_dict=...)`` returns
            ``(boxes, scores, labels)``.
        gpu_nms_op: the op passed to ``sess.run``.
        pred_boxes_flag, pred_scores_flag: feed-dict keys for the boxes and
            for ``confs * probs``.
        image_ids: sequence of image ids; only the first one is used.
        y_pred: ``[boxes, confs, probs]``; only the first image is used.

    Returns:
        A list with one ``[image_id, x_min, y_min, x_max, y_max, score, label]``
        entry per detection.
    """
    image_id = image_ids[0]

    feed = {
        pred_boxes_flag: y_pred[0][0:1],
        pred_scores_flag: y_pred[1][0:1] * y_pred[2][0:1],
    }
    boxes, scores, labels = sess.run(gpu_nms_op, feed_dict=feed)

    pred_content = []
    for i, label in enumerate(labels):
        x_min, y_min, x_max, y_max = boxes[i]
        pred_content.append([image_id, x_min, y_min, x_max, y_max, scores[i], label])

    return pred_content
1028 | + "\n", | ||
gt_dict = {}  # key: img_id, value: gt object list


def parse_gt_rec(gt_filename, compression_type, target_img_size, letterbox_resize=True):
    """Load all ground-truth boxes of a record file, rescaled to the target size.

    The result is cached in the module-level ``gt_dict``: once it is
    non-empty, later calls return it as is and ignore their arguments.

    Args:
        gt_filename: record file opened with ``TFRecordIterator``.
        compression_type: forwarded to ``TFRecordIterator``.
        target_img_size: ``(width, height)`` the boxes are mapped to.
        letterbox_resize: when True boxes follow the aspect-preserving
            resize plus padding; otherwise each axis is stretched.

    Returns:
        ``gt_dict``: maps image id to a list of
        ``[x_min, y_min, x_max, y_max, label]`` entries.
    """
    global gt_dict

    if gt_dict:
        return gt_dict

    new_width, new_height = target_img_size

    with TFRecordIterator(gt_filename, compression_type) as reader:
        for data in reader:
            img_id, image, boxes, labels, ori_width, ori_height = parse_record(data)

            objects = []
            for (x_min, y_min, x_max, y_max), label in zip(boxes, labels):
                if not letterbox_resize:
                    objects.append([x_min * new_width / ori_width,
                                    y_min * new_height / ori_height,
                                    x_max * new_width / ori_width,
                                    y_max * new_height / ori_height,
                                    label])
                    continue

                # Same scale and padding arithmetic as the letterbox image resize.
                resize_ratio = min(new_width / ori_width, new_height / ori_height)
                resize_w = int(resize_ratio * ori_width)
                resize_h = int(resize_ratio * ori_height)
                dw = int((new_width - resize_w) / 2)
                dh = int((new_height - resize_h) / 2)

                objects.append([x_min * resize_ratio + dw,
                                y_min * resize_ratio + dh,
                                x_max * resize_ratio + dw,
                                y_max * resize_ratio + dh,
                                label])
            gt_dict[img_id] = objects
    return gt_dict
1067 | + "\n", | ||
1068 | + "\n", | ||
1069 | + "# The following two functions are modified from FAIR's Detectron repo to calculate mAP:\n", | ||
1070 | + "# https://github.com/facebookresearch/Detectron/blob/master/detectron/datasets/voc_eval.py\n", | ||
1071 | + "def voc_ap(rec, prec, use_07_metric=False):\n", | ||
1072 | + " if use_07_metric:\n", | ||
1073 | + " ap = 0.\n", | ||
1074 | + " for t in np.arange(0., 1.1, 0.1):\n", | ||
1075 | + " if np.sum(rec >= t) == 0:\n", | ||
1076 | + " p = 0\n", | ||
1077 | + " else:\n", | ||
1078 | + " p = np.max(prec[rec >= t])\n", | ||
1079 | + " ap = ap + p / 11.\n", | ||
1080 | + " else:\n", | ||
1081 | + " mrec = np.concatenate(([0.], rec, [1.]))\n", | ||
1082 | + " mpre = np.concatenate(([0.], prec, [0.]))\n", | ||
1083 | + "\n", | ||
1084 | + " for i in range(mpre.size - 1, 0, -1):\n", | ||
1085 | + " mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])\n", | ||
1086 | + "\n", | ||
1087 | + " i = np.where(mrec[1:] != mrec[:-1])[0]\n", | ||
1088 | + "\n", | ||
1089 | + " ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])\n", | ||
1090 | + " return ap\n", | ||
1091 | + "\n", | ||
1092 | + "\n", | ||
1093 | + "def voc_eval(gt_dict, val_preds, classidx, iou_thres=0.5, use_07_metric=False):\n", | ||
1094 | + " # 1.obtain gt: extract all gt objects for this class\n", | ||
1095 | + " class_recs = {}\n", | ||
1096 | + " npos = 0\n", | ||
1097 | + " for img_id in gt_dict:\n", | ||
1098 | + " R = [obj for obj in gt_dict[img_id] if obj[-1] == classidx]\n", | ||
1099 | + " bbox = np.array([x[:4] for x in R])\n", | ||
1100 | + " det = [False] * len(R)\n", | ||
1101 | + " npos += len(R)\n", | ||
1102 | + " class_recs[img_id] = {'bbox': bbox, 'det': det}\n", | ||
1103 | + "\n", | ||
1104 | + " # 2. obtain pred results\n", | ||
1105 | + " pred = [x for x in val_preds if x[-1] == classidx]\n", | ||
1106 | + " img_ids = [x[0] for x in pred]\n", | ||
1107 | + " confidence = np.array([x[-2] for x in pred])\n", | ||
1108 | + " BB = np.array([[x[1], x[2], x[3], x[4]] for x in pred])\n", | ||
1109 | + "\n", | ||
1110 | + " # 3. sort by confidence\n", | ||
1111 | + " sorted_ind = np.argsort(-confidence)\n", | ||
1112 | + " try:\n", | ||
1113 | + " BB = BB[sorted_ind, :]\n", | ||
1114 | + " except:\n", | ||
1115 | + " print('no box, ignore')\n", | ||
1116 | + " return 1e-6, 1e-6, 0, 0, 0\n", | ||
1117 | + " img_ids = [img_ids[x] for x in sorted_ind]\n", | ||
1118 | + "\n", | ||
1119 | + " # 4. mark TPs and FPs\n", | ||
1120 | + " nd = len(img_ids)\n", | ||
1121 | + " tp = np.zeros(nd)\n", | ||
1122 | + " fp = np.zeros(nd)\n", | ||
1123 | + "\n", | ||
1124 | + " for d in range(nd):\n", | ||
1125 | + " R = class_recs[img_ids[d]]\n", | ||
1126 | + " bb = BB[d, :]\n", | ||
1127 | + " ovmax = -np.Inf\n", | ||
1128 | + " BBGT = R['bbox']\n", | ||
1129 | + "\n", | ||
1130 | + " if BBGT.size > 0:\n", | ||
1131 | + " ixmin = np.maximum(BBGT[:, 0], bb[0])\n", | ||
1132 | + " iymin = np.maximum(BBGT[:, 1], bb[1])\n", | ||
1133 | + " ixmax = np.minimum(BBGT[:, 2], bb[2])\n", | ||
1134 | + " iymax = np.minimum(BBGT[:, 3], bb[3])\n", | ||
1135 | + " iw = np.maximum(ixmax - ixmin + 1., 0.)\n", | ||
1136 | + " ih = np.maximum(iymax - iymin + 1., 0.)\n", | ||
1137 | + " inters = iw * ih\n", | ||
1138 | + "\n", | ||
1139 | + " uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + (BBGT[:, 2] - BBGT[:, 0] + 1.) * (\n", | ||
1140 | + " BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)\n", | ||
1141 | + "\n", | ||
1142 | + " overlaps = inters / uni\n", | ||
1143 | + " ovmax = np.max(overlaps)\n", | ||
1144 | + " jmax = np.argmax(overlaps)\n", | ||
1145 | + "\n", | ||
1146 | + " if ovmax > iou_thres:\n", | ||
1147 | + " # gt not matched yet\n", | ||
1148 | + " if not R['det'][jmax]:\n", | ||
1149 | + " tp[d] = 1.\n", | ||
1150 | + " R['det'][jmax] = 1\n", | ||
1151 | + " else:\n", | ||
1152 | + " fp[d] = 1.\n", | ||
1153 | + " else:\n", | ||
1154 | + " fp[d] = 1.\n", | ||
1155 | + "\n", | ||
1156 | + " fp = np.cumsum(fp)\n", | ||
1157 | + " tp = np.cumsum(tp)\n", | ||
1158 | + " rec = tp / float(npos)\n", | ||
1159 | + "    # avoid divide by zero in case the first detection matches a difficult ground truth\n", | ||
1160 | + " prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)\n", | ||
1161 | + " ap = voc_ap(rec, prec, use_07_metric)\n", | ||
1162 | + "\n", | ||
1163 | + " # return rec, prec, ap\n", | ||
1164 | + " return npos, nd, tp[-1] / float(npos), tp[-1] / float(nd), ap" | ||
1165 | + ], | ||
1166 | + "execution_count": 0, | ||
1167 | + "outputs": [] | ||
1168 | + }, | ||
1169 | + { | ||
1170 | + "cell_type": "code", | ||
1171 | + "metadata": { | ||
1172 | + "id": "X4uQxNl0FRli", | ||
1173 | + "colab_type": "code", | ||
1174 | + "outputId": "c2b22c73-6195-4b80-d1b4-5ada76ef3da8", | ||
1175 | + "colab": { | ||
1176 | + "base_uri": "https://localhost:8080/", | ||
1177 | + "height": 161 | ||
1178 | + } | ||
1179 | + }, | ||
1180 | + "source": [ | ||
1181 | + "## model\n", | ||
1182 | + "\n", | ||
1183 | + "slim = tf.contrib.slim\n", | ||
1184 | + "\n", | ||
1185 | + "def conv2d(inputs, filters, kernel_size, strides=1):\n", | ||
1186 | + " def _fixed_padding(inputs, kernel_size):\n", | ||
1187 | + " pad_total = kernel_size - 1\n", | ||
1188 | + " pad_beg = pad_total // 2\n", | ||
1189 | + " pad_end = pad_total - pad_beg\n", | ||
1190 | + "\n", | ||
1191 | + " padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],\n", | ||
1192 | + " [pad_beg, pad_end], [0, 0]], mode='CONSTANT')\n", | ||
1193 | + " return padded_inputs\n", | ||
1194 | + " if strides > 1: \n", | ||
1195 | + " inputs = _fixed_padding(inputs, kernel_size)\n", | ||
1196 | + " inputs = slim.conv2d(inputs, filters, kernel_size, stride=strides,\n", | ||
1197 | + " padding=('SAME' if strides == 1 else 'VALID'))\n", | ||
1198 | + " return inputs\n", | ||
1199 | + "\n", | ||
1200 | + "def darknet53_body(inputs):\n", | ||
1201 | + " def res_block(inputs, filters):\n", | ||
1202 | + " shortcut = inputs\n", | ||
1203 | + " net = conv2d(inputs, filters * 1, 1)\n", | ||
1204 | + " net = conv2d(net, filters * 2, 3)\n", | ||
1205 | + "\n", | ||
1206 | + " net = net + shortcut\n", | ||
1207 | + "\n", | ||
1208 | + " return net\n", | ||
1209 | + " \n", | ||
1210 | + " # first two conv2d layers\n", | ||
1211 | + " net = conv2d(inputs, 32, 3, strides=1)\n", | ||
1212 | + " net = conv2d(net, 64, 3, strides=2)\n", | ||
1213 | + "\n", | ||
1214 | + " # res_block * 1\n", | ||
1215 | + " net = res_block(net, 32)\n", | ||
1216 | + "\n", | ||
1217 | + " net = conv2d(net, 128, 3, strides=2)\n", | ||
1218 | + "\n", | ||
1219 | + " # res_block * 2\n", | ||
1220 | + " for i in range(2):\n", | ||
1221 | + " net = res_block(net, 64)\n", | ||
1222 | + "\n", | ||
1223 | + " net = conv2d(net, 256, 3, strides=2)\n", | ||
1224 | + "\n", | ||
1225 | + " # res_block * 8\n", | ||
1226 | + " for i in range(8):\n", | ||
1227 | + " net = res_block(net, 128)\n", | ||
1228 | + "\n", | ||
1229 | + " route_1 = net\n", | ||
1230 | + " net = conv2d(net, 512, 3, strides=2)\n", | ||
1231 | + "\n", | ||
1232 | + " # res_block * 8\n", | ||
1233 | + " for i in range(8):\n", | ||
1234 | + " net = res_block(net, 256)\n", | ||
1235 | + "\n", | ||
1236 | + " route_2 = net\n", | ||
1237 | + " net = conv2d(net, 1024, 3, strides=2)\n", | ||
1238 | + "\n", | ||
1239 | + " # res_block * 4\n", | ||
1240 | + " for i in range(4):\n", | ||
1241 | + " net = res_block(net, 512)\n", | ||
1242 | + " route_3 = net\n", | ||
1243 | + "\n", | ||
1244 | + " return route_1, route_2, route_3\n", | ||
1245 | + "\n", | ||
1246 | + "\n", | ||
1247 | + "def yolo_block(inputs, filters):\n", | ||
1248 | + " net = conv2d(inputs, filters * 1, 1)\n", | ||
1249 | + " net = conv2d(net, filters * 2, 3)\n", | ||
1250 | + " net = conv2d(net, filters * 1, 1)\n", | ||
1251 | + " net = conv2d(net, filters * 2, 3)\n", | ||
1252 | + " net = conv2d(net, filters * 1, 1)\n", | ||
1253 | + " route = net\n", | ||
1254 | + " net = conv2d(net, filters * 2, 3)\n", | ||
1255 | + " return route, net\n", | ||
1256 | + "\n", | ||
1257 | + "\n", | ||
1258 | + "def upsample_layer(inputs, out_shape):\n", | ||
1259 | + " new_height, new_width = out_shape[1], out_shape[2]\n", | ||
1260 | + "    # NOTE: height comes first here\n", | ||
1261 | + " inputs = tf.image.resize_nearest_neighbor(inputs, (new_height, new_width), name='upsampled')\n", | ||
1262 | + " return inputs\n", | ||
1263 | + "\n", | ||
1264 | + "class yolov3(object):\n", | ||
1265 | + "\n", | ||
1266 | + " def __init__(self, class_num, anchors, use_label_smooth=False, use_focal_loss=False, batch_norm_decay=0.999, weight_decay=5e-4, use_static_shape=True):\n", | ||
1267 | + " self.class_num = class_num\n", | ||
1268 | + " self.anchors = anchors\n", | ||
1269 | + " self.batch_norm_decay = batch_norm_decay\n", | ||
1270 | + " self.use_label_smooth = use_label_smooth\n", | ||
1271 | + " self.use_focal_loss = use_focal_loss\n", | ||
1272 | + " self.weight_decay = weight_decay\n", | ||
1273 | + " self.use_static_shape = use_static_shape\n", | ||
1274 | + "\n", | ||
1275 | + " def forward(self, inputs, is_training=False, reuse=False):\n", | ||
1276 | + "        # the input size: [height, width] format\n", | ||
1277 | + " self.img_size = tf.shape(inputs)[1:3]\n", | ||
1278 | + " print(\"Img size:\", self.img_size)\n", | ||
1279 | + "\t\t\n", | ||
1280 | + " batch_norm_params = {\n", | ||
1281 | + " 'decay': self.batch_norm_decay,\n", | ||
1282 | + " 'epsilon': 1e-05,\n", | ||
1283 | + " 'scale': True,\n", | ||
1284 | + " 'is_training': is_training,\n", | ||
1285 | + " 'fused': None,\n", | ||
1286 | + " }\n", | ||
1287 | + "\n", | ||
1288 | + " with slim.arg_scope([slim.conv2d, slim.batch_norm], reuse=reuse):\n", | ||
1289 | + " with slim.arg_scope([slim.conv2d], \n", | ||
1290 | + " normalizer_fn=slim.batch_norm,\n", | ||
1291 | + " normalizer_params=batch_norm_params,\n", | ||
1292 | + " biases_initializer=None,\n", | ||
1293 | + " activation_fn=lambda x: tf.nn.leaky_relu(x, alpha=0.1),\n", | ||
1294 | + " weights_regularizer=slim.l2_regularizer(self.weight_decay)):\n", | ||
1295 | + "\n", | ||
1296 | + " with tf.variable_scope('darknet53_body'):\n", | ||
1297 | + " route_1, route_2, route_3 = darknet53_body(inputs)\n", | ||
1298 | + "\n", | ||
1299 | + " with tf.variable_scope('yolov3_head'):\n", | ||
1300 | + " inter1, net = yolo_block(route_3, 512)\n", | ||
1301 | + " feature_map_1 = slim.conv2d(net, 3 * (5 + self.class_num), 1,\n", | ||
1302 | + " stride=1, normalizer_fn=None,\n", | ||
1303 | + " activation_fn=None, biases_initializer=tf.zeros_initializer())\n", | ||
1304 | + " feature_map_1 = tf.identity(feature_map_1, name='feature_map_1')\n", | ||
1305 | + "\n", | ||
1306 | + " inter1 = conv2d(inter1, 256, 1)\n", | ||
1307 | + " inter1 = upsample_layer(inter1, route_2.get_shape().as_list() if self.use_static_shape else tf.shape(route_2))\n", | ||
1308 | + " concat1 = tf.concat([inter1, route_2], axis=3)\n", | ||
1309 | + "\n", | ||
1310 | + " inter2, net = yolo_block(concat1, 256)\n", | ||
1311 | + " feature_map_2 = slim.conv2d(net, 3 * (5 + self.class_num), 1,\n", | ||
1312 | + " stride=1, normalizer_fn=None,\n", | ||
1313 | + " activation_fn=None, biases_initializer=tf.zeros_initializer())\n", | ||
1314 | + " feature_map_2 = tf.identity(feature_map_2, name='feature_map_2')\n", | ||
1315 | + "\n", | ||
1316 | + " inter2 = conv2d(inter2, 128, 1)\n", | ||
1317 | + " inter2 = upsample_layer(inter2, route_1.get_shape().as_list() if self.use_static_shape else tf.shape(route_1))\n", | ||
1318 | + " concat2 = tf.concat([inter2, route_1], axis=3)\n", | ||
1319 | + "\n", | ||
1320 | + " _, feature_map_3 = yolo_block(concat2, 128)\n", | ||
1321 | + " feature_map_3 = slim.conv2d(feature_map_3, 3 * (5 + self.class_num), 1,\n", | ||
1322 | + " stride=1, normalizer_fn=None,\n", | ||
1323 | + " activation_fn=None, biases_initializer=tf.zeros_initializer())\n", | ||
1324 | + " feature_map_3 = tf.identity(feature_map_3, name='feature_map_3')\n", | ||
1325 | + "\n", | ||
1326 | + " return feature_map_1, feature_map_2, feature_map_3\n", | ||
1327 | + "\n", | ||
1328 | + " def reorg_layer(self, feature_map, anchors):\t\n", | ||
1329 | + " # size : [h, w] format\n", | ||
1330 | + " grid_size = feature_map.get_shape().as_list()[1:3] if self.use_static_shape else tf.shape(feature_map)[1:3] # [13, 13]\n", | ||
1331 | + " ratio = tf.cast(self.img_size / grid_size, tf.float32)\n", | ||
1332 | + "\t\t\n", | ||
1333 | + " # anchor : [w, h] format\n", | ||
1334 | + " rescaled_anchors = [(anchor[0] / ratio[1], anchor[1] / ratio[0]) for anchor in anchors]\n", | ||
1335 | + "\n", | ||
1336 | + " feature_map = tf.reshape(feature_map, [-1, grid_size[0], grid_size[1], 3, 5 + self.class_num])\n", | ||
1337 | + "\t\t\n", | ||
1338 | + " box_centers, box_sizes, conf_logits, prob_logits = tf.split(feature_map, [2, 2, 1, self.class_num], axis=-1)\n", | ||
1339 | + " box_centers = tf.nn.sigmoid(box_centers)\n", | ||
1340 | + "\n", | ||
1341 | + " grid_x = tf.range(grid_size[1], dtype=tf.int32)\n", | ||
1342 | + " grid_y = tf.range(grid_size[0], dtype=tf.int32)\n", | ||
1343 | + " grid_x, grid_y = tf.meshgrid(grid_x, grid_y)\n", | ||
1344 | + " x_offset = tf.reshape(grid_x, (-1, 1))\n", | ||
1345 | + " y_offset = tf.reshape(grid_y, (-1, 1))\n", | ||
1346 | + " x_y_offset = tf.concat([x_offset, y_offset], axis=-1)\n", | ||
1347 | + "\t\t\n", | ||
1348 | + " x_y_offset = tf.cast(tf.reshape(x_y_offset, [grid_size[0], grid_size[1], 1, 2]), tf.float32)\n", | ||
1349 | + "\n", | ||
1350 | + " box_centers = box_centers + x_y_offset\n", | ||
1351 | + " box_centers = box_centers * ratio[::-1]\n", | ||
1352 | + "\n", | ||
1353 | + " box_sizes = tf.exp(box_sizes) * rescaled_anchors\n", | ||
1354 | + " box_sizes = box_sizes * ratio[::-1]\n", | ||
1355 | + "\n", | ||
1356 | + " boxes = tf.concat([box_centers, box_sizes], axis=-1)\n", | ||
1357 | + "\n", | ||
1358 | + " return x_y_offset, boxes, conf_logits, prob_logits\n", | ||
1359 | + " \n", | ||
1360 | + " def predict(self, feature_maps):\n", | ||
1361 | + " feature_map_1, feature_map_2, feature_map_3 = feature_maps\n", | ||
1362 | + "\n", | ||
1363 | + " feature_map_anchors = [(feature_map_1, self.anchors[6:9]),\n", | ||
1364 | + " (feature_map_2, self.anchors[3:6]),\n", | ||
1365 | + " (feature_map_3, self.anchors[0:3])]\n", | ||
1366 | + " reorg_results = [self.reorg_layer(feature_map, anchors) for (feature_map, anchors) in feature_map_anchors]\n", | ||
1367 | + "\n", | ||
1368 | + " def _reshape_logit(result):\n", | ||
1369 | + " x_y_offset, boxes, conf_logits, prob_logits = result\n", | ||
1370 | + " grid_size = x_y_offset.get_shape().as_list()[:2] if self.use_static_shape else tf.shape(x_y_offset)[:2]\n", | ||
1371 | + " boxes = tf.reshape(boxes, [-1, grid_size[0] * grid_size[1] * 3, 4])\n", | ||
1372 | + " conf_logits = tf.reshape(conf_logits, [-1, grid_size[0] * grid_size[1] * 3, 1])\n", | ||
1373 | + " prob_logits = tf.reshape(prob_logits, [-1, grid_size[0] * grid_size[1] * 3, self.class_num])\n", | ||
1374 | + " return boxes, conf_logits, prob_logits\n", | ||
1375 | + "\n", | ||
1376 | + " boxes_list, confs_list, probs_list = [], [], []\n", | ||
1377 | + "\t\t\n", | ||
1378 | + " for result in reorg_results:\n", | ||
1379 | + " boxes, conf_logits, prob_logits = _reshape_logit(result)\n", | ||
1380 | + " confs = tf.sigmoid(conf_logits)\n", | ||
1381 | + " probs = tf.sigmoid(prob_logits)\n", | ||
1382 | + " boxes_list.append(boxes)\n", | ||
1383 | + " confs_list.append(confs)\n", | ||
1384 | + " probs_list.append(probs)\n", | ||
1385 | + " \n", | ||
1386 | + " boxes = tf.concat(boxes_list, axis=1)\n", | ||
1387 | + " confs = tf.concat(confs_list, axis=1)\n", | ||
1388 | + " probs = tf.concat(probs_list, axis=1)\n", | ||
1389 | + "\n", | ||
1390 | + " center_x, center_y, width, height = tf.split(boxes, [1, 1, 1, 1], axis=-1)\n", | ||
1391 | + " x_min = center_x - width / 2\n", | ||
1392 | + " y_min = center_y - height / 2\n", | ||
1393 | + " x_max = center_x + width / 2\n", | ||
1394 | + " y_max = center_y + height / 2\n", | ||
1395 | + "\n", | ||
1396 | + " boxes = tf.concat([x_min, y_min, x_max, y_max], axis=-1)\n", | ||
1397 | + "\n", | ||
1398 | + " return boxes, confs, probs\n", | ||
1399 | + " \n", | ||
1400 | + " def loss_layer(self, feature_map_i, y_true, anchors):\n", | ||
1401 | + " grid_size = tf.shape(feature_map_i)[1:3]\n", | ||
1402 | + " ratio = tf.cast(self.img_size / grid_size, tf.float32)\n", | ||
1403 | + " # N: batch_size\n", | ||
1404 | + " N = tf.cast(tf.shape(feature_map_i)[0], tf.float32)\n", | ||
1405 | + "\n", | ||
1406 | + " x_y_offset, pred_boxes, pred_conf_logits, pred_prob_logits = self.reorg_layer(feature_map_i, anchors)\n", | ||
1407 | + "\n", | ||
1408 | + "\t\t### mask\n", | ||
1409 | + " object_mask = y_true[..., 4:5]\n", | ||
1410 | + " ignore_mask = tf.TensorArray(tf.float32, size=0, dynamic_size=True)\n", | ||
1411 | + "\t\t\n", | ||
1412 | + " def loop_cond(idx, ignore_mask):\n", | ||
1413 | + " return tf.less(idx, tf.cast(N, tf.int32))\n", | ||
1414 | + "\t\t\t\n", | ||
1415 | + " def loop_body(idx, ignore_mask):\n", | ||
1416 | + " valid_true_boxes = tf.boolean_mask(y_true[idx, ..., 0:4], tf.cast(object_mask[idx, ..., 0], 'bool'))\n", | ||
1417 | + "\t\t\t\n", | ||
1418 | + " iou = self.box_iou(pred_boxes[idx], valid_true_boxes)\t\t\t\n", | ||
1419 | + " best_iou = tf.reduce_max(iou, axis=-1)\n", | ||
1420 | + "\t\t\t\n", | ||
1421 | + " ignore_mask_tmp = tf.cast(best_iou < 0.5, tf.float32)\n", | ||
1422 | + "\t\t\t\n", | ||
1423 | + " ignore_mask = ignore_mask.write(idx, ignore_mask_tmp)\n", | ||
1424 | + " return idx + 1, ignore_mask\n", | ||
1425 | + "\t\t\t\n", | ||
1426 | + " _, ignore_mask = tf.while_loop(cond=loop_cond, body=loop_body, loop_vars=[0, ignore_mask])\n", | ||
1427 | + " ignore_mask = ignore_mask.stack()\n", | ||
1428 | + " ignore_mask = tf.expand_dims(ignore_mask, -1)\n", | ||
1429 | + "\n", | ||
1430 | + " pred_box_xy = pred_boxes[..., 0:2]\n", | ||
1431 | + " pred_box_wh = pred_boxes[..., 2:4]\n", | ||
1432 | + "\n", | ||
1433 | + " true_xy = y_true[..., 0:2] / ratio[::-1] - x_y_offset\n", | ||
1434 | + " pred_xy = pred_box_xy / ratio[::-1] - x_y_offset\n", | ||
1435 | + "\n", | ||
1436 | + " true_tw_th = y_true[..., 2:4] / anchors\n", | ||
1437 | + " pred_tw_th = pred_box_wh / anchors\n", | ||
1438 | + "\t\t\n", | ||
1439 | + " true_tw_th = tf.where(condition=tf.equal(true_tw_th, 0),\n", | ||
1440 | + " x=tf.ones_like(true_tw_th), y=true_tw_th)\n", | ||
1441 | + " pred_tw_th = tf.where(condition=tf.equal(pred_tw_th, 0),\n", | ||
1442 | + " x=tf.ones_like(pred_tw_th), y=pred_tw_th)\n", | ||
1443 | + " true_tw_th = tf.log(tf.clip_by_value(true_tw_th, 1e-9, 1e9))\n", | ||
1444 | + " pred_tw_th = tf.log(tf.clip_by_value(pred_tw_th, 1e-9, 1e9))\n", | ||
1445 | + "\n", | ||
1446 | + " box_loss_scale = 2. - (y_true[..., 2:3] / tf.cast(self.img_size[1], tf.float32)) * (y_true[..., 3:4] / tf.cast(self.img_size[0], tf.float32))\n", | ||
1447 | + "\n", | ||
1448 | + " ### loss\n", | ||
1449 | + "\t\t\n", | ||
1450 | + " mix_w = y_true[..., -1:]\n", | ||
1451 | + "\t\t\n", | ||
1452 | + " xy_loss = tf.reduce_sum(tf.square(true_xy - pred_xy) * object_mask * box_loss_scale * mix_w) / N\n", | ||
1453 | + " wh_loss = tf.reduce_sum(tf.square(true_tw_th - pred_tw_th) * object_mask * box_loss_scale * mix_w) / N\n", | ||
1454 | + "\n", | ||
1455 | + " conf_pos_mask = object_mask\n", | ||
1456 | + " conf_neg_mask = (1 - object_mask) * ignore_mask\n", | ||
1457 | + " conf_loss_pos = conf_pos_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask, logits=pred_conf_logits)\n", | ||
1458 | + " conf_loss_neg = conf_neg_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask, logits=pred_conf_logits)\n", | ||
1459 | + "\t\t\n", | ||
1460 | + " conf_loss = conf_loss_pos + conf_loss_neg\n", | ||
1461 | + "\n", | ||
1462 | + " if self.use_focal_loss:\n", | ||
1463 | + " alpha = 1.0\n", | ||
1464 | + " gamma = 2.0\n", | ||
1465 | + " focal_mask = alpha * tf.pow(tf.abs(object_mask - tf.sigmoid(pred_conf_logits)), gamma)\n", | ||
1466 | + " conf_loss *= focal_mask\n", | ||
1467 | + " conf_loss = tf.reduce_sum(conf_loss * mix_w) / N\n", | ||
1468 | + "\n", | ||
1469 | + " if self.use_label_smooth:\n", | ||
1470 | + " delta = 0.01\n", | ||
1471 | + " label_target = (1 - delta) * y_true[..., 5:-1] + delta * 1. / self.class_num\n", | ||
1472 | + " else:\n", | ||
1473 | + " label_target = y_true[..., 5:-1]\n", | ||
1474 | + "\t\t\t\n", | ||
1475 | + " class_loss = object_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_target, logits=pred_prob_logits) * mix_w\n", | ||
1476 | + " class_loss = tf.reduce_sum(class_loss) / N\n", | ||
1477 | + "\n", | ||
1478 | + " return xy_loss, wh_loss, conf_loss, class_loss\n", | ||
1479 | + " \n", | ||
1480 | + "\n", | ||
1481 | + " def box_iou(self, pred_boxes, valid_true_boxes):\n", | ||
1482 | + " pred_box_xy = pred_boxes[..., 0:2]\n", | ||
1483 | + " pred_box_wh = pred_boxes[..., 2:4]\n", | ||
1484 | + "\n", | ||
1485 | + " pred_box_xy = tf.expand_dims(pred_box_xy, -2)\n", | ||
1486 | + " pred_box_wh = tf.expand_dims(pred_box_wh, -2)\n", | ||
1487 | + "\n", | ||
1488 | + " true_box_xy = valid_true_boxes[:, 0:2]\n", | ||
1489 | + " true_box_wh = valid_true_boxes[:, 2:4]\n", | ||
1490 | + "\n", | ||
1491 | + " intersect_mins = tf.maximum(pred_box_xy - pred_box_wh / 2.,\n", | ||
1492 | + " true_box_xy - true_box_wh / 2.)\n", | ||
1493 | + " intersect_maxs = tf.minimum(pred_box_xy + pred_box_wh / 2.,\n", | ||
1494 | + " true_box_xy + true_box_wh / 2.)\n", | ||
1495 | + " intersect_wh = tf.maximum(intersect_maxs - intersect_mins, 0.)\n", | ||
1496 | + "\n", | ||
1497 | + " intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]\n", | ||
1498 | + " pred_box_area = pred_box_wh[..., 0] * pred_box_wh[..., 1]\n", | ||
1499 | + " true_box_area = true_box_wh[..., 0] * true_box_wh[..., 1]\n", | ||
1500 | + " true_box_area = tf.expand_dims(true_box_area, axis=0)\n", | ||
1501 | + "\n", | ||
1502 | + " iou = intersect_area / (pred_box_area + true_box_area - intersect_area + 1e-10)\n", | ||
1503 | + "\n", | ||
1504 | + " return iou\n", | ||
1505 | + "\n", | ||
1506 | + " \n", | ||
1507 | + " def compute_loss(self, y_pred, y_true):\n", | ||
1508 | + " loss_xy, loss_wh, loss_conf, loss_class = 0., 0., 0., 0.\n", | ||
1509 | + " anchor_group = [self.anchors[6:9], self.anchors[3:6], self.anchors[0:3]]\n", | ||
1510 | + "\n", | ||
1511 | + " for i in range(len(y_pred)):\n", | ||
1512 | + " result = self.loss_layer(y_pred[i], y_true[i], anchor_group[i])\n", | ||
1513 | + " loss_xy += result[0]\n", | ||
1514 | + " loss_wh += result[1]\n", | ||
1515 | + " loss_conf += result[2]\n", | ||
1516 | + " loss_class += result[3]\n", | ||
1517 | + " total_loss = loss_xy + loss_wh + loss_conf + loss_class\n", | ||
1518 | + " return [total_loss, loss_xy, loss_wh, loss_conf, loss_class]" | ||
1519 | + ], | ||
1520 | + "execution_count": 8, | ||
1521 | + "outputs": [ | ||
1522 | + { | ||
1523 | + "output_type": "stream", | ||
1524 | + "text": [ | ||
1525 | + "WARNING:tensorflow:\n", | ||
1526 | + "The TensorFlow contrib module will not be included in TensorFlow 2.0.\n", | ||
1527 | + "For more information, please see:\n", | ||
1528 | + " * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n", | ||
1529 | + " * https://github.com/tensorflow/addons\n", | ||
1530 | + " * https://github.com/tensorflow/io (for I/O related ops)\n", | ||
1531 | + "If you depend on functionality not listed there, please file an issue.\n", | ||
1532 | + "\n" | ||
1533 | + ], | ||
1534 | + "name": "stdout" | ||
1535 | + } | ||
1536 | + ] | ||
1537 | + }, | ||
1538 | + { | ||
1539 | + "cell_type": "code", | ||
1540 | + "metadata": { | ||
1541 | + "id": "Nlddq-K7AJin", | ||
1542 | + "colab_type": "code", | ||
1543 | + "outputId": "c5baed55-0d4e-4c65-fa7d-340b27baf8f9", | ||
1544 | + "colab": { | ||
1545 | + "base_uri": "https://localhost:8080/", | ||
1546 | + "height": 89 | ||
1547 | + } | ||
1548 | + }, | ||
1549 | + "source": [ | ||
1550 | + "## arguments\n", | ||
1551 | + "\n", | ||
1552 | + "import math\n", | ||
1553 | + "\n", | ||
1554 | + "\n", | ||
1555 | + "### Some paths\n", | ||
1556 | + "\n", | ||
1557 | + "data_path = '/content/gdrive/My Drive/yolo/data/'\n", | ||
1558 | + "train_file = data_path + 'train.tfrecord' # The path of the training TFRecord file.\n", | ||
1559 | + "val_file = data_path + 'val.tfrecord' # The path of the validation TFRecord file.\n", | ||
1560 | + "restore_path = data_path + 'darknet_weights/yolov3.ckpt' # The path of the weights to restore.\n", | ||
1561 | + "save_dir = '/content/gdrive/My Drive/yolo/checkpoint/' # The directory of the weights to save.\n", | ||
1562 | + "\n", | ||
1563 | + "### we are not using tensorboard logs in this code\n", | ||
1564 | + "\n", | ||
1565 | + "log_dir = data_path + 'logs/' # The directory to store the tensorboard log files.\n", | ||
1566 | + "progress_log_path = data_path + 'progress.log' # The path to record the training progress.\n", | ||
1567 | + "\n", | ||
1568 | + "anchor_path = data_path + 'yolo_anchors.txt' # The path of the anchor txt file.\n", | ||
1569 | + "class_name_path = data_path + 'classes.txt' # The path of the class names.\n", | ||
1570 | + "\n", | ||
1571 | + "### Training related numbers\n", | ||
1572 | + "batch_size = 4\n", | ||
1573 | + "img_size = [416, 416] # Images will be resized to `img_size` and fed to the network, size format: [width, height]\n", | ||
1574 | + "letterbox_resizing = True # Whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized image.\n", | ||
1575 | + "total_epoches = 10\n", | ||
1576 | + "train_evaluation_step = 10 # Evaluate on the training batch after some steps.\n", | ||
1577 | + "val_evaluation_epoch = 2 # Evaluate on the whole validation dataset after some epochs. Set to None to evaluate every epoch.\n", | ||
1578 | + "save_epoch = 5 # Save the model after some epochs.\n", | ||
1579 | + "batch_norm_decay = 0.99 # decay in bn ops\n", | ||
1580 | + "weight_decay = 5e-4 # l2 weight decay\n", | ||
1581 | + "current_global_step = 0 # used when resuming training\n", | ||
1582 | + "\n", | ||
1583 | + "### tf.data parameters\n", | ||
1584 | + "num_threads = 10 # Number of threads for image processing used in tf.data pipeline.\n", | ||
1585 | + "prefetech_buffer = 5 # Prefetch buffer size used in the tf.data pipeline.\n", | ||
1586 | + "\n", | ||
1587 | + "### Learning rate and optimizer\n", | ||
1588 | + "optimizer_name = 'momentum' # Chosen from [sgd, momentum, adam, rmsprop]\n", | ||
1589 | + "save_optimizer = True # Whether to save the optimizer parameters into the checkpoint file.\n", | ||
1590 | + "learning_rate_init = 1e-4\n", | ||
1591 | + "lr_type = 'piecewise' # Chosen from [fixed, exponential, cosine_decay, cosine_decay_restart, piecewise]\n", | ||
1592 | + "lr_decay_epoch = 5 # Epochs after which learning rate decays. Int or float. Used with the `exponential` and `cosine_decay_restart` lr_type settings.\n", | ||
1593 | + "lr_decay_factor = 0.96 # The learning rate decay factor. Used with the `exponential` lr_type setting.\n", | ||
1594 | + "lr_lower_bound = 1e-6 # The minimum learning rate.\n", | ||
1595 | + "# only used in piecewise lr type\n", | ||
1596 | + "pw_boundaries = [30, 50] # epoch based boundaries\n", | ||
1597 | + "pw_values = [learning_rate_init, 3e-5, 1e-5]\n", | ||
1598 | + "\n", | ||
1599 | + "### Load and finetune\n", | ||
1600 | + "# Choose the parts you want to restore the weights. List form.\n", | ||
1601 | + "# restore_include: None, restore_exclude: None => restore the whole model\n", | ||
1602 | + "# restore_include: None, restore_exclude: scope => restore the whole model except `scope`\n", | ||
1603 | + "# restore_include: scope1, restore_exclude: scope2 => if scope1 contains scope2, restore scope1 and not restore scope2 (scope1 - scope2)\n", | ||
1604 | + "# choice 1: only restore the darknet body\n", | ||
1605 | + "# restore_include = ['yolov3/darknet53_body']\n", | ||
1606 | + "# restore_exclude = None\n", | ||
1607 | + "# choice 2: restore all layers except the final conv2d layer at each of the 3 scales\n", | ||
1608 | + "restore_include = None\n", | ||
1609 | + "restore_exclude = ['yolov3/yolov3_head/Conv_14', 'yolov3/yolov3_head/Conv_6', 'yolov3/yolov3_head/Conv_22']\n", | ||
1610 | + "# Choose the parts you want to finetune. List form.\n", | ||
1611 | + "# Set to None to train the whole model.\n", | ||
1612 | + "\n", | ||
1613 | + "update_part = ['yolov3/yolov3_head']\n", | ||
1614 | + "\n", | ||
1615 | + "### other training strategies\n", | ||
1616 | + "multi_scale_train = True # Whether to apply multi-scale training strategy. Image size varies from [320, 320] to [640, 640] by default.\n", | ||
1617 | + "use_label_smooth = True # Whether to use class label smoothing strategy.\n", | ||
1618 | + "use_focal_loss = True # Whether to apply focal loss on the conf loss.\n", | ||
1619 | + "use_mix_up = True # Whether to use mix up data augmentation strategy. \n", | ||
1620 | + "use_warm_up = True # Whether to use the warm-up strategy to prevent gradients from exploding.\n", | ||
1621 | + "warm_up_epoch = 2 # Number of warm-up training epochs. Set to a larger value if gradients explode.\n", | ||
1622 | + "\n", | ||
1623 | + "### some constants in validation\n", | ||
1624 | + "# nms\n", | ||
1625 | + "nms_threshold = 0.45 # iou threshold in nms operation\n", | ||
1626 | + "score_threshold = 0.01 # threshold of the probability of the classes in nms operation, i.e. score = pred_confs * pred_probs. set lower for higher recall.\n", | ||
1627 | + "nms_topk = 150 # keep at most nms_topk outputs after nms\n", | ||
1628 | + "# mAP eval\n", | ||
1629 | + "eval_threshold = 0.5 # the iou threshold applied in mAP evaluation\n", | ||
1630 | + "use_voc_07_metric = False # whether to use voc 2007 evaluation metric, i.e. the 11-point metric\n", | ||
1631 | + "\n", | ||
1632 | + "### parse some params\n", | ||
1633 | + "anchors = parse_anchors(anchor_path)\n", | ||
1634 | + "classes = read_class_names(class_name_path)\n", | ||
1635 | + "class_num = len(classes)\n", | ||
1636 | + "train_img_cnt = TFRecordIterator(train_file, 'GZIP').count()\n", | ||
1637 | + "val_img_cnt = TFRecordIterator(val_file, 'GZIP').count()\n", | ||
1638 | + "train_batch_num = int(math.ceil(float(train_img_cnt) / batch_size))\n", | ||
1639 | + "\n", | ||
1640 | + "lr_decay_freq = int(train_batch_num * lr_decay_epoch)\n", | ||
1641 | + "pw_boundaries = [float(i) * train_batch_num + current_global_step for i in pw_boundaries]\n" | ||
1642 | + ], | ||
1643 | + "execution_count": 9, | ||
1644 | + "outputs": [ | ||
1645 | + { | ||
1646 | + "output_type": "stream", | ||
1647 | + "text": [ | ||
1648 | + "WARNING:tensorflow:From <ipython-input-2-ea7f0591b13c>:7: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.\n", | ||
1649 | + "Instructions for updating:\n", | ||
1650 | + "Use eager execution and: \n", | ||
1651 | + "`tf.data.TFRecordDataset(path)`\n" | ||
1652 | + ], | ||
1653 | + "name": "stdout" | ||
1654 | + } | ||
1655 | + ] | ||
1656 | + }, | ||
1657 | + { | ||
1658 | + "cell_type": "code", | ||
1659 | + "metadata": { | ||
1660 | + "id": "NagT2oNZFf0q", | ||
1661 | + "colab_type": "code", | ||
1662 | + "colab": {} | ||
1663 | + }, | ||
1664 | + "source": [ | ||
1665 | + "## train\n", | ||
1666 | + "\n", | ||
1667 | + "import os\n", | ||
1668 | + "from tqdm import trange\n", | ||
1669 | + "\n", | ||
1670 | + "if training:\n", | ||
1671 | + " is_training = tf.placeholder(tf.bool, name=\"phase_train\")\n", | ||
1672 | + " handle_flag = tf.placeholder(tf.string, [], name='iterator_handle_flag')\n", | ||
1673 | + "\n", | ||
1674 | + " pred_boxes_flag = tf.placeholder(tf.float32, [1, None, None])\n", | ||
1675 | + " pred_scores_flag = tf.placeholder(tf.float32, [1, None, None])\n", | ||
1676 | + " gpu_nms_op = gpu_nms(pred_boxes_flag, pred_scores_flag, class_num, nms_topk, score_threshold, nms_threshold)\n", | ||
1677 | + "\n", | ||
1678 | + " ### tf.data pipeline\n", | ||
1679 | + " train_dataset = tf.data.TFRecordDataset(filenames=train_file, compression_type='GZIP')\n", | ||
1680 | + " train_dataset = train_dataset.shuffle(train_img_cnt)\n", | ||
1681 | + " train_dataset = train_dataset.batch(batch_size)\n", | ||
1682 | + " train_dataset = train_dataset.map(\n", | ||
1683 | + " lambda x: tf.py_func(get_batch_data,\n", | ||
1684 | + " inp=[x, class_num, img_size, anchors, True, multi_scale_train, use_mix_up, letterbox_resizing],\n", | ||
1685 | + " Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),\n", | ||
1686 | + " num_parallel_calls=num_threads\n", | ||
1687 | + " )\n", | ||
1688 | + " train_dataset = train_dataset.prefetch(prefetech_buffer)\n", | ||
1689 | + "\n", | ||
1690 | + " val_dataset = tf.data.TFRecordDataset(filenames=val_file, compression_type='GZIP')\n", | ||
1691 | + " val_dataset = val_dataset.batch(1)\n", | ||
1692 | + " val_dataset = val_dataset.map(\n", | ||
1693 | + " lambda x: tf.py_func(get_batch_data,\n", | ||
1694 | + " inp=[x, class_num, img_size, anchors, False, False, False, letterbox_resizing],\n", | ||
1695 | + " Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),\n", | ||
1696 | + " num_parallel_calls=num_threads\n", | ||
1697 | + " )\n", | ||
1698 | + " val_dataset.prefetch(prefetech_buffer)\n", | ||
1699 | + "\n", | ||
1700 | + " iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)\n", | ||
1701 | + " train_init_op = iterator.make_initializer(train_dataset)\n", | ||
1702 | + " val_init_op = iterator.make_initializer(val_dataset)\n", | ||
1703 | + "\n", | ||
1704 | + " image_ids, image, y_true_13, y_true_26, y_true_52 = iterator.get_next()\n", | ||
1705 | + " y_true = [y_true_13, y_true_26, y_true_52]\n", | ||
1706 | + "\n", | ||
1707 | + " image_ids.set_shape([None])\n", | ||
1708 | + " image.set_shape([None, None, None, 3])\n", | ||
1709 | + " for y in y_true:\n", | ||
1710 | + " y.set_shape([None, None, None, None, None])\n", | ||
1711 | + "\n", | ||
1712 | + "\n", | ||
1713 | + " ### Model definition\n", | ||
1714 | + " yolo_model = yolov3(class_num, anchors, use_label_smooth, use_focal_loss, batch_norm_decay, weight_decay, use_static_shape=False)\n", | ||
1715 | + "\n", | ||
1716 | + " with tf.variable_scope('yolov3'):\n", | ||
1717 | + " pred_feature_maps = yolo_model.forward(image, is_training=is_training)\n", | ||
1718 | + "\n", | ||
1719 | + " loss = yolo_model.compute_loss(pred_feature_maps, y_true)\n", | ||
1720 | + " y_pred = yolo_model.predict(pred_feature_maps)\n", | ||
1721 | + "\n", | ||
1722 | + " l2_loss = tf.losses.get_regularization_loss()\n", | ||
1723 | + "\n", | ||
1724 | + " saver_to_restore = tf.train.Saver(var_list=tf.contrib.framework.get_variables_to_restore(include=restore_include, exclude=restore_exclude))\n", | ||
1725 | + " update_vars = tf.contrib.framework.get_variables_to_restore(include=update_part)\n", | ||
1726 | + "\n", | ||
1727 | + "\n", | ||
1728 | + " global_step = tf.Variable(float(current_global_step), trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])\n", | ||
1729 | + " if use_warm_up:\n", | ||
1730 | + " learning_rate = tf.cond(tf.less(global_step, train_batch_num * warm_up_epoch), \n", | ||
1731 | + " lambda: learning_rate_init * global_step / (train_batch_num * warm_up_epoch),\n", | ||
1732 | + " lambda: config_learning_rate(global_step - train_batch_num * warm_up_epoch))\n", | ||
1733 | + " else:\n", | ||
1734 | + " learning_rate = config_learning_rate(global_step)\n", | ||
1735 | + "\n", | ||
1736 | + " optimizer = config_optimizer(optimizer_name, learning_rate)\n", | ||
1737 | + "\n", | ||
1738 | + " update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)\n", | ||
1739 | + "\n", | ||
1740 | + " with tf.control_dependencies(update_ops):\n", | ||
1741 | + " gvs = optimizer.compute_gradients(loss[0] + l2_loss, var_list=update_vars)\n", | ||
1742 | + " clip_grad_var = [gv if gv[0] is None else [\n", | ||
1743 | + " tf.clip_by_norm(gv[0], 100.), gv[1]] for gv in gvs]\n", | ||
1744 | + " train_op = optimizer.apply_gradients(clip_grad_var, global_step=global_step)\n", | ||
1745 | + "\n", | ||
1746 | + " if save_optimizer:\n", | ||
1747 | + " print('Saving optimizer parameters: ON')\n", | ||
1748 | + " saver_to_save = tf.train.Saver()\n", | ||
1749 | + " saver_best = tf.train.Saver()\n", | ||
1750 | + " else:\n", | ||
1751 | + " print('Saving optimizer parameters: OFF')\n", | ||
1752 | + "\n", | ||
1753 | + "\n", | ||
1754 | + " with tf.Session() as sess:\n", | ||
1755 | + " sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])\n", | ||
1756 | + "\n", | ||
1757 | + " if os.path.exists(restore_path):\n", | ||
1758 | + " saver_to_restore.restore(sess, restore_path)\n", | ||
1759 | + "\n", | ||
1760 | + " print('\\nStart training...: Total epoches =', total_epoches, '\\n')\n", | ||
1761 | + "\n", | ||
1762 | + " best_mAP = -np.Inf\n", | ||
1763 | + "\n", | ||
1764 | + " for epoch in range(total_epoches):\n", | ||
1765 | + " sess.run(train_init_op)\n", | ||
1766 | + " loss_total, loss_xy, loss_wh, loss_conf, loss_class = AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter()\n", | ||
1767 | + "\n", | ||
1768 | + " ### train part\n", | ||
1769 | + " for i in trange(train_batch_num):\n", | ||
1770 | + " _, __y_pred, __y_true, __loss, __global_step, __lr = sess.run(\n", | ||
1771 | + " [train_op, y_pred, y_true, loss, global_step, learning_rate],\n", | ||
1772 | + " feed_dict={is_training: True})\n", | ||
1773 | + "\n", | ||
1774 | + " loss_total.update(__loss[0], len(__y_pred[0]))\n", | ||
1775 | + " loss_xy.update(__loss[1], len(__y_pred[0]))\n", | ||
1776 | + " loss_wh.update(__loss[2], len(__y_pred[0]))\n", | ||
1777 | + " loss_conf.update(__loss[3], len(__y_pred[0]))\n", | ||
1778 | + " loss_class.update(__loss[4], len(__y_pred[0]))\n", | ||
1779 | + "\n", | ||
1780 | + " if __global_step % train_evaluation_step == 0 and __global_step > 0:\n", | ||
1781 | + " recall, precision = evaluate_on_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, __y_pred, __y_true, class_num, nms_threshold)\n", | ||
1782 | + "\n", | ||
1783 | + " info = \"Epoch: {}, global_step: {} | loss: total: {:.2f}, xy: {:.2f}, wh: {:.2f}, conf: {:.2f}, class: {:.2f} | \".format(\n", | ||
1784 | + " epoch, int(__global_step), loss_total.average, loss_xy.average, loss_wh.average, loss_conf.average, loss_class.average)\n", | ||
1785 | + " info += 'Last batch: rec: {:.3f}, prec: {:.3f} | lr: {:.5g}'.format(recall, precision, __lr)\n", | ||
1786 | + " print(info)\n", | ||
1787 | + " \n", | ||
1788 | + " if np.isnan(loss_total.average):\n", | ||
1789 | + " print('****' * 10)\n", | ||
1790 | + " raise ArithmeticError('Gradient exploded!')\n", | ||
1791 | + "\n", | ||
1792 | + " ## train end (saving parameters)\n", | ||
1793 | + " if save_optimizer and epoch % save_epoch == 0 and epoch > 0:\n", | ||
1794 | + " if loss_total.average <= 2.:\n", | ||
1795 | + " saver_to_save.save(sess, save_dir + 'model-epoch_{}_step_{}_loss_{:.4f}_lr_{:.5g}'.format(epoch, int(__global_step), loss_total.average, __lr))\n", | ||
1796 | + "\n", | ||
1797 | + " ### validation part\n", | ||
1798 | + " if epoch % val_evaluation_epoch == 0 and epoch >= warm_up_epoch:\n", | ||
1799 | + " sess.run(val_init_op)\n", | ||
1800 | + "\n", | ||
1801 | + " val_loss_total, val_loss_xy, val_loss_wh, val_loss_conf, val_loss_class = AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter()\n", | ||
1802 | + "\n", | ||
1803 | + " val_preds = []\n", | ||
1804 | + "\n", | ||
1805 | + " for j in trange(val_img_cnt):\n", | ||
1806 | + " __image_ids, __y_pred, __loss = sess.run([image_ids, y_pred, loss],\n", | ||
1807 | + " feed_dict={is_training: False})\n", | ||
1808 | + " pred_content = get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, __image_ids, __y_pred)\n", | ||
1809 | + " val_preds.extend(pred_content)\n", | ||
1810 | + " val_loss_total.update(__loss[0])\n", | ||
1811 | + " val_loss_xy.update(__loss[1])\n", | ||
1812 | + " val_loss_wh.update(__loss[2])\n", | ||
1813 | + " val_loss_conf.update(__loss[3])\n", | ||
1814 | + " val_loss_class.update(__loss[4])\n", | ||
1815 | + "\n", | ||
1816 | + " # calc mAP\n", | ||
1817 | + " rec_total, prec_total, ap_total = AverageMeter(), AverageMeter(), AverageMeter()\n", | ||
1818 | + " gt_dict = parse_gt_rec(val_file, 'GZIP', img_size, letterbox_resize)\n", | ||
1819 | + "\n", | ||
1820 | + " info = '======> Epoch: {}, global_step: {}, lr: {:.6g} <======\\n'.format(epoch, __global_step, __lr)\n", | ||
1821 | + "\n", | ||
1822 | + " for ii in range(class_num):\n", | ||
1823 | + " npos, nd, rec, prec, ap = voc_eval(gt_dict, val_preds, ii, iou_thres=eval_threshold, use_07_metric=use_voc_07_metric)\n", | ||
1824 | + " info += 'EVAL: Class {}: Recall: {:.4f}, Precision: {:.4f}, AP: {:.4f}\\n'.format(ii, rec, prec, ap)\n", | ||
1825 | + " rec_total.update(rec, npos)\n", | ||
1826 | + " prec_total.update(prec, nd)\n", | ||
1827 | + " ap_total.update(ap, 1)\n", | ||
1828 | + "\n", | ||
1829 | + " mAP = ap_total.average\n", | ||
1830 | + " info += 'EVAL: Recall: {:.4f}, Precison: {:.4f}, mAP: {:.4f}\\n'.format(rec_total.average, prec_total.average, mAP)\n", | ||
1831 | + " info += 'EVAL: loss: total: {:.2f}, xy: {:.2f}, wh: {:.2f}, conf: {:.2f}, class: {:.2f}\\n'.format(\n", | ||
1832 | + " val_loss_total.average, val_loss_xy.average, val_loss_wh.average, val_loss_conf.average, val_loss_class.average)\n", | ||
1833 | + " print(info)\n", | ||
1834 | + "\n", | ||
1835 | + " if save_optimizer and mAP > best_mAP:\n", | ||
1836 | + " best_mAP = mAP\n", | ||
1837 | + " saver_best.save(sess, save_dir + 'best_model_Epoch_{}_step_{}_mAP_{:.4f}_loss_{:.4f}_lr_{:.7g}'.format(\n", | ||
1838 | + " epoch, int(__global_step), best_mAP, val_loss_total.average, __lr))" | ||
1839 | + ], | ||
1840 | + "execution_count": 0, | ||
1841 | + "outputs": [] | ||
1842 | + }, | ||
1843 | + { | ||
1844 | + "cell_type": "code", | ||
1845 | + "metadata": { | ||
1846 | + "id": "HmoSmKIuOpyC", | ||
1847 | + "colab_type": "code", | ||
1848 | + "colab": {} | ||
1849 | + }, | ||
1850 | + "source": [ | ||
1851 | + "## evaluation (test)\n", | ||
1852 | + "\n", | ||
1853 | + "import argparse\n", | ||
1854 | + "\n", | ||
1855 | + "if not training:\n", | ||
1856 | + "\n", | ||
1857 | + " ### ArgumentParser\n", | ||
1858 | + " parser = argparse.ArgumentParser(description=\"YOLO-V3 eval procedure.\")\n", | ||
1859 | + "\n", | ||
1860 | + " # paths\n", | ||
1861 | + " parser.add_argument(\"--eval_file\", type=str, default=\"/content/gdrive/My Drive/yolo/data/test.tfrecord\",\n", | ||
1862 | + " help=\"The path of the validation or test txt file.\")\n", | ||
1863 | + "\n", | ||
1864 | + " parser.add_argument(\"--restore_path\", type=str, default=\"/content/gdrive/My Drive/yolo/data/darknet_weights/yolov3.ckpt\",\n", | ||
1865 | + " help=\"The path of the weights to restore.\")\n", | ||
1866 | + "\n", | ||
1867 | + " parser.add_argument(\"--anchor_path\", type=str, default=\"./content/gdrive/My Drive/yolo/data/yolo_anchors.txt\",\n", | ||
1868 | + " help=\"The path of the anchor txt file.\")\n", | ||
1869 | + "\n", | ||
1870 | + " parser.add_argument(\"--class_name_path\", type=str, default=\"/content/gdrive/My Drive/yolo/data/classes.txt\",\n", | ||
1871 | + " help=\"The path of the class names.\")\n", | ||
1872 | + "\n", | ||
1873 | + " # some numbers\n", | ||
1874 | + " parser.add_argument(\"--img_size\", nargs='*', type=int, default=[416, 416],\n", | ||
1875 | + " help=\"Resize the input image to `img_size`, size format: [width, height]\")\n", | ||
1876 | + "\n", | ||
1877 | + " parser.add_argument(\"--letterbox_resize\", type=lambda x: (str(x).lower() == 'true'), default=False,\n", | ||
1878 | + " help=\"Whether to use the letterbox resize, i.e., keep the original image aspect ratio.\")\n", | ||
1879 | + "\n", | ||
1880 | + " parser.add_argument(\"--num_threads\", type=int, default=10,\n", | ||
1881 | + " help=\"Number of threads for image processing used in tf.data pipeline.\")\n", | ||
1882 | + "\n", | ||
1883 | + " parser.add_argument(\"--prefetech_buffer\", type=int, default=5,\n", | ||
1884 | + " help=\"Prefetech_buffer used in tf.data pipeline.\")\n", | ||
1885 | + "\n", | ||
1886 | + " parser.add_argument(\"--nms_threshold\", type=float, default=0.45,\n", | ||
1887 | + " help=\"IOU threshold in nms operation.\")\n", | ||
1888 | + "\n", | ||
1889 | + " parser.add_argument(\"--score_threshold\", type=float, default=0.01,\n", | ||
1890 | + " help=\"Threshold of the probability of the classes in nms operation.\")\n", | ||
1891 | + "\n", | ||
1892 | + " parser.add_argument(\"--nms_topk\", type=int, default=400,\n", | ||
1893 | + " help=\"Keep at most nms_topk outputs after nms.\")\n", | ||
1894 | + "\n", | ||
1895 | + " parser.add_argument(\"--use_voc_07_metric\", type=lambda x: (str(x).lower() == 'true'), default=False,\n", | ||
1896 | + " help=\"Whether to use the voc 2007 mAP metrics.\")\n", | ||
1897 | + "\n", | ||
1898 | + " args = parser.parse_args()\n", | ||
1899 | + "\n", | ||
1900 | + " # args params\n", | ||
1901 | + " args.anchors = parse_anchors(args.anchor_path)\n", | ||
1902 | + " args.classes = read_class_names(args.class_name_path)\n", | ||
1903 | + " args.class_num = len(args.classes)\n", | ||
1904 | + " args.img_cnt = len(open(args.eval_file, 'r').readlines())\n", | ||
1905 | + "\n", | ||
1906 | + " # setting placeholders\n", | ||
1907 | + " is_training = tf.placeholder(dtype=tf.bool, name=\"phase_train\")\n", | ||
1908 | + " handle_flag = tf.placeholder(tf.string, [], name='iterator_handle_flag')\n", | ||
1909 | + " pred_boxes_flag = tf.placeholder(tf.float32, [1, None, None])\n", | ||
1910 | + " pred_scores_flag = tf.placeholder(tf.float32, [1, None, None])\n", | ||
1911 | + " gpu_nms_op = gpu_nms(pred_boxes_flag, pred_scores_flag, args.class_num, args.nms_topk, args.score_threshold, args.nms_threshold)\n", | ||
1912 | + "\n", | ||
1913 | + " ### tf.data pipeline\n", | ||
1914 | + " val_dataset = tf.data.TFRecordDataset(filenames=args.eval_file, compression_type='GZIP')\n", | ||
1915 | + " val_dataset = val_dataset.batch(1)\n", | ||
1916 | + " val_dataset = val_dataset.map(\n", | ||
1917 | + " lambda x: tf.py_func(get_batch_data, [x, args.class_num, args.img_size, args.anchors, False, False, False, args.letterbox_resize], [tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),\n", | ||
1918 | + " num_parallel_calls=args.num_threads\n", | ||
1919 | + " )\n", | ||
1920 | + " val_dataset.prefetch(args.prefetech_buffer)\n", | ||
1921 | + " iterator = val_dataset.make_one_shot_iterator()\n", | ||
1922 | + "\n", | ||
1923 | + " image_ids, image, y_true_13, y_true_26, y_true_52 = iterator.get_next()\n", | ||
1924 | + " image_ids.set_shape([None])\n", | ||
1925 | + " y_true = [y_true_13, y_true_26, y_true_52]\n", | ||
1926 | + " image.set_shape([None, args.img_size[1], args.img_size[0], 3])\n", | ||
1927 | + " for y in y_true:\n", | ||
1928 | + " y.set_shape([None, None, None, None, None])\n", | ||
1929 | + "\n", | ||
1930 | + " ### Model definition\n", | ||
1931 | + " yolo_model = yolov3(args.class_num, args.anchors)\n", | ||
1932 | + " with tf.variable_scope('yolov3'):\n", | ||
1933 | + " pred_feature_maps = yolo_model.forward(image, is_training=is_training)\n", | ||
1934 | + " loss = yolo_model.compute_loss(pred_feature_maps, y_true)\n", | ||
1935 | + " y_pred = yolo_model.predict(pred_feature_maps)\n", | ||
1936 | + "\n", | ||
1937 | + " saver_to_restore = tf.train.Saver()\n", | ||
1938 | + "\n", | ||
1939 | + "\n", | ||
1940 | + " with tf.Session() as sess:\n", | ||
1941 | + " sess.run([tf.global_variables_initializer()])\n", | ||
1942 | + " if os.path.exists(args.restore_path):\n", | ||
1943 | + " saver_to_restore.restore(sess, args.restore_path)\n", | ||
1944 | + " else:\n", | ||
1945 | + " raise ValueError('there is no model to evaluate. You should move/create the checkpoint file to restore path')\n", | ||
1946 | + "\n", | ||
1947 | + " print('\\nStart evaluation...\\n')\n", | ||
1948 | + "\n", | ||
1949 | + " val_loss_total, val_loss_xy, val_loss_wh, val_loss_conf, val_loss_class = \\\n", | ||
1950 | + " AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter()\n", | ||
1951 | + " val_preds = []\n", | ||
1952 | + "\n", | ||
1953 | + " for j in trange(args.img_cnt):\n", | ||
1954 | + " __image_ids, __y_pred, __loss = sess.run([image_ids, y_pred, loss], feed_dict={is_training: False})\n", | ||
1955 | + " pred_content = get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, __image_ids, __y_pred)\n", | ||
1956 | + "\n", | ||
1957 | + " val_preds.extend(pred_content)\n", | ||
1958 | + " val_loss_total.update(__loss[0])\n", | ||
1959 | + " val_loss_xy.update(__loss[1])\n", | ||
1960 | + " val_loss_wh.update(__loss[2])\n", | ||
1961 | + " val_loss_conf.update(__loss[3])\n", | ||
1962 | + " val_loss_class.update(__loss[4])\n", | ||
1963 | + "\n", | ||
1964 | + " rec_total, prec_total, ap_total = AverageMeter(), AverageMeter(), AverageMeter()\n", | ||
1965 | + " gt_dict = parse_gt_rec(args.eval_file, 'GZIP', args.img_size, args.letterbox_resize)\n", | ||
1966 | + " print('mAP eval:')\n", | ||
1967 | + " for ii in range(args.class_num):\n", | ||
1968 | + " npos, nd, rec, prec, ap = voc_eval(gt_dict, val_preds, ii, iou_thres=0.5, use_07_metric=args.use_voc_07_metric)\n", | ||
1969 | + " rec_total.update(rec, npos)\n", | ||
1970 | + " prec_total.update(prec, nd)\n", | ||
1971 | + " ap_total.update(ap, 1)\n", | ||
1972 | + " print('Class {}: Recall: {:.4f}, Precision: {:.4f}, AP: {:.4f}'.format(ii, rec, prec, ap))\n", | ||
1973 | + "\n", | ||
1974 | + " mAP = ap_total.average\n", | ||
1975 | + " print('final mAP: {:.4f}'.format(mAP))\n", | ||
1976 | + " print(\"recall: {:.3f}, precision: {:.3f}\".format(rec_total.average, prec_total.average))\n", | ||
1977 | + " print(\"total_loss: {:.3f}, loss_xy: {:.3f}, loss_wh: {:.3f}, loss_conf: {:.3f}, loss_class: {:.3f}\".format(\n", | ||
1978 | + " val_loss_total.average, val_loss_xy.average, val_loss_wh.average, val_loss_conf.average, val_loss_class.average\n", | ||
1979 | + " ))" | ||
1980 | + ], | ||
1981 | + "execution_count": 0, | ||
1982 | + "outputs": [] | ||
1983 | + } | ||
1984 | + ] | ||
1985 | +} | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment