downloader.py
6.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import os
import cv2
from tqdm import tqdm
from modules.utils import images_options
from modules.utils import bcolors as bc
from multiprocessing.dummy import Pool as ThreadPool
import numpy as np
# from modules.bounding_boxes import row_num
# print('names:',names)
def download(args, df_val, folder, dataset_dir, class_name, class_code, class_list=None, threads = 20):
'''
Manage the download of the images and the label maker.
:param args: argument parser.
:param df_val: DataFrame Values
:param folder: train, validation or test
:param dataset_dir: self explanatory
:param class_name: self explanatory
:param class_code: self explanatory
:param class_list: list of the class if multiclasses is activated
:param threads: number of threads
:return: None
'''
if os.name == 'posix':
rows, columns = os.popen('stty size', 'r').read().split()
elif os.name == 'nt':
try:
columns, rows = os.get_terminal_size(0)
except OSError:
columns, rows = os.get_terminal_size(1)
else:
columns = 50
l = int((int(columns) - len(class_name))/2)
print ('\n' + bc.HEADER + '-'*l + class_name + '-'*l + bc.ENDC)
print(bc.INFO + 'Downloading {} images.'.format(args.type_csv) + bc.ENDC)
df_val_images = images_options(df_val, args)
images_list = df_val_images['ImageID'][df_val_images.LabelName == class_code].values
images_list = set(images_list)
print(bc.INFO + '[INFO] Found {} online images for {}.'.format(len(images_list), folder) + bc.ENDC)
if args.limit is not None:
import itertools
print(bc.INFO + 'Limiting to {} images.'.format(args.limit) + bc.ENDC)
images_list = set(itertools.islice(images_list, args.limit))
if class_list is not None:
class_name_list = '_'.join(class_list)
else:
class_name_list = class_name
download_img(folder, dataset_dir, class_name_list, images_list, threads)
if not args.sub:
get_label(folder, dataset_dir, class_name, class_code, df_val, class_name_list, args)
def download_img(folder, dataset_dir, class_name, images_list, threads):
'''
Download the images.
:param folder: train, validation or test
:param dataset_dir: self explanatory
:param class_name: self explanatory
:param images_list: list of the images to download
:param threads: number of threads
:return: None
'''
image_dir = folder
download_dir = os.path.join(dataset_dir, image_dir, class_name)
downloaded_images_list = [f.split('.')[0] for f in os.listdir(download_dir)]
print("all images:{},already downloaded:{}".format(len(images_list),len(downloaded_images_list)))
images_list = list(set(images_list) - set(downloaded_images_list));
pool = ThreadPool(threads)
if len(images_list) > 0:
print(bc.INFO + 'Download of {} images in {}.'.format(len(images_list), folder) + bc.ENDC)
commands = []
for image in images_list:
path = image_dir + '/' + str(image) + '.jpg ' + '"' + download_dir + '"'
command = 'aws s3 --no-sign-request --only-show-errors cp s3://open-images-dataset/' + path
commands.append(command)
list(tqdm(pool.imap(os.system, commands), total = len(commands) ))
print(bc.INFO + 'Done!' + bc.ENDC)
pool.close()
pool.join()
else:
print(bc.INFO + 'All images already downloaded.' +bc.ENDC)
def xyxy2xywh(x):
# Convert bounding box format from [x1, y1, x2, y2] to [x, y, w, h]
# y = torch.zeros_like(x) if isinstance(x, torch.Tensor) else np.zeros_like(x)
y = [0.,0.,0.,0.]
y[0] = (x[0] + x[2])/2
y[1] = (x[1] + x[3])/2
y[2] = x[:][2] - x[:][0]
y[3] = x[:][3] - x[:][1]
return y
def get_label(folder, dataset_dir, class_name, class_code, df_val, class_list, args):
'''
Make the label.txt files
:param folder: train, validation or test
:param dataset_dir: self explanatory
:param class_name: self explanatory
:param class_code: self explanatory
:param df_val: DataFrame values
:param class_list: list of the class if multiclasses is activated
:return: None
'''
from modules.bounding_boxes import row_num
if not args.noLabels:
# from modules.bounding_boxes import row_num
print(bc.INFO + 'Creating labels for {} of {}.'.format(class_name, folder) + bc.ENDC)
image_dir = folder
if class_list is not None:
download_dir = os.path.join(dataset_dir, image_dir, class_list)
label_dir = os.path.join(dataset_dir, folder, class_list, 'Label')
else:
download_dir = os.path.join(dataset_dir, image_dir, class_name)
label_dir = os.path.join(dataset_dir, folder, class_name, 'Label')
downloaded_images_list = [f.split('.')[0] for f in os.listdir(download_dir) if f.endswith('.jpg')]
images_label_list = list(set(downloaded_images_list))
groups = df_val[(df_val.LabelName == class_code)].groupby(df_val.ImageID)
for image in images_label_list:
current_image_path = os.path.join(download_dir, image + '.jpg')
# print(current_image_path)
# dataset_image = cv2.imread(current_image_path)
file_name = str(image.split('.')[0]) + '.txt'
file_path = os.path.join(label_dir, file_name)
boxes = groups.get_group(image.split('.')[0])[['XMin', 'YMin','XMax','YMax']].values.tolist()
file_name = str(image.split('.')[0]) + '.txt'
file_path = os.path.join(label_dir, file_name)
if os.path.isfile(file_path):
f = open(file_path, 'a')
# print("appending to FILE completed")
# f.close()
else:
f = open(file_path, 'w')
# print("{} {} {} {} {}".format(row_num,x,y,w,h),f)
# f.close()
for box in boxes:
x,y,w,h = xyxy2xywh(box)
# print(row_num,x,y,w,h,image)
print("{} {} {} {} {}".format(row_num,x,y,w,h),file=f)
print(bc.INFO + 'Labels creation completed.' + bc.ENDC)