# Author: Aqeel Anwar(ICSRL)
# Created: 7/30/2020, 1:44 PM
# Email: aqeel.anwar@gatech.edu
# Code resued from https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url
# Make sure you run this from parent folder and not from utils folder i.e.
# python utils/fetch_dataset.py
import requests, os
from zipfile import ZipFile
import argparse
import urllib
# Command-line interface. NOTE: arguments are parsed at import time, so this
# module is meant to be executed as a script, not imported by other code.
parser = argparse.ArgumentParser(
description="Download dataset - Python code to download associated datasets"
)
parser.add_argument(
"--dataset",
type=str,
default="mfr2",
help="Name of the dataset - Details on available datasets can be found at GitHub Page",
)
# Parsed CLI namespace; `args.dataset` selects which entry of the GitHub
# download-links file to fetch in the __main__ block below.
args = parser.parse_args()
def download_file_from_google_drive(id, destination):
    """Download a (possibly large) file from Google Drive to *destination*.

    Implements the legacy two-step Drive flow: an initial streaming request,
    followed by a second request carrying the confirmation token when Drive
    warns that the file is too large for virus scanning.
    """
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()

    response = session.get(URL, params={"id": id}, stream=True)
    confirm = get_confirm_token(response)
    if confirm:
        # Re-issue the request with the confirmation token attached.
        response = session.get(
            URL, params={"id": id, "confirm": confirm}, stream=True
        )

    save_response_content(response, destination)
def get_confirm_token(response):
    """Return Google Drive's download-warning cookie value, or ``None``.

    Drive sets a cookie whose name starts with ``download_warning`` when a
    large file needs an extra confirmation step; absence of such a cookie
    means the download can proceed directly.
    """
    matching = (
        value
        for name, value in response.cookies.items()
        if name.startswith("download_warning")
    )
    return next(matching, None)
def save_response_content(response, destination):
    """Stream the HTTP response body into the file at *destination*.

    Writes in 32 KiB chunks so arbitrarily large downloads never have to
    fit in memory.
    """
    chunk_size = 32768
    print(destination)
    with open(destination, "wb") as out:
        for block in response.iter_content(chunk_size):
            # Keep-alive chunks arrive as empty byte strings; skip them.
            if block:
                out.write(block)
def download(t_url):
    """Fetch *t_url* and return its body split on literal backslash-n pairs.

    The body is read as bytes and converted with ``str()``, producing text
    like ``"b'line1\\nline2'"`` — so splitting on the two-character sequence
    backslash+n recovers the original lines (with the ``b'`` prefix still on
    the first element). The caller strips those artifacts.
    """
    # BUG FIX: the top of the file only does `import urllib`, which does not
    # load the `urllib.request` submodule; importing it explicitly here makes
    # the attribute access reliable.
    import urllib.request

    # BUG FIX: the response was previously never closed, leaking the socket;
    # urlopen() responses are context managers, so close via `with`.
    with urllib.request.urlopen(t_url) as response:
        data = response.read()
    txt_str = str(data)
    return txt_str.split("\\n")
def Convert(lst):
    """Pair consecutive elements of *lst* into a dict.

    ``["k1", "v1", "k2", "v2"]`` becomes ``{"k1": "v1", "k2": "v2"}``; a
    trailing unpaired element is silently dropped.
    """
    keys = lst[0::2]
    values = lst[1::2]
    return dict(zip(keys, values))
if __name__ == "__main__":
    # Fetch the latest download_links.txt file from GitHub and turn the
    # "name: file_id" lines into a {name: file_id} dict. The replace chain
    # strips the str(bytes) artifacts produced by download().
    link = "https://raw.githubusercontent.com/aqeelanwar/MaskTheFace/master/datasets/download_links.txt"
    links_dict = Convert(
        download(link)[0]
        .replace(":", "\n")
        .replace("b'", "")
        .replace("'", "")
        .replace(" ", "")
        .split("\n")
    )
    # Friendly error instead of a bare KeyError on an unknown dataset name.
    try:
        file_id = links_dict[args.dataset]
    except KeyError:
        raise SystemExit(
            "Unknown dataset '{}'. Available: {}".format(
                args.dataset, ", ".join(sorted(links_dict))
            )
        )
    # BUG FIX: the original used "datasets\_.zip" — "\_" is an invalid escape
    # kept as a literal backslash, so on POSIX the archive was written to a
    # file literally named "datasets\_.zip" instead of into datasets/.
    # Build the path portably instead.
    destination = os.path.join("datasets", "_.zip")
    print("Downloading: ", args.dataset)
    download_file_from_google_drive(file_id, destination)
    print("Extracting: ", args.dataset)
    with ZipFile(destination, "r") as zipObj:
        # Extract all the contents of the zip into the datasets/ directory
        # (os.path.dirname is portable, unlike rsplit on os.path.sep).
        zipObj.extractall(os.path.dirname(destination))
    # Remove the archive once its contents are extracted.
    os.remove(destination)