Showing 26 changed files with 1013 additions and 3 deletions
@@ -5,6 +5,8 @@ TEST_WAV_DIR = 'test_wavs'
 
 # Feature path
 TRAIN_FEAT_DIR = 'feat_logfbank_nfilt40/train'
+# TRAIN_FEAT_DIR = '/test/merge_dataset'
+# TRAIN_FEAT_DIR = '/test/trainFeature'
 TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
 
 # Context window size
Speaker_Recognition/configure1_merge.py  0 → 100644

# Wave path
TRAIN_WAV_DIR = '/home/admin/Desktop/read_25h_2/train'
DEV_WAV_DIR = '/home/admin/Desktop/read_25h_2/dev'
TEST_WAV_DIR = 'test_wavs'

# Feature path
# TRAIN_FEAT_DIR = '/test/zeroth/train_data_01/003'
TRAIN_FEAT_DIR = '/test/merge_train_dataset'
# TRAIN_FEAT_DIR = '/test/trainFeature'
# TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
TEST_FEAT_DIR = '/test/merge_test_dataset'
# Context window size
NUM_WIN_SIZE = 100  # 10

# Settings for feature extraction
USE_LOGSCALE = True
USE_DELTA = False
USE_SCALE = False
SAMPLE_RATE = 16000
FILTER_BANK = 40
\ No newline at end of file
Speaker_Recognition/configure1_zeroth.py  0 → 100644

# Wave path
TRAIN_WAV_DIR = '/home/admin/Desktop/read_25h_2/train'
DEV_WAV_DIR = '/home/admin/Desktop/read_25h_2/dev'
TEST_WAV_DIR = 'test_wavs'

# Feature path
# TRAIN_FEAT_DIR = '/test/zeroth/train_data_01/003'
TRAIN_FEAT_DIR = '/test/zeroth_train_dataset'
# TRAIN_FEAT_DIR = '/test/trainFeature'
# TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
TEST_FEAT_DIR = '/test/zeroth_test_dataset'
# Context window size
NUM_WIN_SIZE = 100  # 10

# Settings for feature extraction
USE_LOGSCALE = True
USE_DELTA = False
USE_SCALE = False
SAMPLE_RATE = 16000
FILTER_BANK = 40
\ No newline at end of file
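The two new configuration variants differ only in their TRAIN_FEAT_DIR/TEST_FEAT_DIR values, while the scripts in this commit all hard-code `import configure as c`. A minimal sketch of how a variant could be selected without copying it over configure.py (the alias import is my assumption, not something the commit itself does):

import configure1_zeroth as c   # or: import configure1_merge as c

print(c.TRAIN_FEAT_DIR)  # '/test/zeroth_train_dataset'
print(c.TEST_FEAT_DIR)   # '/test/zeroth_test_dataset'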
Speaker_Recognition/enroll1.py  0 → 100644

import torch
import torch.nn.functional as F
from torch.autograd import Variable

import pandas as pd
import math
import os
import configure as c

from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model1 import background_resnet

def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    # original saved file with DataParallel
    checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth')
    # create new OrderedDict that does not contain `module.`
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

def split_enroll_and_test(dataroot_dir):
    DB_all = read_feats_structure(dataroot_dir)
    enroll_DB = pd.DataFrame()
    test_DB = pd.DataFrame()

    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p')]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB

def get_embeddings(use_cuda, filename, model, test_frames):
    input, label = read_MFB(filename)  # input size:(n_frames, n_dims)

    tot_segments = math.ceil(len(input)/test_frames)  # total number of segments with 'test_frames'
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            temp_input = input[i*test_frames:i*test_frames+test_frames]

            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size:(1, 1, n_dims, n_frames)

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation

def l2_norm(input, alpha):
    input_size = input.size()  # size:(n_frames, dim)
    buffer = torch.pow(input, 2)  # 2 denotes a squared operation. size:(n_frames, dim)
    normp = torch.sum(buffer, 1).add_(1e-10)  # size:(n_frames)
    norm = torch.sqrt(normp)  # size:(n_frames)
    _output = torch.div(input, norm.view(-1, 1).expand_as(input))
    output = _output.view(input_size)
    # Multiply by alpha = 10 as suggested in https://arxiv.org/pdf/1703.09507.pdf
    output = output * alpha
    return output

def enroll_per_spk(use_cuda, test_frames, model, DB, embedding_dir):
    """
    Output the averaged d-vector for each speaker (enrollment)
    Return the dictionary (length of n_spk)
    """
    n_files = len(DB)  # 10
    enroll_speaker_list = sorted(set(DB['speaker_id']))

    embeddings = {}

    # Aggregates all the activations
    print("Start to aggregate all the d-vectors per enroll speaker")

    for i in range(n_files):
        filename = DB['filename'][i]
        spk = DB['speaker_id'][i]

        activation = get_embeddings(use_cuda, filename, model, test_frames)
        if spk in embeddings:
            embeddings[spk] += activation
        else:
            embeddings[spk] = activation

        print("Aggregates the activation (spk : %s)" % (spk))

    if not os.path.exists(embedding_dir):
        os.makedirs(embedding_dir)

    # Save the embeddings
    for spk_index in enroll_speaker_list:
        embedding_path = os.path.join(embedding_dir, spk_index+'.pth')
        torch.save(embeddings[spk_index], embedding_path)
        print("Save the embeddings for %s" % (spk_index))
    return embeddings

def main():

    # Settings
    use_cuda = True
    log_dir = 'new_model1'
    embedding_size = 128
    cp_num = 24  # Which checkpoint to use?
    n_classes = 241
    test_frames = 200

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for enroll DB
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Where to save embeddings
    embedding_dir = 'enroll_embeddings1'

    # Perform the enrollment and save the results
    enroll_per_spk(use_cuda, test_frames, model, enroll_DB, embedding_dir)

    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """

if __name__ == '__main__':
    main()
\ No newline at end of file
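A side note on l2_norm above: it reproduces torch.nn.functional.normalize up to the 1e-10 epsilon term, and because cosine similarity is scale-invariant, the fact that enroll_per_spk sums per-speaker activations rather than averaging them (despite its docstring) does not change downstream cosine scores. A quick sketch, assuming enroll1.py is importable from the repository root:

import torch
import torch.nn.functional as F
from enroll1 import l2_norm

x = torch.randn(4, 128)
# l2_norm(x, 1) matches row-wise L2 normalization
print(torch.allclose(l2_norm(x, 1), F.normalize(x, p=2, dim=1), atol=1e-5))  # True
# Scaling an embedding leaves its cosine similarity unchanged
a, b = torch.randn(1, 128), torch.randn(1, 128)
print(torch.allclose(F.cosine_similarity(a, b), F.cosine_similarity(3 * a, b)))  # True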
No preview for 15 changed files (binary or otherwise unsupported file types).
@@ -123,10 +123,10 @@ def main():
     """
 
     spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',\
-                '229M2031', '230M4087', '233F4013', '236M3043', '240M3063']
+                '229M2031', '230M4087', '233F4013', '236M3043', '240M3063','777M7777','778M8777']
 
     # Set the test speaker
-    test_speaker = '230M4087'
+    test_speaker = '778M8777'
 
     test_path = os.path.join(test_dir, test_speaker, 'test.p')
 
@@ -134,4 +134,4 @@ def main():
     best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
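The two IDs added to spk_list, '777M7777' and '778M8777', are only scorable if matching embedding files exist, since identification looks each speaker up in a dictionary keyed by the saved file names. A hypothetical pre-flight check (the directory name is taken from the new *1 scripts; the modified script may use a different one):

import os

for spk in ['777M7777', '778M8777']:
    path = os.path.join('enroll_embeddings1', spk + '.pth')
    print(spk, 'enrolled' if os.path.exists(path) else 'embedding missing')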
Speaker_Recognition/identification1.py  0 → 100644

import torch
import torch.nn.functional as F
from torch.autograd import Variable

import pandas as pd
import math
import os
import configure as c

from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model1 import background_resnet

def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    # original saved file with DataParallel
    checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth')
    # create new OrderedDict that does not contain `module.`
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

def split_enroll_and_test(dataroot_dir):
    DB_all = read_feats_structure(dataroot_dir)
    enroll_DB = pd.DataFrame()
    test_DB = pd.DataFrame()

    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p')]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB

def load_enroll_embeddings(embedding_dir):
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk = f.replace('.pth', '')
        # Select the speakers who are in the 'enroll_spk_list'
        embedding_path = os.path.join(embedding_dir, f)
        tmp_embeddings = torch.load(embedding_path)
        embeddings[spk] = tmp_embeddings

    return embeddings

def get_embeddings(use_cuda, filename, model, test_frames):
    input, label = read_MFB(filename)  # input size:(n_frames, n_dims)

    tot_segments = math.ceil(len(input)/test_frames)  # total number of segments with 'test_frames'
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            temp_input = input[i*test_frames:i*test_frames+test_frames]

            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size:(1, 1, n_dims, n_frames)

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation

def l2_norm(input, alpha):
    input_size = input.size()  # size:(n_frames, dim)
    buffer = torch.pow(input, 2)  # 2 denotes a squared operation. size:(n_frames, dim)
    normp = torch.sum(buffer, 1).add_(1e-10)  # size:(n_frames)
    norm = torch.sqrt(normp)  # size:(n_frames)
    _output = torch.div(input, norm.view(-1, 1).expand_as(input))
    output = _output.view(input_size)
    # Multiply by alpha = 10 as suggested in https://arxiv.org/pdf/1703.09507.pdf
    output = output * alpha
    return output

def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = -10**8
    best_spk = None
    for spk in spk_list:
        score = F.cosine_similarity(test_embedding, embeddings[spk])
        score = score.data.cpu().numpy()
        if score > max_score:
            max_score = score
            best_spk = spk
    #print("Speaker identification result : %s" %best_spk)
    true_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk

def main():

    log_dir = 'new_model1'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings1'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    use_cuda = True  # Use cuda or not
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 30  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """

    spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',\
                '229M2031', '230M4087', '233F4013', '236M3043', '240M3063','777M7777','778M8777']

    # Set the test speaker
    test_speaker = '213F5100'

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)

if __name__ == '__main__':
    main()
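main() above scores a single hard-coded test speaker. A hypothetical extension that loops over every speaker in spk_list and tallies accuracy, assuming each entry has a feat_logfbank_nfilt40/test/<spk>/test.p feature file; this would stand in for the last four lines of main():

n_correct = 0
for test_speaker in spk_list:
    test_path = os.path.join(test_dir, test_speaker, 'test.p')
    best_spk = perform_identification(use_cuda, model, embeddings,
                                      test_path, test_frames, spk_list)
    n_correct += int(best_spk == test_speaker)
print('Identification accuracy: %d/%d' % (n_correct, len(spk_list)))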
Speaker_Recognition/model/model1.py  0 → 100644

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
import model.resnet as resnet


class background_resnet(nn.Module):
    def __init__(self, embedding_size, num_classes, backbone='resnet18'):
        super(background_resnet, self).__init__()
        self.backbone = backbone
        # copying modules from pretrained models
        if backbone == 'resnet50':
            self.pretrained = resnet.resnet50(pretrained=False)
        elif backbone == 'resnet101':
            self.pretrained = resnet.resnet101(pretrained=False)
        elif backbone == 'resnet152':
            self.pretrained = resnet.resnet152(pretrained=False)
        elif backbone == 'resnet18':
            self.pretrained = resnet.resnet18(pretrained=False)
        elif backbone == 'resnet34':
            self.pretrained = resnet.resnet34(pretrained=False)
        else:
            raise RuntimeError('unknown backbone: {}'.format(backbone))

        self.fc0 = nn.Linear(128, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        # input x: minibatch x 1 x 40 x 40
        x = self.pretrained.conv1(x)
        x = self.pretrained.bn1(x)
        x = self.pretrained.relu(x)

        x = self.pretrained.layer1(x)
        x = self.pretrained.layer2(x)
        x = self.pretrained.layer3(x)
        x = self.pretrained.layer4(x)

        out = F.adaptive_avg_pool2d(x, 1)  # [batch, 128, 1, 1]
        out = torch.squeeze(out)  # [batch, n_embed]
        # flatten the out so that the fully connected layer can be connected from here
        out = out.view(x.size(0), -1)  # (n_batch, n_embed)
        spk_embedding = self.fc0(out)
        out = F.relu(self.bn0(spk_embedding))  # [batch, n_embed]
        out = self.last(out)

        return spk_embedding, out
\ No newline at end of file
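A quick shape check of the forward pass (my sketch; it assumes the repository's model/resnet.py backbone that model1.py imports, with its stride-1, 16-channel stem as in resnet1.py further below). The model returns the pre-classifier 128-dim embedding alongside the class logits:

import torch
from model.model1 import background_resnet

model = background_resnet(embedding_size=128, num_classes=241)
model.eval()
with torch.no_grad():
    # (batch, 1, n_filts, n_frames); adaptive pooling makes the frame count flexible
    emb, logits = model(torch.randn(2, 1, 40, 100))
print(emb.shape, logits.shape)  # torch.Size([2, 128]) torch.Size([2, 241])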
@@ -113,6 +113,7 @@ class ResNet(nn.Module):
         self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
         self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
         self.layer4 = self._make_layer(block, 128, layers[3], stride=2)
+
         self.avgpool = nn.AvgPool2d(1, stride=1)
         self.fc = nn.Linear(128 * block.expansion, num_classes)
 
Speaker_Recognition/model/resnet1.py  0 → 100644

"""Imported from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
and added support for the 1x32x32 mel spectrogram for the speech recognition.
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun: Deep Residual Learning for Image Recognition
https://arxiv.org/abs/1512.03385
"""

import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo


__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']


model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000, in_channels=1):
        self.inplanes = 16
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=7, stride=1, padding=3,
                               bias=False)  # ori : stride = 2
        self.bn1 = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 16, layers[0])
        self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 128, layers[3], stride=2)
        self.layer5 = self._make_layer(block, 256, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(1, stride=1)
        self.fc = nn.Linear(128 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x


def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2, 2], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return model


def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(BasicBlock, [3, 4, 6, 3, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    return model


def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return model


def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return model


def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return model
\ No newline at end of file
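Worth noting in this variant: layer5 is constructed but never called in forward(), and resnet18/resnet34 now pass five block counts although only layers[0..3] are consumed; forward() itself also assumes the features reach the fc layer as 128 values, which does not hold for typical 40-band spectrogram inputs since AvgPool2d(1) performs no spatial reduction. The backbone is therefore most useful the way model1.py consumes it, layer by layer with adaptive pooling; a sketch under that assumption:

import torch
import torch.nn.functional as F
import model.resnet1 as resnet1

net = resnet1.resnet18(pretrained=False)  # BasicBlock; only layers[0..3] are used below
x = torch.randn(2, 1, 40, 100)            # (batch, channel, n_filts, n_frames)
x = net.relu(net.bn1(net.conv1(x)))       # stride-1 stem keeps the 40-mel axis
x = net.layer4(net.layer3(net.layer2(net.layer1(x))))
feat = F.adaptive_avg_pool2d(x, 1).flatten(1)  # (2, 128), matching fc0 in model1.py
print(feat.shape)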
Speaker_Recognition/train1.py  0 → 100644

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

import time
import os
import numpy as np
import configure as c
import pandas as pd
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
from model.model1 import background_resnet
import matplotlib.pyplot as plt
import pandas as pd

def load_dataset(val_ratio):
    # Load training set and validation set

    # Split training set into training set and validation set according to "val_ratio"
    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)

    file_loader = read_MFB  # numpy array:(n_frames, n_dims)

    transform = transforms.Compose([
        TruncatedInputfromMFB(),  # numpy array:(1, n_frames, n_dims)
        ToTensorInput()  # torch tensor:(1, n_dims, n_frames)
    ])
    transform_T = ToTensorDevInput()

    speaker_list = sorted(set(train_DB['speaker_id']))  # len(speaker_list) == n_speakers
    spk_to_idx = {spk: i for i, spk in enumerate(speaker_list)}

    train_dataset = DvectorDataset(DB=train_DB, loader=file_loader, transform=transform, spk_to_idx=spk_to_idx)
    valid_dataset = DvectorDataset(DB=valid_DB, loader=file_loader, transform=transform_T, spk_to_idx=spk_to_idx)

    n_classes = len(speaker_list)  # How many speakers? 240
    return train_dataset, valid_dataset, n_classes

def split_train_dev(train_feat_dir, valid_ratio):
    train_valid_DB = read_feats_structure(train_feat_dir)
    total_len = len(train_valid_DB)  # 148642
    valid_len = int(total_len * valid_ratio/100.)
    train_len = total_len - valid_len
    shuffled_train_valid_DB = train_valid_DB.sample(frac=1).reset_index(drop=True)
    # Split the DB into train and valid set
    train_DB = shuffled_train_valid_DB.iloc[:train_len]
    valid_DB = shuffled_train_valid_DB.iloc[train_len:]
    # Reset the index
    train_DB = train_DB.reset_index(drop=True)
    valid_DB = valid_DB.reset_index(drop=True)
    print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
    print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
    print('Total %d utts' %(total_len))

    return train_DB, valid_DB

def main():
    # Set hyperparameters
    use_cuda = True  # use gpu or cpu
    val_ratio = 10  # Percentage of validation set
    embedding_size = 128
    start = 1  # Start epoch
    n_epochs = 30  # How many epochs?
    end = start + n_epochs  # Last epoch

    lr = 1e-1  # Initial learning rate
    wd = 1e-4  # Weight decay (L2 penalty)
    optimizer_type = 'sgd'  # ex) sgd, adam, adagrad

    batch_size = 64  # Batch size for training
    valid_batch_size = 16  # Batch size for validation
    use_shuffle = True  # Shuffle for training or not

    # Load dataset
    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)

    # print the experiment configuration
    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))

    log_dir = 'new_model1'  # where to save checkpoints

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # instantiate model and initialize weights
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)

    if use_cuda:
        model.cuda()

    # define loss function (criterion), optimizer and scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = create_optimizer(optimizer_type, model, lr, wd)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=use_shuffle)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                               batch_size=valid_batch_size,
                                               shuffle=False,
                                               collate_fn=collate_fn_feat_padded)

    # to track the average training loss per epoch as the model trains
    avg_train_losses = []
    # to track the average validation loss per epoch as the model trains
    avg_valid_losses = []

    for epoch in range(start, end):

        # train for one epoch
        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)

        # evaluate on validation set
        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)

        scheduler.step(valid_loss, epoch)

        # calculate average loss over an epoch
        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)
        # do checkpointing
        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()},
                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))

    # find position of lowest validation loss
    minposs = avg_valid_losses.index(min(avg_valid_losses))+1
    print('Lowest validation loss at epoch %d' %minposs)

    # visualize the loss and learning rate as the network trained
    visualize_the_losses(avg_train_losses, avg_valid_losses)


def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
    batch_time = AverageMeter()
    losses = AverageMeter()
    train_acc = AverageMeter()

    n_correct, n_total = 0, 0
    log_interval = 84
    # switch to train mode
    model.train()

    end = time.time()
    # pbar = tqdm(enumerate(train_loader))
    for batch_idx, (data) in enumerate(train_loader):
        inputs, targets = data  # target size:(batch size,1), input size:(batch size, 1, dim, win)
        targets = targets.view(-1)  # target size:(batch size)
        current_sample = inputs.size(0)  # batch size

        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        _, output = model(inputs)  # out size:(batch size, #classes), for softmax

        # calculate accuracy of predictions in the current batch
        n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item()
        n_total += current_sample
        train_acc_temp = 100. * n_correct / n_total
        train_acc.update(train_acc_temp, inputs.size(0))

        loss = criterion(output, targets)
        losses.update(loss.item(), inputs.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % log_interval == 0:
            print(
                'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Loss {loss.avg:.4f}\t'
                'Acc {train_acc.avg:.4f}'.format(
                    epoch, batch_idx * len(inputs), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    batch_time=batch_time, loss=losses, train_acc=train_acc))
    return losses.avg

def validate(val_loader, model, criterion, use_cuda, epoch):
    batch_time = AverageMeter()
    losses = AverageMeter()
    val_acc = AverageMeter()

    n_correct, n_total = 0, 0

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (data) in enumerate(val_loader):
            inputs, targets = data
            current_sample = inputs.size(0)  # batch size

            if use_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            # compute output
            _, output = model(inputs)

            # measure accuracy and record loss
            n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item()
            n_total += current_sample
            val_acc_temp = 100. * n_correct / n_total
            val_acc.update(val_acc_temp, inputs.size(0))

            loss = criterion(output, targets)
            losses.update(loss.item(), inputs.size(0))
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        print(' * Validation: '
              'Loss {loss.avg:.4f}\t'
              'Acc {val_acc.avg:.4f}'.format(
                  loss=losses, val_acc=val_acc))

    return losses.avg

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()
    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def create_optimizer(optimizer, model, new_lr, wd):
    # setup optimizer
    if optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=new_lr,
                              momentum=0.9, dampening=0,
                              weight_decay=wd)
    elif optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=new_lr,
                               weight_decay=wd)
    elif optimizer == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=new_lr,
                                  weight_decay=wd)
    return optimizer

def visualize_the_losses(train_loss, valid_loss):
    epoch = []
    for i in range(1, 31):
        epoch.append(i)
    with open("file.txt", "w") as output:
        output.write(str(epoch))
        output.write('\n')
        output.write(str(train_loss))
        output.write('\n')
        output.write(str(valid_loss))
    # fig = plt.figure(figsize=(10,8))
    # plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss')
    # plt.plot(range(1,len(valid_loss)+1),valid_loss, label='Validation Loss')

    # find position of lowest validation loss
    # minposs = valid_loss.index(min(valid_loss))+1
    # plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint')

    # plt.xlabel('epochs')
    # plt.ylabel('loss')
    # plt.ylim(0, 3.5)  # consistent scale
    # plt.xlim(0, len(train_loss)+1)  # consistent scale
    # plt.grid(True)
    # plt.legend()
    # plt.tight_layout()
    # plt.show()
    # fig.savefig('loss_plot.png', bbox_inches='tight')

if __name__ == '__main__':
    main()
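The checkpoint dictionary saved in the epoch loop above stores 'epoch', 'state_dict' and 'optimizer', so an interrupted run can be resumed. A minimal sketch (it assumes model and optimizer have already been rebuilt exactly as in main()):

import torch

checkpoint = torch.load('new_model1/checkpoint_30.pth')
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
start = checkpoint['epoch']  # stored as epoch + 1, i.e. the next epoch to run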
Speaker_Recognition/verification1.py  0 → 100644

import torch
import torch.nn.functional as F
from torch.autograd import Variable

import pandas as pd
import math
import os
import configure as c

from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model1 import background_resnet

def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    # original saved file with DataParallel
    checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth')
    # create new OrderedDict that does not contain `module.`
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

def split_enroll_and_test(dataroot_dir):
    DB_all = read_feats_structure(dataroot_dir)
    enroll_DB = pd.DataFrame()
    test_DB = pd.DataFrame()

    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p')]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB

def load_enroll_embeddings(embedding_dir):
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk = f.replace('.pth', '')
        # Select the speakers who are in the 'enroll_spk_list'
        embedding_path = os.path.join(embedding_dir, f)
        tmp_embeddings = torch.load(embedding_path)
        embeddings[spk] = tmp_embeddings

    return embeddings

def get_embeddings(use_cuda, filename, model, test_frames):
    input, label = read_MFB(filename)  # input size:(n_frames, n_dims)

    tot_segments = math.ceil(len(input)/test_frames)  # total number of segments with 'test_frames'
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            temp_input = input[i*test_frames:i*test_frames+test_frames]

            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size:(1, 1, n_dims, n_frames)

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation

def l2_norm(input, alpha):
    input_size = input.size()  # size:(n_frames, dim)
    buffer = torch.pow(input, 2)  # 2 denotes a squared operation. size:(n_frames, dim)
    normp = torch.sum(buffer, 1).add_(1e-10)  # size:(n_frames)
    norm = torch.sqrt(normp)  # size:(n_frames)
    _output = torch.div(input, norm.view(-1, 1).expand_as(input))
    output = _output.view(input_size)
    # Multiply by alpha = 10 as suggested in https://arxiv.org/pdf/1703.09507.pdf
    output = output * alpha
    return output

def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    score = F.cosine_similarity(test_embedding, enroll_embedding)
    score = score.data.cpu().numpy()

    if score > thres:
        result = 'Accept'
    else:
        result = 'Reject'

    test_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))

def main():

    log_dir = 'new_model1'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings1'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    use_cuda = True  # Use cuda or not
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 29  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """

    # Set the true speaker
    enroll_speaker = 'zerothfloac'

    # Set the claimed speaker
    test_speaker = 'zerothfloac'

    # Threshold
    thres = 0.95

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)

if __name__ == '__main__':
    main()
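The accept/reject decision above hinges entirely on the fixed thres = 0.95. A hypothetical sketch of sweeping the threshold against held-out genuine and impostor cosine scores to pick an operating point (the score arrays are made-up placeholders, not measurements from this model):

import numpy as np

genuine_scores = np.array([0.97, 0.93, 0.99, 0.91])   # same-speaker trials (placeholder values)
impostor_scores = np.array([0.42, 0.77, 0.55, 0.61])  # different-speaker trials (placeholder values)

for thres in np.arange(0.50, 1.00, 0.05):
    far = np.mean(impostor_scores > thres)   # false acceptance rate
    frr = np.mean(genuine_scores <= thres)   # false rejection rate
    print('thres=%.2f  FAR=%.2f  FRR=%.2f' % (thres, far, frr))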