Showing 17 changed files with 2686 additions and 0 deletions
Speaker_Recognition @ df38711f
Subproject commit df38711f36cfb15ee578d14a70d0141d1d0a8134
Speaker_Recognition/identification3.py
0 → 100644
import torch
import torch.nn.functional as F

import pandas as pd
import math
import os
import configure as c

from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model3 import background_resnet

def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    # load the checkpoint saved during training
    checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

def split_enroll_and_test(dataroot_dir):
    DB_all = read_feats_structure(dataroot_dir)

    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p')]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB

def load_enroll_embeddings(embedding_dir):
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk = f.replace('.pth', '')
        # one saved embedding per speaker: '<speaker_id>.pth'
        embedding_path = os.path.join(embedding_dir, f)
        tmp_embeddings = torch.load(embedding_path)
        embeddings[spk] = tmp_embeddings

    return embeddings

def get_embeddings(use_cuda, filename, model, test_frames):
    input, label = read_MFB(filename)  # input size: (n_frames, n_dims)

    tot_segments = math.ceil(len(input) / test_frames)  # total number of segments of length 'test_frames'
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            temp_input = input[i * test_frames:i * test_frames + test_frames]

            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size: (1, 1, n_dims, n_frames)

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation

def l2_norm(input, alpha):
    input_size = input.size()  # size: (n_frames, dim)
    buffer = torch.pow(input, 2)  # element-wise square, size: (n_frames, dim)
    normp = torch.sum(buffer, 1).add_(1e-10)  # size: (n_frames)
    norm = torch.sqrt(normp)  # size: (n_frames)
    _output = torch.div(input, norm.view(-1, 1).expand_as(input))
    output = _output.view(input_size)
    # Scale by alpha; https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10
    # (the callers above pass alpha = 1, i.e. plain L2 normalization)
    output = output * alpha
    return output

def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = -10**8
    best_spk = None
    for spk in spk_list:
        score = F.cosine_similarity(test_embedding, embeddings[spk])
        score = score.data.cpu().numpy()
        if score > max_score:
            max_score = score
            best_spk = spk
    #print("Speaker identification result : %s" %best_spk)
    true_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk

def main():

    log_dir = 'new_model3'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings3'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    use_cuda = True  # Use CUDA or not
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 11  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance into segments of this many frames

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """

    spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
                '229M2031', '230M4087', '233F4013', '236M3043', '240M3063', '777M7777', '778M8777']

    # Set the test speaker
    test_speaker = '233F4013'

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)

if __name__ == '__main__':
    main()
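
The per-speaker .pth files that load_enroll_embeddings() reads are not created anywhere in this diff. A minimal enrollment sketch reusing load_model() and get_embeddings() from identification3.py above; the enroll.p feature paths, the speaker subset, and the enroll_embeddings3 layout are assumptions inferred from this script, not code from the PR:

import os
import torch
from identification3 import load_model, get_embeddings

def enroll(spk_list, use_cuda=True):
    # hypothetical helper: save one utterance-level embedding per speaker
    model = load_model(use_cuda, 'new_model3', 11, 128, 241)
    embedding_dir = 'enroll_embeddings3'
    os.makedirs(embedding_dir, exist_ok=True)
    for spk in spk_list:
        enroll_path = os.path.join('feat_logfbank_nfilt40/test/', spk, 'enroll.p')
        embedding = get_embeddings(use_cuda, enroll_path, model, test_frames=100)
        torch.save(embedding, os.path.join(embedding_dir, spk + '.pth'))

enroll(['103F3021', '207F2088'])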
Speaker_Recognition/identification4.py
0 → 100644
import torch
import torch.nn.functional as F

import pandas as pd
import math
import os
import configure as c

from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model4 import background_resnet

def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    # load the checkpoint saved during training
    checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

def split_enroll_and_test(dataroot_dir):
    DB_all = read_feats_structure(dataroot_dir)

    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p')]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB

def load_enroll_embeddings(embedding_dir):
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk = f.replace('.pth', '')
        # one saved embedding per speaker: '<speaker_id>.pth'
        embedding_path = os.path.join(embedding_dir, f)
        tmp_embeddings = torch.load(embedding_path)
        embeddings[spk] = tmp_embeddings

    return embeddings

def get_embeddings(use_cuda, filename, model, test_frames):
    input, label = read_MFB(filename)  # input size: (n_frames, n_dims)

    tot_segments = math.ceil(len(input) / test_frames)  # total number of segments of length 'test_frames'
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            temp_input = input[i * test_frames:i * test_frames + test_frames]

            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size: (1, 1, n_dims, n_frames)

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation

def l2_norm(input, alpha):
    input_size = input.size()  # size: (n_frames, dim)
    buffer = torch.pow(input, 2)  # element-wise square, size: (n_frames, dim)
    normp = torch.sum(buffer, 1).add_(1e-10)  # size: (n_frames)
    norm = torch.sqrt(normp)  # size: (n_frames)
    _output = torch.div(input, norm.view(-1, 1).expand_as(input))
    output = _output.view(input_size)
    # Scale by alpha; https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10
    # (the callers above pass alpha = 1, i.e. plain L2 normalization)
    output = output * alpha
    return output

def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = -10**8
    best_spk = None
    for spk in spk_list:
        score = F.cosine_similarity(test_embedding, embeddings[spk])
        score = score.data.cpu().numpy()
        if score > max_score:
            max_score = score
            best_spk = spk
    #print("Speaker identification result : %s" %best_spk)
    true_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk

def main():

    log_dir = 'new_model4'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings4'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    use_cuda = True  # Use CUDA or not
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 25  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance into segments of this many frames

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """

    spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
                '229M2031', '230M4087', '233F4013', '236M3043', '240M3063', '777M7777', '778M8777']

    # Set the test speaker
    test_speaker = '207F2088'

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)

if __name__ == '__main__':
    main()
Speaker_Recognition/identification5.py
0 → 100644
import torch
import torch.nn.functional as F

import pandas as pd
import math
import os
import configure as c

from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model5 import background_resnet

def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    # load the checkpoint saved during training
    checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

def split_enroll_and_test(dataroot_dir):
    DB_all = read_feats_structure(dataroot_dir)

    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p')]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB

def load_enroll_embeddings(embedding_dir):
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk = f.replace('.pth', '')
        # one saved embedding per speaker: '<speaker_id>.pth'
        embedding_path = os.path.join(embedding_dir, f)
        tmp_embeddings = torch.load(embedding_path)
        embeddings[spk] = tmp_embeddings

    return embeddings

def get_embeddings(use_cuda, filename, model, test_frames):
    input, label = read_MFB(filename)  # input size: (n_frames, n_dims)

    tot_segments = math.ceil(len(input) / test_frames)  # total number of segments of length 'test_frames'
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            temp_input = input[i * test_frames:i * test_frames + test_frames]

            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size: (1, 1, n_dims, n_frames)

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation

def l2_norm(input, alpha):
    input_size = input.size()  # size: (n_frames, dim)
    buffer = torch.pow(input, 2)  # element-wise square, size: (n_frames, dim)
    normp = torch.sum(buffer, 1).add_(1e-10)  # size: (n_frames)
    norm = torch.sqrt(normp)  # size: (n_frames)
    _output = torch.div(input, norm.view(-1, 1).expand_as(input))
    output = _output.view(input_size)
    # Scale by alpha; https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10
    # (the callers above pass alpha = 1, i.e. plain L2 normalization)
    output = output * alpha
    return output

def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = -10**8
    best_spk = None
    for spk in spk_list:
        score = F.cosine_similarity(test_embedding, embeddings[spk])
        score = score.data.cpu().numpy()
        if score > max_score:
            max_score = score
            best_spk = spk
    #print("Speaker identification result : %s" %best_spk)
    true_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk

def main():

    log_dir = 'new_model5'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings5'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    use_cuda = True  # Use CUDA or not
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 30  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance into segments of this many frames

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """

    spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
                '229M2031', '230M4087', '233F4013', '236M3043', '240M3063', '777M7777', '778M8777']

    # Set the test speaker
    test_speaker = '207F2088'

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)

if __name__ == '__main__':
    main()
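
All three identification scripts return the closest enrolled speaker no matter how poor the match, even though spk_list above includes the extra entries '777M7777' and '778M8777'. A hedged open-set variant of perform_identification() that can reject unknown voices; it reuses get_embeddings() and F from the script above, and the 0.6 threshold is an illustrative assumption rather than a tuned value:

def perform_identification_with_rejection(use_cuda, model, embeddings, test_filename,
                                          test_frames, spk_list, threshold=0.6):
    # same cosine scoring loop as perform_identification(), plus a rejection rule
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    best_spk, max_score = None, -float('inf')
    for spk in spk_list:
        score = F.cosine_similarity(test_embedding, embeddings[spk]).item()
        if score > max_score:
            max_score, best_spk = score, spk
    # below the threshold, report the utterance as an unseen speaker
    return best_spk if max_score >= threshold else 'unknown'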
Speaker_Recognition/model/model3.py
0 → 100644
import torch
import torch.nn as nn
import torch.nn.functional as F
import model.resnet1 as resnet


class background_resnet(nn.Module):
    def __init__(self, embedding_size, num_classes, backbone='resnet18'):
        super(background_resnet, self).__init__()
        self.backbone = backbone
        # copying modules from pretrained models
        if backbone == 'resnet50':
            self.pretrained = resnet.resnet50(pretrained=False)
        elif backbone == 'resnet101':
            self.pretrained = resnet.resnet101(pretrained=False)
        elif backbone == 'resnet152':
            self.pretrained = resnet.resnet152(pretrained=False)
        elif backbone == 'resnet18':
            self.pretrained = resnet.resnet18(pretrained=False)
        elif backbone == 'resnet34':
            self.pretrained = resnet.resnet34(pretrained=False)
        else:
            raise RuntimeError('unknown backbone: {}'.format(backbone))

        self.fc0 = nn.Linear(256, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        # input x: minibatch x 1 x 40 x 40
        x = self.pretrained.conv1(x)
        x = self.pretrained.bn1(x)
        x = self.pretrained.relu(x)
        x = self.pretrained.layer1(x)
        x = self.pretrained.layer2(x)
        x = self.pretrained.layer3(x)
        x = self.pretrained.layer4(x)
        x = self.pretrained.layer5(x)

        out = F.adaptive_avg_pool2d(x, 1)  # [batch, 256, 1, 1]
        out = torch.squeeze(out)  # [batch, 256]
        # flatten so that the fully connected layer can be connected from here
        out = out.view(x.size(0), -1)  # (n_batch, 256)
        spk_embedding = self.fc0(out)
        out = F.relu(self.bn0(spk_embedding))  # [batch, n_embed]
        out = self.last(out)

        return spk_embedding, out
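
A quick shape smoke test for this head. It assumes the custom model.resnet1 backbone that ships with this repo, whose resnet18 apparently ends in a layer5 producing 256 channels (fc0 is Linear(256, embedding_size), so the pooled feature map must be 256-wide):

import torch
from model.model3 import background_resnet

model = background_resnet(embedding_size=128, num_classes=241)
model.eval()
with torch.no_grad():
    x = torch.randn(2, 1, 40, 100)  # (batch, 1, n_filterbanks, n_frames)
    spk_embedding, logits = model(x)
print(spk_embedding.shape, logits.shape)  # expected: torch.Size([2, 128]) torch.Size([2, 241])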
Speaker_Recognition/model/model4.py
0 → 100644
import torch
import torch.nn as nn
import torch.nn.functional as F
import model.resnet1 as resnet


class background_resnet(nn.Module):
    def __init__(self, embedding_size, num_classes, backbone='resnet34'):
        super(background_resnet, self).__init__()
        self.backbone = backbone
        # copying modules from pretrained models
        if backbone == 'resnet50':
            self.pretrained = resnet.resnet50(pretrained=False)
        elif backbone == 'resnet101':
            self.pretrained = resnet.resnet101(pretrained=False)
        elif backbone == 'resnet152':
            self.pretrained = resnet.resnet152(pretrained=False)
        elif backbone == 'resnet18':
            self.pretrained = resnet.resnet18(pretrained=False)
        elif backbone == 'resnet34':
            self.pretrained = resnet.resnet34(pretrained=False)
        else:
            raise RuntimeError('unknown backbone: {}'.format(backbone))

        self.fc0 = nn.Linear(256, embedding_size)  # change to 512 etc. to use a standard resnet output width
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        # input x: minibatch x 1 x 40 x 40
        x = self.pretrained.conv1(x)
        x = self.pretrained.bn1(x)
        x = self.pretrained.relu(x)
        x = self.pretrained.layer1(x)
        x = self.pretrained.layer2(x)
        x = self.pretrained.layer3(x)
        x = self.pretrained.layer4(x)
        x = self.pretrained.layer5(x)

        out = F.adaptive_avg_pool2d(x, 1)  # [batch, 256, 1, 1]
        out = torch.squeeze(out)  # [batch, 256]
        # flatten so that the fully connected layer can be connected from here
        out = out.view(x.size(0), -1)  # (n_batch, 256)
        spk_embedding = self.fc0(out)
        out = F.relu(self.bn0(spk_embedding))  # [batch, n_embed]
        out = self.last(out)

        return spk_embedding, out
Speaker_Recognition/model/model5.py
0 → 100644
import torch
import torch.nn as nn
import torch.nn.functional as F
import model.resnet1 as resnet


class background_resnet(nn.Module):
    def __init__(self, embedding_size, num_classes, backbone='resnet50'):
        super(background_resnet, self).__init__()
        self.backbone = backbone
        # copying modules from pretrained models
        if backbone == 'resnet50':
            self.pretrained = resnet.resnet50(pretrained=False)
        elif backbone == 'resnet101':
            self.pretrained = resnet.resnet101(pretrained=False)
        elif backbone == 'resnet152':
            self.pretrained = resnet.resnet152(pretrained=False)
        elif backbone == 'resnet18':
            self.pretrained = resnet.resnet18(pretrained=False)
        elif backbone == 'resnet34':
            self.pretrained = resnet.resnet34(pretrained=False)
        else:
            raise RuntimeError('unknown backbone: {}'.format(backbone))

        self.fc0 = nn.Linear(512, embedding_size)  # change to 512 etc. to use a standard resnet output width
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        # input x: minibatch x 1 x 40 x 40
        x = self.pretrained.conv1(x)
        x = self.pretrained.bn1(x)
        x = self.pretrained.relu(x)
        x = self.pretrained.layer1(x)
        x = self.pretrained.layer2(x)
        x = self.pretrained.layer3(x)
        x = self.pretrained.layer4(x)

        out = F.adaptive_avg_pool2d(x, 1)  # [batch, 512, 1, 1]
        out = torch.squeeze(out)  # [batch, 512]
        # flatten so that the fully connected layer can be connected from here
        out = out.view(x.size(0), -1)  # (n_batch, 512)
        spk_embedding = self.fc0(out)
        out = F.relu(self.bn0(spk_embedding))  # [batch, n_embed]
        out = self.last(out)

        return spk_embedding, out
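
The only structural differences among model3, model4, and model5 are the backbone choice and the width of fc0: model5 stops at layer4 and projects from 512 channels, while model3/model4 run an extra layer5 and project from 256. In every case fc0's in_features must equal the channel count of the pooled feature map; a self-contained illustration of that constraint:

import torch
import torch.nn as nn
import torch.nn.functional as F

feat = torch.randn(4, 512, 5, 13)        # dummy backbone output: (batch, C, H, W)
pooled = F.adaptive_avg_pool2d(feat, 1)  # (4, 512, 1, 1)
flat = pooled.view(feat.size(0), -1)     # (4, 512)
fc0 = nn.Linear(512, 128)                # in_features must equal C, else a runtime size mismatch
print(fc0(flat).shape)                   # torch.Size([4, 128])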
Speaker_Recognition/train3.py
0 → 100644
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

import time
import os
import configure as c
import pandas as pd
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
from model.model3 import background_resnet
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt

def load_dataset(val_ratio):
    # Load the training data and split it into a training set and a
    # validation set according to "val_ratio"
    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)

    file_loader = read_MFB  # numpy array: (n_frames, n_dims)

    transform = transforms.Compose([
        TruncatedInputfromMFB(),  # numpy array: (1, n_frames, n_dims)
        ToTensorInput()  # torch tensor: (1, n_dims, n_frames)
    ])
    transform_T = ToTensorDevInput()

    speaker_list = sorted(set(train_DB['speaker_id']))  # len(speaker_list) == n_speakers
    spk_to_idx = {spk: i for i, spk in enumerate(speaker_list)}

    train_dataset = DvectorDataset(DB=train_DB, loader=file_loader, transform=transform, spk_to_idx=spk_to_idx)
    valid_dataset = DvectorDataset(DB=valid_DB, loader=file_loader, transform=transform_T, spk_to_idx=spk_to_idx)

    n_classes = len(speaker_list)  # How many speakers? 240
    return train_dataset, valid_dataset, n_classes

def split_train_dev(train_feat_dir, valid_ratio):
    train_valid_DB = read_feats_structure(train_feat_dir)
    total_len = len(train_valid_DB)  # 148642
    valid_len = int(total_len * valid_ratio / 100.)
    train_len = total_len - valid_len
    shuffled_train_valid_DB = train_valid_DB.sample(frac=1).reset_index(drop=True)
    # Split the DB into train and valid set
    train_DB = shuffled_train_valid_DB.iloc[:train_len]
    valid_DB = shuffled_train_valid_DB.iloc[train_len:]
    # Reset the index
    train_DB = train_DB.reset_index(drop=True)
    valid_DB = valid_DB.reset_index(drop=True)
    print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
    print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
    print('Total %d utts' %(total_len))

    return train_DB, valid_DB

def main():
    # Set hyperparameters
    use_cuda = True  # use GPU or CPU
    val_ratio = 10  # Percentage of validation set
    embedding_size = 128
    start = 1  # Start epoch
    n_epochs = 30  # How many epochs?
    end = start + n_epochs  # Last epoch

    lr = 1e-1  # Initial learning rate
    wd = 1e-4  # Weight decay (L2 penalty)
    optimizer_type = 'sgd'  # e.g. sgd, adam, adagrad

    batch_size = 64  # Batch size for training
    valid_batch_size = 16  # Batch size for validation
    use_shuffle = True  # Shuffle for training or not

    # Load dataset
    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)

    # print the experiment configuration
    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))

    log_dir = 'new_model3'  # where to save checkpoints

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # instantiate model and initialize weights
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)

    if use_cuda:
        model.cuda()

    # define loss function (criterion), optimizer and scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = create_optimizer(optimizer_type, model, lr, wd)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=use_shuffle)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                               batch_size=valid_batch_size,
                                               shuffle=False,
                                               collate_fn=collate_fn_feat_padded)

    # to track the average training loss per epoch as the model trains
    avg_train_losses = []
    # to track the average validation loss per epoch as the model trains
    avg_valid_losses = []

    for epoch in range(start, end):

        # train for one epoch
        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)

        # evaluate on validation set
        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)

        scheduler.step(valid_loss)

        # track the average loss of this epoch
        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)
        # do checkpointing
        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()},
                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))

    # find position of lowest validation loss
    minposs = avg_valid_losses.index(min(avg_valid_losses)) + 1
    print('Lowest validation loss at epoch %d' % minposs)

    # visualize the loss as the network trained
    visualize_the_losses(avg_train_losses, avg_valid_losses)


def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
    batch_time = AverageMeter()
    losses = AverageMeter()
    train_acc = AverageMeter()

    n_correct, n_total = 0, 0
    log_interval = 84
    # switch to train mode
    model.train()

    end = time.time()
    for batch_idx, data in enumerate(train_loader):
        inputs, targets = data  # target size: (batch size, 1), input size: (batch size, 1, dim, win)
        targets = targets.view(-1)  # target size: (batch size)
        current_sample = inputs.size(0)  # batch size

        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        _, output = model(inputs)  # output size: (batch size, n_classes), for softmax

        # calculate accuracy of predictions in the current batch
        n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item()
        n_total += current_sample
        train_acc_temp = 100. * n_correct / n_total
        train_acc.update(train_acc_temp, inputs.size(0))

        loss = criterion(output, targets)
        losses.update(loss.item(), inputs.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % log_interval == 0:
            print(
                'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Loss {loss.avg:.4f}\t'
                'Acc {train_acc.avg:.4f}'.format(
                    epoch, batch_idx * len(inputs), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    batch_time=batch_time, loss=losses, train_acc=train_acc))
    return losses.avg

def validate(val_loader, model, criterion, use_cuda, epoch):
    batch_time = AverageMeter()
    losses = AverageMeter()
    val_acc = AverageMeter()

    n_correct, n_total = 0, 0

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, data in enumerate(val_loader):
            inputs, targets = data
            current_sample = inputs.size(0)  # batch size

            if use_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            # compute output
            _, output = model(inputs)

            # measure accuracy and record loss
            n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item()
            n_total += current_sample
            val_acc_temp = 100. * n_correct / n_total
            val_acc.update(val_acc_temp, inputs.size(0))

            loss = criterion(output, targets)
            losses.update(loss.item(), inputs.size(0))
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        print(' * Validation: '
              'Loss {loss.avg:.4f}\t'
              'Acc {val_acc.avg:.4f}'.format(
                  loss=losses, val_acc=val_acc))

    return losses.avg

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()
    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def create_optimizer(optimizer, model, new_lr, wd):
    # setup optimizer
    if optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=new_lr,
                              momentum=0.9, dampening=0,
                              weight_decay=wd)
    elif optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=new_lr,
                               weight_decay=wd)
    elif optimizer == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=new_lr,
                                  weight_decay=wd)
    else:
        raise ValueError('unknown optimizer: {}'.format(optimizer))
    return optimizer

def visualize_the_losses(train_loss, valid_loss):
    fig = plt.figure(figsize=(10, 8))
    plt.plot(range(1, len(train_loss) + 1), train_loss, label='Training Loss')
    plt.plot(range(1, len(valid_loss) + 1), valid_loss, label='Validation Loss')

    # find position of lowest validation loss
    minposs = valid_loss.index(min(valid_loss)) + 1
    plt.axvline(minposs, linestyle='--', color='r', label='Early Stopping Checkpoint')

    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.ylim(0, 3.5)  # consistent scale
    plt.xlim(0, len(train_loss) + 1)  # consistent scale
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    #plt.show()
    fig.savefig('train3.png', bbox_inches='tight')

if __name__ == '__main__':
    main()
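
The loss plot marks the minimum-validation-loss epoch as an 'Early Stopping Checkpoint', but the loop in main() always runs all n_epochs. A hedged sketch of actually stopping early, written as a drop-in replacement for that epoch loop; the patience value is an illustrative assumption:

patience = 5  # hypothetical; tune for your data
best_valid_loss, epochs_no_improve = float('inf'), 0
for epoch in range(start, end):
    train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)
    valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)
    scheduler.step(valid_loss)
    avg_train_losses.append(train_loss)
    avg_valid_losses.append(valid_loss)
    torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()},
               '{}/checkpoint_{}.pth'.format(log_dir, epoch))
    # stop once validation loss has not improved for `patience` consecutive epochs
    if valid_loss < best_valid_loss:
        best_valid_loss, epochs_no_improve = valid_loss, 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print('Early stopping at epoch %d' % epoch)
            break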
Speaker_Recognition/train4.py
0 → 100644
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

import time
import os
import configure as c
import pandas as pd
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
from model.model4 import background_resnet
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt

def load_dataset(val_ratio):
    # Load the training data and split it into a training set and a
    # validation set according to "val_ratio"
    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)

    file_loader = read_MFB  # numpy array: (n_frames, n_dims)

    transform = transforms.Compose([
        TruncatedInputfromMFB(),  # numpy array: (1, n_frames, n_dims)
        ToTensorInput()  # torch tensor: (1, n_dims, n_frames)
    ])
    transform_T = ToTensorDevInput()

    speaker_list = sorted(set(train_DB['speaker_id']))  # len(speaker_list) == n_speakers
    spk_to_idx = {spk: i for i, spk in enumerate(speaker_list)}

    train_dataset = DvectorDataset(DB=train_DB, loader=file_loader, transform=transform, spk_to_idx=spk_to_idx)
    valid_dataset = DvectorDataset(DB=valid_DB, loader=file_loader, transform=transform_T, spk_to_idx=spk_to_idx)

    n_classes = len(speaker_list)  # How many speakers? 240
    return train_dataset, valid_dataset, n_classes

def split_train_dev(train_feat_dir, valid_ratio):
    train_valid_DB = read_feats_structure(train_feat_dir)
    total_len = len(train_valid_DB)  # 148642
    valid_len = int(total_len * valid_ratio / 100.)
    train_len = total_len - valid_len
    shuffled_train_valid_DB = train_valid_DB.sample(frac=1).reset_index(drop=True)
    # Split the DB into train and valid set
    train_DB = shuffled_train_valid_DB.iloc[:train_len]
    valid_DB = shuffled_train_valid_DB.iloc[train_len:]
    # Reset the index
    train_DB = train_DB.reset_index(drop=True)
    valid_DB = valid_DB.reset_index(drop=True)
    print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
    print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
    print('Total %d utts' %(total_len))

    return train_DB, valid_DB

def main():
    # Set hyperparameters
    use_cuda = True  # use GPU or CPU
    val_ratio = 10  # Percentage of validation set
    embedding_size = 128
    start = 1  # Start epoch
    n_epochs = 30  # How many epochs?
    end = start + n_epochs  # Last epoch

    lr = 1e-1  # Initial learning rate
    wd = 1e-4  # Weight decay (L2 penalty)
    optimizer_type = 'sgd'  # e.g. sgd, adam, adagrad

    batch_size = 64  # Batch size for training
    valid_batch_size = 16  # Batch size for validation
    use_shuffle = True  # Shuffle for training or not

    # Load dataset
    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)

    # print the experiment configuration
    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))

    log_dir = 'new_model4'  # where to save checkpoints

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # instantiate model and initialize weights
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)

    if use_cuda:
        model.cuda()

    # define loss function (criterion), optimizer and scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = create_optimizer(optimizer_type, model, lr, wd)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=use_shuffle)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                               batch_size=valid_batch_size,
                                               shuffle=False,
                                               collate_fn=collate_fn_feat_padded)

    # to track the average training loss per epoch as the model trains
    avg_train_losses = []
    # to track the average validation loss per epoch as the model trains
    avg_valid_losses = []

    for epoch in range(start, end):

        # train for one epoch
        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)

        # evaluate on validation set
        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)

        scheduler.step(valid_loss)

        # track the average loss of this epoch
        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)
        # do checkpointing
        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()},
                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))

    # find position of lowest validation loss
    minposs = avg_valid_losses.index(min(avg_valid_losses)) + 1
    print('Lowest validation loss at epoch %d' % minposs)

    # visualize the loss as the network trained
    visualize_the_losses(avg_train_losses, avg_valid_losses)


def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
    batch_time = AverageMeter()
    losses = AverageMeter()
    train_acc = AverageMeter()

    n_correct, n_total = 0, 0
    log_interval = 84
    # switch to train mode
    model.train()

    end = time.time()
    for batch_idx, data in enumerate(train_loader):
        inputs, targets = data  # target size: (batch size, 1), input size: (batch size, 1, dim, win)
        targets = targets.view(-1)  # target size: (batch size)
        current_sample = inputs.size(0)  # batch size

        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        _, output = model(inputs)  # output size: (batch size, n_classes), for softmax

        # calculate accuracy of predictions in the current batch
        n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item()
        n_total += current_sample
        train_acc_temp = 100. * n_correct / n_total
        train_acc.update(train_acc_temp, inputs.size(0))

        loss = criterion(output, targets)
        losses.update(loss.item(), inputs.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % log_interval == 0:
            print(
                'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Loss {loss.avg:.4f}\t'
                'Acc {train_acc.avg:.4f}'.format(
                    epoch, batch_idx * len(inputs), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    batch_time=batch_time, loss=losses, train_acc=train_acc))
    return losses.avg

def validate(val_loader, model, criterion, use_cuda, epoch):
    batch_time = AverageMeter()
    losses = AverageMeter()
    val_acc = AverageMeter()

    n_correct, n_total = 0, 0

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, data in enumerate(val_loader):
            inputs, targets = data
            current_sample = inputs.size(0)  # batch size

            if use_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            # compute output
            _, output = model(inputs)

            # measure accuracy and record loss
            n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item()
            n_total += current_sample
            val_acc_temp = 100. * n_correct / n_total
            val_acc.update(val_acc_temp, inputs.size(0))

            loss = criterion(output, targets)
            losses.update(loss.item(), inputs.size(0))
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        print(' * Validation: '
              'Loss {loss.avg:.4f}\t'
              'Acc {val_acc.avg:.4f}'.format(
                  loss=losses, val_acc=val_acc))

    return losses.avg

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()
    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def create_optimizer(optimizer, model, new_lr, wd):
    # setup optimizer
    if optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=new_lr,
                              momentum=0.9, dampening=0,
                              weight_decay=wd)
    elif optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=new_lr,
                               weight_decay=wd)
    elif optimizer == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=new_lr,
                                  weight_decay=wd)
    else:
        raise ValueError('unknown optimizer: {}'.format(optimizer))
    return optimizer

def visualize_the_losses(train_loss, valid_loss):
    fig = plt.figure(figsize=(10, 8))
    plt.plot(range(1, len(train_loss) + 1), train_loss, label='Training Loss')
    plt.plot(range(1, len(valid_loss) + 1), valid_loss, label='Validation Loss')

    # find position of lowest validation loss
    minposs = valid_loss.index(min(valid_loss)) + 1
    plt.axvline(minposs, linestyle='--', color='r', label='Early Stopping Checkpoint')

    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.ylim(0, 3.5)  # consistent scale
    plt.xlim(0, len(train_loss) + 1)  # consistent scale
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    #plt.show()
    fig.savefig('train4.png', bbox_inches='tight')

if __name__ == '__main__':
    main()
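
For reference, how the AverageMeter defined in these training scripts weights per-batch statistics by sample count (the numbers are hypothetical):

m = AverageMeter()
m.update(0.50, n=64)  # e.g. mean loss 0.50 over a batch of 64 samples
m.update(0.30, n=32)  # e.g. mean loss 0.30 over a final batch of 32 samples
print(m.val)          # 0.30, the most recent value
print(m.avg)          # (0.50*64 + 0.30*32) / 96 = 0.4333..., the sample-weighted average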
Speaker_Recognition/train4_merge.py
0 → 100644
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

import time
import os
import configure1_merge as c
import pandas as pd
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
from model.model4 import background_resnet
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt

def load_dataset(val_ratio):
    # Load the training data and split it into a training set and a
    # validation set according to "val_ratio"
    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)

    file_loader = read_MFB  # numpy array: (n_frames, n_dims)

    transform = transforms.Compose([
        TruncatedInputfromMFB(),  # numpy array: (1, n_frames, n_dims)
        ToTensorInput()  # torch tensor: (1, n_dims, n_frames)
    ])
    transform_T = ToTensorDevInput()

    speaker_list = sorted(set(train_DB['speaker_id']))  # len(speaker_list) == n_speakers
    spk_to_idx = {spk: i for i, spk in enumerate(speaker_list)}

    train_dataset = DvectorDataset(DB=train_DB, loader=file_loader, transform=transform, spk_to_idx=spk_to_idx)
    valid_dataset = DvectorDataset(DB=valid_DB, loader=file_loader, transform=transform_T, spk_to_idx=spk_to_idx)

    n_classes = len(speaker_list)  # How many speakers? 240
    return train_dataset, valid_dataset, n_classes

def split_train_dev(train_feat_dir, valid_ratio):
    train_valid_DB = read_feats_structure(train_feat_dir)
    total_len = len(train_valid_DB)  # 148642
    valid_len = int(total_len * valid_ratio / 100.)
    train_len = total_len - valid_len
    shuffled_train_valid_DB = train_valid_DB.sample(frac=1).reset_index(drop=True)
    # Split the DB into train and valid set
    train_DB = shuffled_train_valid_DB.iloc[:train_len]
    valid_DB = shuffled_train_valid_DB.iloc[train_len:]
    # Reset the index
    train_DB = train_DB.reset_index(drop=True)
    valid_DB = valid_DB.reset_index(drop=True)
    print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
    print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
    print('Total %d utts' %(total_len))

    return train_DB, valid_DB

def main():
    # Set hyperparameters
    use_cuda = True  # use GPU or CPU
    val_ratio = 10  # Percentage of validation set
    embedding_size = 128
    start = 1  # Start epoch
    n_epochs = 60  # How many epochs?
    end = start + n_epochs  # Last epoch

    lr = 1e-1  # Initial learning rate
    wd = 1e-4  # Weight decay (L2 penalty)
    optimizer_type = 'sgd'  # e.g. sgd, adam, adagrad

    batch_size = 64  # Batch size for training
    valid_batch_size = 16  # Batch size for validation
    use_shuffle = True  # Shuffle for training or not

    # Load dataset
    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)

    # print the experiment configuration
    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))

    log_dir = 'new_model4_merge'  # where to save checkpoints

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # instantiate model and initialize weights
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)

    if use_cuda:
        model.cuda()

    # define loss function (criterion), optimizer and scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = create_optimizer(optimizer_type, model, lr, wd)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=use_shuffle)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                               batch_size=valid_batch_size,
                                               shuffle=False,
                                               collate_fn=collate_fn_feat_padded)

    # to track the average training loss per epoch as the model trains
    avg_train_losses = []
    # to track the average validation loss per epoch as the model trains
    avg_valid_losses = []

    for epoch in range(start, end):

        # train for one epoch
        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)

        # evaluate on validation set
        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)

        scheduler.step(valid_loss)

        # track the average loss of this epoch
        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)
        # do checkpointing
        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()},
                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))

    # find position of lowest validation loss
    minposs = avg_valid_losses.index(min(avg_valid_losses)) + 1
    print('Lowest validation loss at epoch %d' % minposs)

    # visualize the loss as the network trained
    visualize_the_losses(avg_train_losses, avg_valid_losses)


def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
    batch_time = AverageMeter()
    losses = AverageMeter()
    train_acc = AverageMeter()

    n_correct, n_total = 0, 0
    log_interval = 84
    # switch to train mode
    model.train()

    end = time.time()
    for batch_idx, data in enumerate(train_loader):
        inputs, targets = data  # target size: (batch size, 1), input size: (batch size, 1, dim, win)
        targets = targets.view(-1)  # target size: (batch size)
        current_sample = inputs.size(0)  # batch size

        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        _, output = model(inputs)  # output size: (batch size, n_classes), for softmax

        # calculate accuracy of predictions in the current batch
        n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item()
        n_total += current_sample
        train_acc_temp = 100. * n_correct / n_total
        train_acc.update(train_acc_temp, inputs.size(0))

        loss = criterion(output, targets)
        losses.update(loss.item(), inputs.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % log_interval == 0:
            print(
                'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Loss {loss.avg:.4f}\t'
                'Acc {train_acc.avg:.4f}'.format(
                    epoch, batch_idx * len(inputs), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    batch_time=batch_time, loss=losses, train_acc=train_acc))
    return losses.avg

def validate(val_loader, model, criterion, use_cuda, epoch):
    batch_time = AverageMeter()
    losses = AverageMeter()
    val_acc = AverageMeter()

    n_correct, n_total = 0, 0

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, data in enumerate(val_loader):
            inputs, targets = data
            current_sample = inputs.size(0)  # batch size

            if use_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            # compute output
            _, output = model(inputs)

            # measure accuracy and record loss
            n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item()
            n_total += current_sample
            val_acc_temp = 100. * n_correct / n_total
            val_acc.update(val_acc_temp, inputs.size(0))

            loss = criterion(output, targets)
            losses.update(loss.item(), inputs.size(0))
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        print(' * Validation: '
              'Loss {loss.avg:.4f}\t'
              'Acc {val_acc.avg:.4f}'.format(
                  loss=losses, val_acc=val_acc))

    return losses.avg

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()
    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
    def update(self, val, n=1):
244 | + self.val = val | ||
245 | + self.sum += val * n | ||
246 | + self.count += n | ||
247 | + self.avg = self.sum / self.count | ||
248 | + | ||
249 | +def create_optimizer(optimizer, model, new_lr, wd): | ||
250 | + # setup optimizer | ||
251 | + if optimizer == 'sgd': | ||
252 | + optimizer = optim.SGD(model.parameters(), lr=new_lr, | ||
253 | + momentum=0.9, dampening=0, | ||
254 | + weight_decay=wd) | ||
255 | + elif optimizer == 'adam': | ||
256 | + optimizer = optim.Adam(model.parameters(), lr=new_lr, | ||
257 | + weight_decay=wd) | ||
258 | + elif optimizer == 'adagrad': | ||
259 | + optimizer = optim.Adagrad(model.parameters(), | ||
260 | + lr=new_lr, | ||
261 | + weight_decay=wd) | ||
262 | + return optimizer | ||
263 | + | ||
264 | +def visualize_the_losses(train_loss, valid_loss): | ||
265 | + fig = plt.figure(figsize=(10,8)) | ||
266 | + plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss') | ||
267 | + plt.plot(range(1,len(valid_loss)+1),valid_loss, label='Validation Loss') | ||
268 | + | ||
269 | + # find position of lowest validation loss | ||
270 | + minposs = valid_loss.index(min(valid_loss))+1 | ||
271 | + plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint') | ||
272 | + | ||
273 | + plt.xlabel('epochs') | ||
274 | + plt.ylabel('loss') | ||
275 | + plt.ylim(0, 3.5) # consistent scale | ||
276 | + plt.xlim(0, len(train_loss)+1) # consistent scale | ||
277 | + plt.grid(True) | ||
278 | + plt.legend() | ||
279 | + plt.tight_layout() | ||
280 | + #plt.show() | ||
281 | + fig.savefig('train4_merge.png', bbox_inches='tight') | ||
282 | + | ||
283 | +if __name__ == '__main__': | ||
284 | + main() |
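A note on the checkpointing scheme above: one checkpoint is written per epoch, and only the epoch with the lowest validation loss is reported. Below is a minimal sketch of reloading that best checkpoint afterwards, assuming the same '{log_dir}/checkpoint_{epoch}.pth' naming and dictionary keys used in the training loop; load_best_checkpoint is a hypothetical helper, not part of this repository.

import torch

def load_best_checkpoint(model, log_dir, avg_valid_losses):
    # Epochs start at 1 in the loop above, so list index i maps to epoch i + 1.
    best_epoch = avg_valid_losses.index(min(avg_valid_losses)) + 1
    path = '{}/checkpoint_{}.pth'.format(log_dir, best_epoch)
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    return model, best_epoch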
Speaker_Recognition/train4_zeroth.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn as nn | ||
3 | +import torch.optim as optim | ||
4 | +import torchvision.transforms as transforms | ||
5 | + | ||
6 | +import time | ||
7 | +import os | ||
8 | +import numpy as np | ||
9 | +import configure1_zeroth as c | ||
10 | +import pandas as pd | ||
11 | +from DB_wav_reader import read_feats_structure | ||
12 | +from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded | ||
13 | +from model.model4 import background_resnet | ||
14 | + | ||
15 | +import matplotlib as mpl | ||
16 | +mpl.use('Agg') | ||
17 | +import matplotlib.pyplot as plt | ||
18 | + | ||
19 | + | ||
20 | +def load_dataset(val_ratio): | ||
21 | + # Load training set and validation set | ||
22 | + | ||
23 | + | ||
24 | + # Split training set into training set and validation set according to "val_ratio" | ||
25 | + train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio) | ||
26 | + | ||
27 | + file_loader = read_MFB # numpy array:(n_frames, n_dims) | ||
28 | + | ||
29 | + transform = transforms.Compose([ | ||
30 | + TruncatedInputfromMFB(), # numpy array:(1, n_frames, n_dims) | ||
31 | + ToTensorInput() # torch tensor:(1, n_dims, n_frames) | ||
32 | + ]) | ||
33 | + transform_T = ToTensorDevInput() | ||
34 | + | ||
35 | + | ||
36 | + speaker_list = sorted(set(train_DB['speaker_id'])) # len(speaker_list) == n_speakers | ||
37 | + spk_to_idx = {spk: i for i, spk in enumerate(speaker_list)} | ||
38 | + | ||
39 | + train_dataset = DvectorDataset(DB=train_DB, loader=file_loader, transform=transform, spk_to_idx=spk_to_idx) | ||
40 | + valid_dataset = DvectorDataset(DB=valid_DB, loader=file_loader, transform=transform_T, spk_to_idx=spk_to_idx) | ||
41 | + | ||
42 | + n_classes = len(speaker_list) # number of distinct speakers in the training split | ||
43 | + return train_dataset, valid_dataset, n_classes | ||
44 | + | ||
45 | +def split_train_dev(train_feat_dir, valid_ratio): | ||
46 | + train_valid_DB = read_feats_structure(train_feat_dir) | ||
47 | + total_len = len(train_valid_DB) # total number of utterances | ||
48 | + valid_len = int(total_len * valid_ratio/100.) | ||
49 | + train_len = total_len - valid_len | ||
50 | + shuffled_train_valid_DB = train_valid_DB.sample(frac=1).reset_index(drop=True) | ||
51 | + # Split the DB into train and valid set | ||
52 | + train_DB = shuffled_train_valid_DB.iloc[:train_len] | ||
53 | + valid_DB = shuffled_train_valid_DB.iloc[train_len:] | ||
54 | + # Reset the index | ||
55 | + train_DB = train_DB.reset_index(drop=True) | ||
56 | + valid_DB = valid_DB.reset_index(drop=True) | ||
57 | + print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100)) | ||
58 | + print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100)) | ||
59 | + print('Total %d utts' %(total_len)) | ||
60 | + | ||
61 | + return train_DB, valid_DB | ||
62 | + | ||
63 | +def main(): | ||
64 | + # Set hyperparameters | ||
65 | + use_cuda = True # use gpu or cpu | ||
66 | + val_ratio = 10 # Percentage of validation set | ||
67 | + embedding_size = 128 | ||
68 | + start = 1 # Start epoch | ||
69 | + n_epochs = 30 # How many epochs? | ||
70 | + end = start + n_epochs # Last epoch | ||
71 | + | ||
72 | + lr = 1e-1 # Initial learning rate | ||
73 | + wd = 1e-4 # Weight decay (L2 penalty) | ||
74 | + optimizer_type = 'sgd' # one of: sgd, adam, adagrad | ||
75 | + | ||
76 | + batch_size = 64 # Batch size for training | ||
77 | + valid_batch_size = 16 # Batch size for validation | ||
78 | + use_shuffle = True # Shuffle for training or not | ||
79 | + | ||
80 | + # Load dataset | ||
81 | + train_dataset, valid_dataset, n_classes = load_dataset(val_ratio) | ||
82 | + | ||
83 | + # print the experiment configuration | ||
84 | + print('\nNumber of classes (speakers):\n{}\n'.format(n_classes)) | ||
85 | + | ||
86 | + log_dir = 'new_model4_zeroth' # where to save checkpoints | ||
87 | + | ||
88 | + if not os.path.exists(log_dir): | ||
89 | + os.makedirs(log_dir) | ||
90 | + | ||
91 | + # instantiate model and initialize weights | ||
92 | + model = background_resnet(embedding_size=embedding_size, num_classes=n_classes) | ||
93 | + | ||
94 | + if use_cuda: | ||
95 | + model.cuda() | ||
96 | + | ||
97 | + # define loss function (criterion), optimizer and scheduler | ||
98 | + criterion = nn.CrossEntropyLoss() | ||
99 | + optimizer = create_optimizer(optimizer_type, model, lr, wd) | ||
100 | + scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1) | ||
101 | + | ||
102 | + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, | ||
103 | + batch_size=batch_size, | ||
104 | + shuffle=use_shuffle) | ||
105 | + valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset, | ||
106 | + batch_size=valid_batch_size, | ||
107 | + shuffle=False, | ||
108 | + collate_fn = collate_fn_feat_padded) | ||
109 | + | ||
110 | + # to track the average training loss per epoch as the model trains | ||
111 | + avg_train_losses = [] | ||
112 | + # to track the average validation loss per epoch as the model trains | ||
113 | + avg_valid_losses = [] | ||
114 | + | ||
115 | + | ||
116 | + for epoch in range(start, end): | ||
117 | + | ||
118 | + # train for one epoch | ||
119 | + train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes) | ||
120 | + | ||
121 | + # evaluate on validation set | ||
122 | + valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch) | ||
123 | + | ||
124 | + scheduler.step(valid_loss) # ReduceLROnPlateau expects only the monitored metric | ||
125 | + | ||
126 | + # calculate average loss over an epoch | ||
127 | + avg_train_losses.append(train_loss) | ||
128 | + avg_valid_losses.append(valid_loss) | ||
129 | + # do checkpointing | ||
130 | + torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(), | ||
131 | + 'optimizer': optimizer.state_dict()}, | ||
132 | + '{}/checkpoint_{}.pth'.format(log_dir, epoch)) | ||
133 | + | ||
134 | + # find position of lowest validation loss | ||
135 | + minposs = avg_valid_losses.index(min(avg_valid_losses))+1 | ||
136 | + print('Lowest validation loss at epoch %d' %minposs) | ||
137 | + | ||
138 | + # visualize the loss and learning rate as the network trained | ||
139 | + visualize_the_losses(avg_train_losses, avg_valid_losses) | ||
140 | + | ||
141 | + | ||
142 | +def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes): | ||
143 | + batch_time = AverageMeter() | ||
144 | + losses = AverageMeter() | ||
145 | + train_acc = AverageMeter() | ||
146 | + | ||
147 | + n_correct, n_total = 0, 0 | ||
148 | + log_interval = 84 # print a progress line every 84 mini-batches | ||
149 | + # switch to train mode | ||
150 | + model.train() | ||
151 | + | ||
152 | + end = time.time() | ||
153 | + # pbar = tqdm(enumerate(train_loader)) | ||
154 | + for batch_idx, (data) in enumerate(train_loader): | ||
155 | + inputs, targets = data # target size:(batch size,1), input size:(batch size, 1, dim, win) | ||
156 | + targets = targets.view(-1) # target size:(batch size) | ||
157 | + current_sample = inputs.size(0) # batch size | ||
158 | + | ||
159 | + if use_cuda: | ||
160 | + inputs = inputs.cuda() | ||
161 | + targets = targets.cuda() | ||
162 | + _, output = model(inputs) # out size:(batch size, #classes), for softmax | ||
163 | + | ||
164 | + # calculate accuracy of predictions in the current batch | ||
165 | + n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item() | ||
166 | + n_total += current_sample | ||
167 | + train_acc_temp = 100. * n_correct / n_total | ||
168 | + train_acc.update(train_acc_temp, inputs.size(0)) | ||
169 | + | ||
170 | + loss = criterion(output, targets) | ||
171 | + losses.update(loss.item(), inputs.size(0)) | ||
172 | + | ||
173 | + # compute gradient and do SGD step | ||
174 | + optimizer.zero_grad() | ||
175 | + loss.backward() | ||
176 | + optimizer.step() | ||
177 | + | ||
178 | + # measure elapsed time | ||
179 | + batch_time.update(time.time() - end) | ||
180 | + end = time.time() | ||
181 | + | ||
182 | + if batch_idx % log_interval == 0: | ||
183 | + print( | ||
184 | + 'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t' | ||
185 | + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' | ||
186 | + 'Loss {loss.avg:.4f}\t' | ||
187 | + 'Acc {train_acc.avg:.4f}'.format( | ||
188 | + epoch, batch_idx * len(inputs), len(train_loader.dataset), | ||
189 | + 100. * batch_idx / len(train_loader), | ||
190 | + batch_time=batch_time, loss=losses, train_acc=train_acc)) | ||
191 | + return losses.avg | ||
192 | + | ||
193 | +def validate(val_loader, model, criterion, use_cuda, epoch): | ||
194 | + batch_time = AverageMeter() | ||
195 | + losses = AverageMeter() | ||
196 | + val_acc = AverageMeter() | ||
197 | + | ||
198 | + n_correct, n_total = 0, 0 | ||
199 | + | ||
200 | + # switch to evaluate mode | ||
201 | + model.eval() | ||
202 | + | ||
203 | + with torch.no_grad(): | ||
204 | + end = time.time() | ||
205 | + for i, (data) in enumerate(val_loader): | ||
206 | + inputs, targets = data | ||
207 | + current_sample = inputs.size(0) # batch size | ||
208 | + | ||
209 | + if use_cuda: | ||
210 | + inputs = inputs.cuda() | ||
211 | + targets = targets.cuda() | ||
212 | + | ||
213 | + # compute output | ||
214 | + _, output = model(inputs) | ||
215 | + | ||
216 | + # measure accuracy and record loss | ||
217 | + n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item() | ||
218 | + n_total += current_sample | ||
219 | + val_acc_temp = 100. * n_correct / n_total | ||
220 | + val_acc.update(val_acc_temp, inputs.size(0)) | ||
221 | + | ||
222 | + loss = criterion(output, targets) | ||
223 | + losses.update(loss.item(), inputs.size(0)) | ||
224 | + # measure elapsed time | ||
225 | + batch_time.update(time.time() - end) | ||
226 | + end = time.time() | ||
227 | + | ||
228 | + print(' * Validation: ' | ||
229 | + 'Loss {loss.avg:.4f}\t' | ||
230 | + 'Acc {val_acc.avg:.4f}'.format( | ||
231 | + loss=losses, val_acc=val_acc)) | ||
232 | + | ||
233 | + return losses.avg | ||
234 | + | ||
235 | +class AverageMeter(object): | ||
236 | + """Computes and stores the average and current value""" | ||
237 | + def __init__(self): | ||
238 | + self.reset() | ||
239 | + def reset(self): | ||
240 | + self.val = 0 | ||
241 | + self.avg = 0 | ||
242 | + self.sum = 0 | ||
243 | + self.count = 0 | ||
244 | + def update(self, val, n=1): | ||
245 | + self.val = val | ||
246 | + self.sum += val * n | ||
247 | + self.count += n | ||
248 | + self.avg = self.sum / self.count | ||
249 | + | ||
250 | +def create_optimizer(optimizer, model, new_lr, wd): | ||
251 | + # setup optimizer | ||
252 | + if optimizer == 'sgd': | ||
253 | + optimizer = optim.SGD(model.parameters(), lr=new_lr, | ||
254 | + momentum=0.9, dampening=0, | ||
255 | + weight_decay=wd) | ||
256 | + elif optimizer == 'adam': | ||
257 | + optimizer = optim.Adam(model.parameters(), lr=new_lr, | ||
258 | + weight_decay=wd) | ||
259 | + elif optimizer == 'adagrad': | ||
260 | + optimizer = optim.Adagrad(model.parameters(), | ||
261 | + lr=new_lr, | ||
262 | + weight_decay=wd) | ||
263 | + return optimizer | ||
264 | + | ||
265 | +def visualize_the_losses(train_loss, valid_loss): | ||
266 | + fig = plt.figure(figsize=(10,8)) | ||
267 | + plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss') | ||
268 | + plt.plot(range(1,len(valid_loss)+1),valid_loss, label='Validation Loss') | ||
269 | + | ||
270 | + # find position of lowest validation loss | ||
271 | + minposs = valid_loss.index(min(valid_loss))+1 | ||
272 | + plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint') | ||
273 | + | ||
274 | + plt.xlabel('epochs') | ||
275 | + plt.ylabel('loss') | ||
276 | + plt.ylim(0, 3.5) # consistent scale | ||
277 | + plt.xlim(0, len(train_loss)+1) # consistent scale | ||
278 | + plt.grid(True) | ||
279 | + plt.legend() | ||
280 | + plt.tight_layout() | ||
281 | + #plt.show() | ||
282 | + fig.savefig('train4_zeroth.png', bbox_inches='tight') | ||
283 | + | ||
284 | +if __name__ == '__main__': | ||
285 | + main() |
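split_train_dev above shuffles with DataFrame.sample(frac=1) and no fixed seed, so every run yields a different train/validation split. Below is a sketch of a seeded variant for reproducible experiments; split_train_dev_seeded and its seed parameter are illustrative additions, not part of the repository.

import pandas as pd

def split_train_dev_seeded(train_valid_DB, valid_ratio, seed=0):
    # Same split logic as split_train_dev, but sample() gets a fixed
    # random_state so the split is identical across runs.
    total_len = len(train_valid_DB)
    valid_len = int(total_len * valid_ratio / 100.)
    train_len = total_len - valid_len
    shuffled = train_valid_DB.sample(frac=1, random_state=seed).reset_index(drop=True)
    train_DB = shuffled.iloc[:train_len].reset_index(drop=True)
    valid_DB = shuffled.iloc[train_len:].reset_index(drop=True)
    return train_DB, valid_DB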
Speaker_Recognition/train5.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn as nn | ||
3 | +import torch.optim as optim | ||
4 | +import torchvision.transforms as transforms | ||
5 | + | ||
6 | +import time | ||
7 | +import os | ||
8 | +import numpy as np | ||
9 | +import configure as c | ||
10 | +import pandas as pd | ||
11 | +from DB_wav_reader import read_feats_structure | ||
12 | +from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded | ||
13 | +from model.model5 import background_resnet | ||
14 | +import matplotlib as mpl | ||
15 | +mpl.use('Agg') | ||
16 | +import matplotlib.pyplot as plt | ||
17 | + | ||
18 | + | ||
19 | +def load_dataset(val_ratio): | ||
20 | + # Load training set and validation set | ||
21 | + | ||
22 | + | ||
23 | + # Split training set into training set and validation set according to "val_ratio" | ||
24 | + train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio) | ||
25 | + | ||
26 | + file_loader = read_MFB # numpy array:(n_frames, n_dims) | ||
27 | + | ||
28 | + transform = transforms.Compose([ | ||
29 | + TruncatedInputfromMFB(), # numpy array:(1, n_frames, n_dims) | ||
30 | + ToTensorInput() # torch tensor:(1, n_dims, n_frames) | ||
31 | + ]) | ||
32 | + transform_T = ToTensorDevInput() | ||
33 | + | ||
34 | + | ||
35 | + speaker_list = sorted(set(train_DB['speaker_id'])) # len(speaker_list) == n_speakers | ||
36 | + spk_to_idx = {spk: i for i, spk in enumerate(speaker_list)} | ||
37 | + | ||
38 | + train_dataset = DvectorDataset(DB=train_DB, loader=file_loader, transform=transform, spk_to_idx=spk_to_idx) | ||
39 | + valid_dataset = DvectorDataset(DB=valid_DB, loader=file_loader, transform=transform_T, spk_to_idx=spk_to_idx) | ||
40 | + | ||
41 | + n_classes = len(speaker_list) # number of distinct speakers in the training split | ||
42 | + return train_dataset, valid_dataset, n_classes | ||
43 | + | ||
44 | +def split_train_dev(train_feat_dir, valid_ratio): | ||
45 | + train_valid_DB = read_feats_structure(train_feat_dir) | ||
46 | + total_len = len(train_valid_DB) # total number of utterances | ||
47 | + valid_len = int(total_len * valid_ratio/100.) | ||
48 | + train_len = total_len - valid_len | ||
49 | + shuffled_train_valid_DB = train_valid_DB.sample(frac=1).reset_index(drop=True) | ||
50 | + # Split the DB into train and valid set | ||
51 | + train_DB = shuffled_train_valid_DB.iloc[:train_len] | ||
52 | + valid_DB = shuffled_train_valid_DB.iloc[train_len:] | ||
53 | + # Reset the index | ||
54 | + train_DB = train_DB.reset_index(drop=True) | ||
55 | + valid_DB = valid_DB.reset_index(drop=True) | ||
56 | + print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100)) | ||
57 | + print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100)) | ||
58 | + print('Total %d utts' %(total_len)) | ||
59 | + | ||
60 | + return train_DB, valid_DB | ||
61 | + | ||
62 | +def main(): | ||
63 | + # Set hyperparameters | ||
64 | + use_cuda = True # use gpu or cpu | ||
65 | + val_ratio = 10 # Percentage of validation set | ||
66 | + embedding_size = 128 | ||
67 | + start = 1 # Start epoch | ||
68 | + n_epochs = 30 # How many epochs? | ||
69 | + end = start + n_epochs # Last epoch | ||
70 | + | ||
71 | + lr = 1e-1 # Initial learning rate | ||
72 | + wd = 1e-4 # Weight decay (L2 penalty) | ||
73 | + optimizer_type = 'sgd' # one of: sgd, adam, adagrad | ||
74 | + | ||
75 | + batch_size = 64 # Batch size for training | ||
76 | + valid_batch_size = 16 # Batch size for validation | ||
77 | + use_shuffle = True # Shuffle for training or not | ||
78 | + | ||
79 | + # Load dataset | ||
80 | + train_dataset, valid_dataset, n_classes = load_dataset(val_ratio) | ||
81 | + | ||
82 | + # print the experiment configuration | ||
83 | + print('\nNumber of classes (speakers):\n{}\n'.format(n_classes)) | ||
84 | + | ||
85 | + log_dir = 'new_model5' # where to save checkpoints | ||
86 | + | ||
87 | + if not os.path.exists(log_dir): | ||
88 | + os.makedirs(log_dir) | ||
89 | + | ||
90 | + # instantiate model and initialize weights | ||
91 | + model = background_resnet(embedding_size=embedding_size, num_classes=n_classes) | ||
92 | + | ||
93 | + if use_cuda: | ||
94 | + model.cuda() | ||
95 | + | ||
96 | + # define loss function (criterion), optimizer and scheduler | ||
97 | + criterion = nn.CrossEntropyLoss() | ||
98 | + optimizer = create_optimizer(optimizer_type, model, lr, wd) | ||
99 | + scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1) | ||
100 | + | ||
101 | + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, | ||
102 | + batch_size=batch_size, | ||
103 | + shuffle=use_shuffle) | ||
104 | + valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset, | ||
105 | + batch_size=valid_batch_size, | ||
106 | + shuffle=False, | ||
107 | + collate_fn = collate_fn_feat_padded) | ||
108 | + | ||
109 | + # to track the average training loss per epoch as the model trains | ||
110 | + avg_train_losses = [] | ||
111 | + # to track the average validation loss per epoch as the model trains | ||
112 | + avg_valid_losses = [] | ||
113 | + | ||
114 | + | ||
115 | + for epoch in range(start, end): | ||
116 | + | ||
117 | + # train for one epoch | ||
118 | + train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes) | ||
119 | + | ||
120 | + # evaluate on validation set | ||
121 | + valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch) | ||
122 | + | ||
123 | + scheduler.step(valid_loss) # ReduceLROnPlateau expects only the monitored metric | ||
124 | + | ||
125 | + # calculate average loss over an epoch | ||
126 | + avg_train_losses.append(train_loss) | ||
127 | + avg_valid_losses.append(valid_loss) | ||
128 | + # do checkpointing | ||
129 | + torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(), | ||
130 | + 'optimizer': optimizer.state_dict()}, | ||
131 | + '{}/checkpoint_{}.pth'.format(log_dir, epoch)) | ||
132 | + | ||
133 | + # find position of lowest validation loss | ||
134 | + minposs = avg_valid_losses.index(min(avg_valid_losses))+1 | ||
135 | + print('Lowest validation loss at epoch %d' %minposs) | ||
136 | + | ||
137 | + # visualize the loss and learning rate as the network trained | ||
138 | + visualize_the_losses(avg_train_losses, avg_valid_losses) | ||
139 | + | ||
140 | + | ||
141 | +def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes): | ||
142 | + batch_time = AverageMeter() | ||
143 | + losses = AverageMeter() | ||
144 | + train_acc = AverageMeter() | ||
145 | + | ||
146 | + n_correct, n_total = 0, 0 | ||
147 | + log_interval = 84 # print a progress line every 84 mini-batches | ||
148 | + # switch to train mode | ||
149 | + model.train() | ||
150 | + | ||
151 | + end = time.time() | ||
152 | + # pbar = tqdm(enumerate(train_loader)) | ||
153 | + for batch_idx, (data) in enumerate(train_loader): | ||
154 | + inputs, targets = data # target size:(batch size,1), input size:(batch size, 1, dim, win) | ||
155 | + targets = targets.view(-1) # target size:(batch size) | ||
156 | + current_sample = inputs.size(0) # batch size | ||
157 | + | ||
158 | + if use_cuda: | ||
159 | + inputs = inputs.cuda() | ||
160 | + targets = targets.cuda() | ||
161 | + _, output = model(inputs) # out size:(batch size, #classes), for softmax | ||
162 | + | ||
163 | + # calculate accuracy of predictions in the current batch | ||
164 | + n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item() | ||
165 | + n_total += current_sample | ||
166 | + train_acc_temp = 100. * n_correct / n_total | ||
167 | + train_acc.update(train_acc_temp, inputs.size(0)) | ||
168 | + | ||
169 | + loss = criterion(output, targets) | ||
170 | + losses.update(loss.item(), inputs.size(0)) | ||
171 | + | ||
172 | + # compute gradient and do SGD step | ||
173 | + optimizer.zero_grad() | ||
174 | + loss.backward() | ||
175 | + optimizer.step() | ||
176 | + | ||
177 | + # measure elapsed time | ||
178 | + batch_time.update(time.time() - end) | ||
179 | + end = time.time() | ||
180 | + | ||
181 | + if batch_idx % log_interval == 0: | ||
182 | + print( | ||
183 | + 'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t' | ||
184 | + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' | ||
185 | + 'Loss {loss.avg:.4f}\t' | ||
186 | + 'Acc {train_acc.avg:.4f}'.format( | ||
187 | + epoch, batch_idx * len(inputs), len(train_loader.dataset), | ||
188 | + 100. * batch_idx / len(train_loader), | ||
189 | + batch_time=batch_time, loss=losses, train_acc=train_acc)) | ||
190 | + return losses.avg | ||
191 | + | ||
192 | +def validate(val_loader, model, criterion, use_cuda, epoch): | ||
193 | + batch_time = AverageMeter() | ||
194 | + losses = AverageMeter() | ||
195 | + val_acc = AverageMeter() | ||
196 | + | ||
197 | + n_correct, n_total = 0, 0 | ||
198 | + | ||
199 | + # switch to evaluate mode | ||
200 | + model.eval() | ||
201 | + | ||
202 | + with torch.no_grad(): | ||
203 | + end = time.time() | ||
204 | + for i, (data) in enumerate(val_loader): | ||
205 | + inputs, targets = data | ||
206 | + current_sample = inputs.size(0) # batch size | ||
207 | + | ||
208 | + if use_cuda: | ||
209 | + inputs = inputs.cuda() | ||
210 | + targets = targets.cuda() | ||
211 | + | ||
212 | + # compute output | ||
213 | + _, output = model(inputs) | ||
214 | + | ||
215 | + # measure accuracy and record loss | ||
216 | + n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item() | ||
217 | + n_total += current_sample | ||
218 | + val_acc_temp = 100. * n_correct / n_total | ||
219 | + val_acc.update(val_acc_temp, inputs.size(0)) | ||
220 | + | ||
221 | + loss = criterion(output, targets) | ||
222 | + losses.update(loss.item(), inputs.size(0)) | ||
223 | + # measure elapsed time | ||
224 | + batch_time.update(time.time() - end) | ||
225 | + end = time.time() | ||
226 | + | ||
227 | + print(' * Validation: ' | ||
228 | + 'Loss {loss.avg:.4f}\t' | ||
229 | + 'Acc {val_acc.avg:.4f}'.format( | ||
230 | + loss=losses, val_acc=val_acc)) | ||
231 | + | ||
232 | + return losses.avg | ||
233 | + | ||
234 | +class AverageMeter(object): | ||
235 | + """Computes and stores the average and current value""" | ||
236 | + def __init__(self): | ||
237 | + self.reset() | ||
238 | + def reset(self): | ||
239 | + self.val = 0 | ||
240 | + self.avg = 0 | ||
241 | + self.sum = 0 | ||
242 | + self.count = 0 | ||
243 | + def update(self, val, n=1): | ||
244 | + self.val = val | ||
245 | + self.sum += val * n | ||
246 | + self.count += n | ||
247 | + self.avg = self.sum / self.count | ||
248 | + | ||
249 | +def create_optimizer(optimizer, model, new_lr, wd): | ||
250 | + # setup optimizer | ||
251 | + if optimizer == 'sgd': | ||
252 | + optimizer = optim.SGD(model.parameters(), lr=new_lr, | ||
253 | + momentum=0.9, dampening=0, | ||
254 | + weight_decay=wd) | ||
255 | + elif optimizer == 'adam': | ||
256 | + optimizer = optim.Adam(model.parameters(), lr=new_lr, | ||
257 | + weight_decay=wd) | ||
258 | + elif optimizer == 'adagrad': | ||
259 | + optimizer = optim.Adagrad(model.parameters(), | ||
260 | + lr=new_lr, | ||
261 | + weight_decay=wd) | ||
262 | + return optimizer | ||
263 | + | ||
264 | +def visualize_the_losses(train_loss, valid_loss): | ||
265 | + fig = plt.figure(figsize=(10,8)) | ||
266 | + plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss') | ||
267 | + plt.plot(range(1,len(valid_loss)+1),valid_loss, label='Validation Loss') | ||
268 | + | ||
269 | + # find position of lowest validation loss | ||
270 | + minposs = valid_loss.index(min(valid_loss))+1 | ||
271 | + plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint') | ||
272 | + | ||
273 | + plt.xlabel('epochs') | ||
274 | + plt.ylabel('loss') | ||
275 | + plt.ylim(0, 3.5) # consistent scale | ||
276 | + plt.xlim(0, len(train_loss)+1) # consistent scale | ||
277 | + plt.grid(True) | ||
278 | + plt.legend() | ||
279 | + plt.tight_layout() | ||
280 | + #plt.show() | ||
281 | + fig.savefig('train5.png', bbox_inches='tight') | ||
282 | + | ||
283 | +if __name__ == '__main__': | ||
284 | + main() |
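The AverageMeter class used throughout these scripts weights each update by its batch size, so .avg is a true per-sample mean even when the final batch is smaller than the rest. A minimal usage sketch with made-up loss values, assuming the AverageMeter definition above is in scope:

meter = AverageMeter()
meter.update(0.50, n=64)  # batch of 64 with mean loss 0.50
meter.update(0.80, n=16)  # smaller final batch with mean loss 0.80
print(meter.avg)          # (0.50*64 + 0.80*16) / 80 = 0.56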
Speaker_Recognition/verification3.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn.functional as F | ||
3 | +from torch.autograd import Variable | ||
4 | + | ||
5 | +import pandas as pd | ||
6 | +import math | ||
7 | +import os | ||
8 | +import configure as c | ||
9 | + | ||
10 | +from DB_wav_reader import read_feats_structure | ||
11 | +from SR_Dataset import read_MFB, ToTensorTestInput | ||
12 | +from model.model3 import background_resnet | ||
13 | + | ||
14 | +def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes): | ||
15 | + model = background_resnet(embedding_size=embedding_size, num_classes=n_classes) | ||
16 | + if use_cuda: | ||
17 | + model.cuda() | ||
18 | + print('=> loading checkpoint') | ||
19 | + # load the checkpoint saved during training | ||
20 | + checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth') | ||
21 | + # the state dict is loaded as-is (it was saved without a DataParallel 'module.' prefix) | ||
22 | + model.load_state_dict(checkpoint['state_dict']) | ||
23 | + model.eval() | ||
24 | + return model | ||
25 | + | ||
26 | +def split_enroll_and_test(dataroot_dir): | ||
27 | + DB_all = read_feats_structure(dataroot_dir) | ||
28 | + enroll_DB = pd.DataFrame() | ||
29 | + test_DB = pd.DataFrame() | ||
30 | + | ||
31 | + enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')] | ||
32 | + test_DB = DB_all[DB_all['filename'].str.contains('test.p')] | ||
33 | + | ||
34 | + # Reset the index | ||
35 | + enroll_DB = enroll_DB.reset_index(drop=True) | ||
36 | + test_DB = test_DB.reset_index(drop=True) | ||
37 | + return enroll_DB, test_DB | ||
38 | + | ||
39 | +def load_enroll_embeddings(embedding_dir): | ||
40 | + embeddings = {} | ||
41 | + for f in os.listdir(embedding_dir): | ||
42 | + spk = f.replace('.pth','') | ||
43 | + # Select the speakers who are in the 'enroll_spk_list' | ||
44 | + embedding_path = os.path.join(embedding_dir, f) | ||
45 | + tmp_embeddings = torch.load(embedding_path) | ||
46 | + embeddings[spk] = tmp_embeddings | ||
47 | + | ||
48 | + return embeddings | ||
49 | + | ||
50 | +def get_embeddings(use_cuda, filename, model, test_frames): | ||
51 | + input, label = read_MFB(filename) # input size:(n_frames, n_dims) | ||
52 | + | ||
53 | + tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames' | ||
54 | + activation = 0 | ||
55 | + with torch.no_grad(): | ||
56 | + for i in range(tot_segments): | ||
57 | + temp_input = input[i*test_frames:i*test_frames+test_frames] | ||
58 | + | ||
59 | + TT = ToTensorTestInput() | ||
60 | + temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames) | ||
61 | + | ||
62 | + if use_cuda: | ||
63 | + temp_input = temp_input.cuda() | ||
64 | + temp_activation,_ = model(temp_input) | ||
65 | + activation += torch.sum(temp_activation, dim=0, keepdim=True) | ||
66 | + | ||
67 | + activation = l2_norm(activation, 1) | ||
68 | + | ||
69 | + return activation | ||
70 | + | ||
71 | +def l2_norm(input, alpha): | ||
72 | + input_size = input.size() # size:(n_frames, dim) | ||
73 | + buffer = torch.pow(input, 2) # 2 denotes a squared operation. size:(n_frames, dim) | ||
74 | + normp = torch.sum(buffer, 1).add_(1e-10) # size:(n_frames) | ||
75 | + norm = torch.sqrt(normp) # size:(n_frames) | ||
76 | + _output = torch.div(input, norm.view(-1, 1).expand_as(input)) | ||
77 | + output = _output.view(input_size) | ||
78 | + # Multiply by alpha = 10 as suggested in https://arxiv.org/pdf/1703.09507.pdf | ||
79 | + output = output * alpha | ||
80 | + return output | ||
81 | + | ||
82 | +def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres): | ||
83 | + enroll_embedding = embeddings[enroll_speaker] | ||
84 | + test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames) | ||
85 | + | ||
86 | + score = F.cosine_similarity(test_embedding, enroll_embedding) | ||
87 | + score = score.item() # extract the scalar similarity score | ||
88 | + | ||
89 | + if score > thres: | ||
90 | + result = 'Accept' | ||
91 | + else: | ||
92 | + result = 'Reject' | ||
93 | + | ||
94 | + test_spk = test_filename.split('/')[-2].split('_')[0] | ||
95 | + print("\n=== Speaker verification ===") | ||
96 | + print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result)) | ||
97 | + print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres)) | ||
98 | + | ||
99 | +def main(): | ||
100 | + | ||
101 | + log_dir = 'new_model3' # Where the checkpoints are saved | ||
102 | + embedding_dir = 'enroll_embeddings3' # Where embeddings are saved | ||
103 | + test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved | ||
104 | + | ||
105 | + # Settings | ||
106 | + use_cuda = True # Use cuda or not | ||
107 | + embedding_size = 128 # Dimension of speaker embeddings | ||
108 | + cp_num = 11 # Which checkpoint to use? | ||
109 | + n_classes = 241 # How many speakers in training data? | ||
110 | + test_frames = 100 # split each test utterance into 100-frame segments | ||
111 | + | ||
112 | + # Load model from checkpoint | ||
113 | + model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes) | ||
114 | + | ||
115 | + # Get the dataframe for test DB | ||
116 | + enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR) | ||
117 | + | ||
118 | + # Load enroll embeddings | ||
119 | + embeddings = load_enroll_embeddings(embedding_dir) | ||
120 | + | ||
121 | + """ Test speaker list | ||
122 | + '103F3021', '207F2088', '213F5100', '217F3038', '225M4062', | ||
123 | + '229M2031', '230M4087', '233F4013', '236M3043', '240M3063' | ||
124 | + """ | ||
125 | + | ||
126 | + # Set the true speaker | ||
127 | + enroll_speaker = '103F3021' | ||
128 | + | ||
129 | + # Set the claimed speaker | ||
130 | + test_speaker = '207F2088' | ||
131 | + | ||
132 | + # Threshold | ||
133 | + thres = 0.95 | ||
134 | + | ||
135 | + test_path = os.path.join(test_dir, test_speaker, 'test.p') | ||
136 | + | ||
137 | + # Perform the test | ||
138 | + perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres) | ||
139 | + | ||
140 | +if __name__ == '__main__': | ||
141 | + main() |
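load_model above loads the state dict as-is, which works because the training scripts save from an unwrapped model. If a checkpoint were ever saved from nn.DataParallel, its keys would carry a 'module.' prefix that must be stripped before loading into a plain model. A sketch of that stripping; strip_data_parallel_prefix is a hypothetical helper, not part of the repository.

from collections import OrderedDict

def strip_data_parallel_prefix(state_dict):
    # Rebuild the state dict with any leading 'module.' removed so a
    # checkpoint saved from nn.DataParallel loads into a plain model.
    new_state = OrderedDict()
    for key, value in state_dict.items():
        new_key = key[len('module.'):] if key.startswith('module.') else key
        new_state[new_key] = value
    return new_state

# usage: model.load_state_dict(strip_data_parallel_prefix(checkpoint['state_dict']))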
Speaker_Recognition/verification4.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn.functional as F | ||
3 | +from torch.autograd import Variable | ||
4 | + | ||
5 | +import pandas as pd | ||
6 | +import math | ||
7 | +import os | ||
8 | +import configure as c | ||
9 | + | ||
10 | +from DB_wav_reader import read_feats_structure | ||
11 | +from SR_Dataset import read_MFB, ToTensorTestInput | ||
12 | +from model.model4 import background_resnet | ||
13 | + | ||
14 | +def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes): | ||
15 | + model = background_resnet(embedding_size=embedding_size, num_classes=n_classes) | ||
16 | + if use_cuda: | ||
17 | + model.cuda() | ||
18 | + print('=> loading checkpoint') | ||
19 | + # load the checkpoint saved during training | ||
20 | + checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth') | ||
21 | + # the state dict is loaded as-is (it was saved without a DataParallel 'module.' prefix) | ||
22 | + model.load_state_dict(checkpoint['state_dict']) | ||
23 | + model.eval() | ||
24 | + return model | ||
25 | + | ||
26 | +def split_enroll_and_test(dataroot_dir): | ||
27 | + DB_all = read_feats_structure(dataroot_dir) | ||
28 | + enroll_DB = pd.DataFrame() | ||
29 | + test_DB = pd.DataFrame() | ||
30 | + | ||
31 | + enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')] | ||
32 | + test_DB = DB_all[DB_all['filename'].str.contains('test.p')] | ||
33 | + | ||
34 | + # Reset the index | ||
35 | + enroll_DB = enroll_DB.reset_index(drop=True) | ||
36 | + test_DB = test_DB.reset_index(drop=True) | ||
37 | + return enroll_DB, test_DB | ||
38 | + | ||
39 | +def load_enroll_embeddings(embedding_dir): | ||
40 | + embeddings = {} | ||
41 | + for f in os.listdir(embedding_dir): | ||
42 | + spk = f.replace('.pth','') | ||
43 | + # Select the speakers who are in the 'enroll_spk_list' | ||
44 | + embedding_path = os.path.join(embedding_dir, f) | ||
45 | + tmp_embeddings = torch.load(embedding_path) | ||
46 | + embeddings[spk] = tmp_embeddings | ||
47 | + | ||
48 | + return embeddings | ||
49 | + | ||
50 | +def get_embeddings(use_cuda, filename, model, test_frames): | ||
51 | + input, label = read_MFB(filename) # input size:(n_frames, n_dims) | ||
52 | + | ||
53 | + tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames' | ||
54 | + activation = 0 | ||
55 | + with torch.no_grad(): | ||
56 | + for i in range(tot_segments): | ||
57 | + temp_input = input[i*test_frames:i*test_frames+test_frames] | ||
58 | + | ||
59 | + TT = ToTensorTestInput() | ||
60 | + temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames) | ||
61 | + | ||
62 | + if use_cuda: | ||
63 | + temp_input = temp_input.cuda() | ||
64 | + temp_activation,_ = model(temp_input) | ||
65 | + activation += torch.sum(temp_activation, dim=0, keepdim=True) | ||
66 | + | ||
67 | + activation = l2_norm(activation, 1) | ||
68 | + | ||
69 | + return activation | ||
70 | + | ||
71 | +def l2_norm(input, alpha): | ||
72 | + input_size = input.size() # size:(n_frames, dim) | ||
73 | + buffer = torch.pow(input, 2) # 2 denotes a squared operation. size:(n_frames, dim) | ||
74 | + normp = torch.sum(buffer, 1).add_(1e-10) # size:(n_frames) | ||
75 | + norm = torch.sqrt(normp) # size:(n_frames) | ||
76 | + _output = torch.div(input, norm.view(-1, 1).expand_as(input)) | ||
77 | + output = _output.view(input_size) | ||
78 | + # Multiply by alpha = 10 as suggested in https://arxiv.org/pdf/1703.09507.pdf | ||
79 | + output = output * alpha | ||
80 | + return output | ||
81 | + | ||
82 | +def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres): | ||
83 | + enroll_embedding = embeddings[enroll_speaker] | ||
84 | + test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames) | ||
85 | + | ||
86 | + score = F.cosine_similarity(test_embedding, enroll_embedding) | ||
87 | + score = score.item() # extract the scalar similarity score | ||
88 | + | ||
89 | + if score > thres: | ||
90 | + result = 'Accept' | ||
91 | + else: | ||
92 | + result = 'Reject' | ||
93 | + | ||
94 | + test_spk = test_filename.split('/')[-2].split('_')[0] | ||
95 | + print("\n=== Speaker verification ===") | ||
96 | + print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result)) | ||
97 | + print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres)) | ||
98 | + | ||
99 | +def main(): | ||
100 | + | ||
101 | + log_dir = 'new_model4' # Where the checkpoints are saved | ||
102 | + embedding_dir = 'enroll_embeddings4' # Where embeddings are saved | ||
103 | + test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved | ||
104 | + | ||
105 | + # Settings | ||
106 | + use_cuda = True # Use cuda or not | ||
107 | + embedding_size = 128 # Dimension of speaker embeddings | ||
108 | + cp_num = 25 # Which checkpoint to use? | ||
109 | + n_classes = 241 # How many speakers in training data? | ||
110 | + test_frames = 100 # split each test utterance into 100-frame segments | ||
111 | + | ||
112 | + # Load model from checkpoint | ||
113 | + model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes) | ||
114 | + | ||
115 | + # Get the dataframe for test DB | ||
116 | + enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR) | ||
117 | + | ||
118 | + # Load enroll embeddings | ||
119 | + embeddings = load_enroll_embeddings(embedding_dir) | ||
120 | + | ||
121 | + """ Test speaker list | ||
122 | + '103F3021', '207F2088', '213F5100', '217F3038', '225M4062', | ||
123 | + '229M2031', '230M4087', '233F4013', '236M3043', '240M3063' | ||
124 | + """ | ||
125 | + | ||
126 | + # Set the true speaker | ||
127 | + enroll_speaker = '229M2031' | ||
128 | + | ||
129 | + # Set the claimed speaker | ||
130 | + test_speaker = 'sunghwan1' | ||
131 | + | ||
132 | + # Threshold | ||
133 | + thres = 0.95 | ||
134 | + | ||
135 | + test_path = os.path.join(test_dir, test_speaker, 'test.p') | ||
136 | + | ||
137 | + # Perform the test | ||
138 | + perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres) | ||
139 | + | ||
140 | +if __name__ == '__main__': | ||
141 | + main() |
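get_embeddings splits an utterance into fixed-length segments, sums the per-segment model activations, and L2-normalizes the sum into a single utterance embedding. A toy sketch of that segment-sum-then-normalize pattern on random features; the raw frame sums below stand in for model activations, and all sizes are made up.

import math
import torch
import torch.nn.functional as F

n_frames, n_dims, test_frames = 250, 128, 100
feats = torch.randn(n_frames, n_dims)             # stand-in for read_MFB output

tot_segments = math.ceil(n_frames / test_frames)  # 3 segments: 100 + 100 + 50 frames
activation = torch.zeros(1, n_dims)
for i in range(tot_segments):
    segment = feats[i * test_frames:(i + 1) * test_frames]
    activation += segment.sum(dim=0, keepdim=True)   # stand-in for model output

embedding = F.normalize(activation, p=2, dim=1)   # same effect as l2_norm(activation, 1)
print(embedding.norm())                           # ~1.0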
Speaker_Recognition/verification4_merge.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn.functional as F | ||
3 | +from torch.autograd import Variable | ||
4 | + | ||
5 | +import pandas as pd | ||
6 | +import math | ||
7 | +import os | ||
8 | +import configure as c | ||
9 | + | ||
10 | +from DB_wav_reader import read_feats_structure | ||
11 | +from SR_Dataset import read_MFB, ToTensorTestInput | ||
12 | +from model.model4 import background_resnet | ||
13 | + | ||
14 | +def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes): | ||
15 | + model = background_resnet(embedding_size=embedding_size, num_classes=n_classes) | ||
16 | + if use_cuda: | ||
17 | + model.cuda() | ||
18 | + print('=> loading checkpoint') | ||
19 | + # load the checkpoint saved during training | ||
20 | + checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth') | ||
21 | + # the state dict is loaded as-is (it was saved without a DataParallel 'module.' prefix) | ||
22 | + model.load_state_dict(checkpoint['state_dict']) | ||
23 | + model.eval() | ||
24 | + return model | ||
25 | + | ||
26 | +def split_enroll_and_test(dataroot_dir): | ||
27 | + DB_all = read_feats_structure(dataroot_dir) | ||
28 | + enroll_DB = pd.DataFrame() | ||
29 | + test_DB = pd.DataFrame() | ||
30 | + | ||
31 | + enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')] | ||
32 | + test_DB = DB_all[DB_all['filename'].str.contains('test.p')] | ||
33 | + | ||
34 | + # Reset the index | ||
35 | + enroll_DB = enroll_DB.reset_index(drop=True) | ||
36 | + test_DB = test_DB.reset_index(drop=True) | ||
37 | + return enroll_DB, test_DB | ||
38 | + | ||
39 | +def load_enroll_embeddings(embedding_dir): | ||
40 | + embeddings = {} | ||
41 | + for f in os.listdir(embedding_dir): | ||
42 | + spk = f.replace('.pth','') | ||
43 | + # Select the speakers who are in the 'enroll_spk_list' | ||
44 | + embedding_path = os.path.join(embedding_dir, f) | ||
45 | + tmp_embeddings = torch.load(embedding_path) | ||
46 | + embeddings[spk] = tmp_embeddings | ||
47 | + | ||
48 | + return embeddings | ||
49 | + | ||
50 | +def get_embeddings(use_cuda, filename, model, test_frames): | ||
51 | + input, label = read_MFB(filename) # input size:(n_frames, n_dims) | ||
52 | + | ||
53 | + tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames' | ||
54 | + activation = 0 | ||
55 | + with torch.no_grad(): | ||
56 | + for i in range(tot_segments): | ||
57 | + temp_input = input[i*test_frames:i*test_frames+test_frames] | ||
58 | + | ||
59 | + TT = ToTensorTestInput() | ||
60 | + temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames) | ||
61 | + | ||
62 | + if use_cuda: | ||
63 | + temp_input = temp_input.cuda() | ||
64 | + temp_activation,_ = model(temp_input) | ||
65 | + activation += torch.sum(temp_activation, dim=0, keepdim=True) | ||
66 | + | ||
67 | + activation = l2_norm(activation, 1) | ||
68 | + | ||
69 | + return activation | ||
70 | + | ||
71 | +def l2_norm(input, alpha): | ||
72 | + input_size = input.size() # size:(n_frames, dim) | ||
73 | + buffer = torch.pow(input, 2) # 2 denotes a squared operation. size:(n_frames, dim) | ||
74 | + normp = torch.sum(buffer, 1).add_(1e-10) # size:(n_frames) | ||
75 | + norm = torch.sqrt(normp) # size:(n_frames) | ||
76 | + _output = torch.div(input, norm.view(-1, 1).expand_as(input)) | ||
77 | + output = _output.view(input_size) | ||
78 | + # Multiply by alpha = 10 as suggested in https://arxiv.org/pdf/1703.09507.pdf | ||
79 | + output = output * alpha | ||
80 | + return output | ||
81 | + | ||
82 | +def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres): | ||
83 | + enroll_embedding = embeddings[enroll_speaker] | ||
84 | + test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames) | ||
85 | + | ||
86 | + score = F.cosine_similarity(test_embedding, enroll_embedding) | ||
87 | + score = score.item() # extract the scalar similarity score | ||
88 | + | ||
89 | + if score > thres: | ||
90 | + result = 'Accept' | ||
91 | + else: | ||
92 | + result = 'Reject' | ||
93 | + | ||
94 | + test_spk = test_filename.split('/')[-2].split('_')[0] | ||
95 | + print("\n=== Speaker verification ===") | ||
96 | + print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result)) | ||
97 | + print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres)) | ||
98 | + | ||
99 | +def main(): | ||
100 | + | ||
101 | + log_dir = 'new_model4_merge' # Where the checkpoints are saved | ||
102 | + embedding_dir = 'enroll_embeddings4_merge' # Where embeddings are saved | ||
103 | + test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved | ||
104 | + | ||
105 | + # Settings | ||
106 | + use_cuda = True # Use cuda or not | ||
107 | + embedding_size = 128 # Dimension of speaker embeddings | ||
108 | + cp_num = 50 # Which checkpoint to use? | ||
109 | + n_classes = 348 # How many speakers in training data? | ||
110 | + test_frames = 100 # split each test utterance into 100-frame segments | ||
111 | + | ||
112 | + # Load model from checkpoint | ||
113 | + model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes) | ||
114 | + | ||
115 | + # Get the dataframe for test DB | ||
116 | + enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR) | ||
117 | + | ||
118 | + # Load enroll embeddings | ||
119 | + embeddings = load_enroll_embeddings(embedding_dir) | ||
120 | + | ||
121 | + """ Test speaker list | ||
122 | + '103F3021', '207F2088', '213F5100', '217F3038', '225M4062', | ||
123 | + '229M2031', '230M4087', '233F4013', '236M3043', '240M3063' | ||
124 | + """ | ||
125 | + | ||
126 | + # Set the true speaker | ||
127 | + enroll_speaker = '213F5100' | ||
128 | + | ||
129 | + # Set the claimed speaker | ||
130 | + test_speaker = '207F2088' | ||
131 | + | ||
132 | + # Threshold | ||
133 | + thres = 0.95 | ||
134 | + | ||
135 | + test_path = os.path.join(test_dir, test_speaker, 'test.p') | ||
136 | + | ||
137 | + # Perform the test | ||
138 | + perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres) | ||
139 | + | ||
140 | +if __name__ == '__main__': | ||
141 | + main() |
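Every verification script above uses a fixed threshold of 0.95. Below is a sketch of choosing a threshold empirically by sweeping over genuine (same-speaker) and impostor (different-speaker) cosine scores and picking the point closest to the equal error rate; the score arrays are made up for illustration.

import numpy as np

genuine = np.array([0.97, 0.96, 0.99, 0.93, 0.98])    # same-speaker trial scores
impostor = np.array([0.72, 0.88, 0.95, 0.81, 0.64])   # different-speaker trial scores

best_thres, best_gap = 0.5, 1.0
for thres in np.arange(0.50, 1.00, 0.01):
    far = (impostor > thres).mean()   # false acceptance rate
    frr = (genuine <= thres).mean()   # false rejection rate
    if abs(far - frr) < best_gap:     # closest to the equal error rate
        best_thres, best_gap = thres, abs(far - frr)
print('EER threshold ~ %0.2f' % best_thres)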
Speaker_Recognition/verification4_zeroth.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn.functional as F | ||
3 | +from torch.autograd import Variable | ||
4 | + | ||
5 | +import pandas as pd | ||
6 | +import math | ||
7 | +import os | ||
8 | +import configure as c | ||
9 | + | ||
10 | +from DB_wav_reader import read_feats_structure | ||
11 | +from SR_Dataset import read_MFB, ToTensorTestInput | ||
12 | +from model.model4 import background_resnet | ||
13 | + | ||
14 | +def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes): | ||
15 | + model = background_resnet(embedding_size=embedding_size, num_classes=n_classes) | ||
16 | + if use_cuda: | ||
17 | + model.cuda() | ||
18 | + print('=> loading checkpoint') | ||
19 | + # load the checkpoint saved during training | ||
20 | + checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth') | ||
21 | + # the state dict is loaded as-is (it was saved without a DataParallel 'module.' prefix) | ||
22 | + model.load_state_dict(checkpoint['state_dict']) | ||
23 | + model.eval() | ||
24 | + return model | ||
25 | + | ||
26 | +def split_enroll_and_test(dataroot_dir): | ||
27 | + DB_all = read_feats_structure(dataroot_dir) | ||
28 | + enroll_DB = pd.DataFrame() | ||
29 | + test_DB = pd.DataFrame() | ||
30 | + | ||
31 | + enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')] | ||
32 | + test_DB = DB_all[DB_all['filename'].str.contains('test.p')] | ||
33 | + | ||
34 | + # Reset the index | ||
35 | + enroll_DB = enroll_DB.reset_index(drop=True) | ||
36 | + test_DB = test_DB.reset_index(drop=True) | ||
37 | + return enroll_DB, test_DB | ||
38 | + | ||
39 | +def load_enroll_embeddings(embedding_dir): | ||
40 | + embeddings = {} | ||
41 | + for f in os.listdir(embedding_dir): | ||
42 | + spk = f.replace('.pth','') | ||
43 | + # Select the speakers who are in the 'enroll_spk_list' | ||
44 | + embedding_path = os.path.join(embedding_dir, f) | ||
45 | + tmp_embeddings = torch.load(embedding_path) | ||
46 | + embeddings[spk] = tmp_embeddings | ||
47 | + | ||
48 | + return embeddings | ||
49 | + | ||
50 | +def get_embeddings(use_cuda, filename, model, test_frames): | ||
51 | + input, label = read_MFB(filename) # input size:(n_frames, n_dims) | ||
52 | + | ||
53 | + tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames' | ||
54 | + activation = 0 | ||
55 | + with torch.no_grad(): | ||
56 | + for i in range(tot_segments): | ||
57 | + temp_input = input[i*test_frames:i*test_frames+test_frames] | ||
58 | + | ||
59 | + TT = ToTensorTestInput() | ||
60 | + temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames) | ||
61 | + | ||
62 | + if use_cuda: | ||
63 | + temp_input = temp_input.cuda() | ||
64 | + temp_activation,_ = model(temp_input) | ||
65 | + activation += torch.sum(temp_activation, dim=0, keepdim=True) | ||
66 | + | ||
67 | + activation = l2_norm(activation, 1) | ||
68 | + | ||
69 | + return activation | ||
70 | + | ||
71 | +def l2_norm(input, alpha): | ||
72 | + input_size = input.size() # size:(n_frames, dim) | ||
73 | + buffer = torch.pow(input, 2) # 2 denotes a squared operation. size:(n_frames, dim) | ||
74 | + normp = torch.sum(buffer, 1).add_(1e-10) # size:(n_frames) | ||
75 | + norm = torch.sqrt(normp) # size:(n_frames) | ||
76 | + _output = torch.div(input, norm.view(-1, 1).expand_as(input)) | ||
77 | + output = _output.view(input_size) | ||
78 | + # Multiply by alpha = 10 as suggested in https://arxiv.org/pdf/1703.09507.pdf | ||
79 | + output = output * alpha | ||
80 | + return output | ||
81 | + | ||
82 | +def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres): | ||
83 | + enroll_embedding = embeddings[enroll_speaker] | ||
84 | + test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames) | ||
85 | + | ||
86 | + score = F.cosine_similarity(test_embedding, enroll_embedding) | ||
87 | + score = score.item() # extract the scalar similarity score | ||
88 | + | ||
89 | + if score > thres: | ||
90 | + result = 'Accept' | ||
91 | + else: | ||
92 | + result = 'Reject' | ||
93 | + | ||
94 | + test_spk = test_filename.split('/')[-2].split('_')[0] | ||
95 | + print("\n=== Speaker verification ===") | ||
96 | + print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result)) | ||
97 | + print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres)) | ||
98 | + | ||
99 | +def main(): | ||
100 | + | ||
101 | + log_dir = 'new_model4_zeroth' # Where the checkpoints are saved | ||
102 | + embedding_dir = 'enroll_embeddings4_zeroth' # Where embeddings are saved | ||
103 | + test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved | ||
104 | + | ||
105 | + # Settings | ||
106 | + use_cuda = True # Use cuda or not | ||
107 | + embedding_size = 128 # Dimension of speaker embeddings | ||
108 | + cp_num = 30 # Which checkpoint to use? | ||
109 | + n_classes = 105 # How many speakers in training data? | ||
110 | + test_frames = 100 # split each test utterance into 100-frame segments | ||
111 | + | ||
112 | + # Load model from checkpoint | ||
113 | + model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes) | ||
114 | + | ||
115 | + # Get the dataframe for test DB | ||
116 | + enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR) | ||
117 | + | ||
118 | + # Load enroll embeddings | ||
119 | + embeddings = load_enroll_embeddings(embedding_dir) | ||
120 | + | ||
121 | + """ Test speaker list | ||
122 | + '103F3021', '207F2088', '213F5100', '217F3038', '225M4062', | ||
123 | + '229M2031', '230M4087', '233F4013', '236M3043', '240M3063' | ||
124 | + """ | ||
125 | + | ||
126 | + # Set the true speaker | ||
127 | + enroll_speaker = '777M7777' | ||
128 | + | ||
129 | + # Set the claimed speaker | ||
130 | + test_speaker = '103F3021' | ||
131 | + | ||
132 | + # Threshold | ||
133 | + thres = 0.95 | ||
134 | + | ||
135 | + test_path = os.path.join(test_dir, test_speaker, 'test.p') | ||
136 | + | ||
137 | + # Perform the test | ||
138 | + perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres) | ||
139 | + | ||
140 | +if __name__ == '__main__': | ||
141 | + main() |
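Each script verifies one hand-picked pair of speakers. Below is a sketch of looping the same perform_verification call over every utterance in the test_DB dataframe returned by split_enroll_and_test, assuming its 'filename' column as produced by read_feats_structure; verify_all is an illustrative helper, not part of the repository.

def verify_all(use_cuda, model, embeddings, enroll_speaker, test_DB,
               test_frames=100, thres=0.95):
    # Score every test utterance against one enrolled speaker.
    for _, row in test_DB.iterrows():
        perform_verification(use_cuda, model, embeddings, enroll_speaker,
                             row['filename'], test_frames, thres)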
Speaker_Recognition/verification5.py
0 → 100644
1 | +import torch | ||
2 | +import torch.nn.functional as F | ||
3 | +from torch.autograd import Variable | ||
4 | + | ||
5 | +import pandas as pd | ||
6 | +import math | ||
7 | +import os | ||
8 | +import configure as c | ||
9 | + | ||
10 | +from DB_wav_reader import read_feats_structure | ||
11 | +from SR_Dataset import read_MFB, ToTensorTestInput | ||
12 | +from model.model5 import background_resnet | ||
13 | + | ||
14 | +def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes): | ||
15 | + model = background_resnet(embedding_size=embedding_size, num_classes=n_classes) | ||
16 | + if use_cuda: | ||
17 | + model.cuda() | ||
18 | + print('=> loading checkpoint') | ||
19 | + # load the checkpoint saved during training | ||
20 | + checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth') | ||
21 | + # the state dict is loaded as-is (it was saved without a DataParallel 'module.' prefix) | ||
22 | + model.load_state_dict(checkpoint['state_dict']) | ||
23 | + model.eval() | ||
24 | + return model | ||
25 | + | ||
26 | +def split_enroll_and_test(dataroot_dir): | ||
27 | + DB_all = read_feats_structure(dataroot_dir) | ||
28 | + enroll_DB = pd.DataFrame() | ||
29 | + test_DB = pd.DataFrame() | ||
30 | + | ||
31 | + enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')] | ||
32 | + test_DB = DB_all[DB_all['filename'].str.contains('test.p')] | ||
33 | + | ||
34 | + # Reset the index | ||
35 | + enroll_DB = enroll_DB.reset_index(drop=True) | ||
36 | + test_DB = test_DB.reset_index(drop=True) | ||
37 | + return enroll_DB, test_DB | ||
38 | + | ||
39 | +def load_enroll_embeddings(embedding_dir): | ||
40 | + embeddings = {} | ||
41 | + for f in os.listdir(embedding_dir): | ||
42 | + spk = f.replace('.pth','') | ||
43 | + # Select the speakers who are in the 'enroll_spk_list' | ||
44 | + embedding_path = os.path.join(embedding_dir, f) | ||
45 | + tmp_embeddings = torch.load(embedding_path) | ||
46 | + embeddings[spk] = tmp_embeddings | ||
47 | + | ||
48 | + return embeddings | ||
49 | + | ||
50 | +def get_embeddings(use_cuda, filename, model, test_frames): | ||
51 | + input, label = read_MFB(filename) # input size:(n_frames, n_dims) | ||
52 | + | ||
53 | + tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames' | ||
54 | + activation = 0 | ||
55 | + with torch.no_grad(): | ||
56 | + for i in range(tot_segments): | ||
57 | + temp_input = input[i*test_frames:i*test_frames+test_frames] | ||
58 | + | ||
59 | + TT = ToTensorTestInput() | ||
60 | + temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames) | ||
61 | + | ||
62 | + if use_cuda: | ||
63 | + temp_input = temp_input.cuda() | ||
64 | + temp_activation,_ = model(temp_input) | ||
65 | + activation += torch.sum(temp_activation, dim=0, keepdim=True) | ||
66 | + | ||
67 | + activation = l2_norm(activation, 1) | ||
68 | + | ||
69 | + return activation | ||
70 | + | ||
71 | +def l2_norm(input, alpha): | ||
72 | + input_size = input.size() # size:(n_frames, dim) | ||
73 | + buffer = torch.pow(input, 2) # 2 denotes a squared operation. size:(n_frames, dim) | ||
74 | + normp = torch.sum(buffer, 1).add_(1e-10) # size:(n_frames) | ||
75 | + norm = torch.sqrt(normp) # size:(n_frames) | ||
76 | + _output = torch.div(input, norm.view(-1, 1).expand_as(input)) | ||
77 | + output = _output.view(input_size) | ||
78 | + # Multiply by alpha = 10 as suggested in https://arxiv.org/pdf/1703.09507.pdf | ||
79 | + output = output * alpha | ||
80 | + return output | ||
81 | + | ||
82 | +def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres): | ||
83 | + enroll_embedding = embeddings[enroll_speaker] | ||
84 | + test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames) | ||
85 | + | ||
86 | + score = F.cosine_similarity(test_embedding, enroll_embedding) | ||
87 | + score = score.item() # extract the scalar similarity score | ||
88 | + | ||
89 | + if score > thres: | ||
90 | + result = 'Accept' | ||
91 | + else: | ||
92 | + result = 'Reject' | ||
93 | + | ||
94 | + test_spk = test_filename.split('/')[-2].split('_')[0] | ||
95 | + print("\n=== Speaker verification ===") | ||
96 | + print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result)) | ||
97 | + print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres)) | ||
98 | + | ||
99 | +def main(): | ||
100 | + | ||
101 | + log_dir = 'new_model5' # Where the checkpoints are saved | ||
102 | + embedding_dir = 'enroll_embeddings5' # Where embeddings are saved | ||
103 | + test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved | ||
104 | + | ||
105 | + # Settings | ||
106 | + use_cuda = True # Use cuda or not | ||
107 | + embedding_size = 128 # Dimension of speaker embeddings | ||
108 | + cp_num = 30 # Which checkpoint to use? | ||
109 | + n_classes = 241 # How many speakers in training data? | ||
110 | + test_frames = 100 # split each test utterance into 100-frame segments | ||
111 | + | ||
112 | + # Load model from checkpoint | ||
113 | + model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes) | ||
114 | + | ||
115 | + # Get the dataframe for test DB | ||
116 | + enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR) | ||
117 | + | ||
118 | + # Load enroll embeddings | ||
119 | + embeddings = load_enroll_embeddings(embedding_dir) | ||
120 | + | ||
121 | + """ Test speaker list | ||
122 | + '103F3021', '207F2088', '213F5100', '217F3038', '225M4062', | ||
123 | + '229M2031', '230M4087', '233F4013', '236M3043', '240M3063' | ||
124 | + """ | ||
125 | + | ||
126 | + # Set the true speaker | ||
127 | + enroll_speaker = '777M7777' | ||
128 | + | ||
129 | + # Set the claimed speaker | ||
130 | + test_speaker = 'sunghwan1' | ||
131 | + | ||
132 | + # Threshold | ||
133 | + thres = 0.95 | ||
134 | + | ||
135 | + test_path = os.path.join(test_dir, test_speaker, 'test.p') | ||
136 | + | ||
137 | + # Perform the test | ||
138 | + perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres) | ||
139 | + | ||
140 | +if __name__ == '__main__': | ||
141 | + main() |
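One property worth noting about l2_norm and the cosine scoring above: F.cosine_similarity normalizes both inputs, so the alpha scaling in l2_norm has no effect on the verification score, and for unit-norm vectors the score reduces to a plain dot product. A small sketch demonstrating both facts on random embeddings:

import torch
import torch.nn.functional as F

a = F.normalize(torch.randn(1, 128), p=2, dim=1)  # unit-norm embedding
b = F.normalize(torch.randn(1, 128), p=2, dim=1)

print(F.cosine_similarity(a, b))        # reference score
print(F.cosine_similarity(10 * a, b))   # identical: the scaling cancels out
print((a * b).sum(dim=1))               # dot product of unit vectors, same value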