김건

Commit the ResNet-34 (+ extra layer) version and the ResNet-50 version

Speaker_Recognition @ df38711f
1 +Subproject commit df38711f36cfb15ee578d14a70d0141d1d0a8134
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model3 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: Move the model to the GPU when true.
        log_dir: Directory holding the ``checkpoint_N.pth`` files.
        cp_num: Number ``N`` of the checkpoint to load.
        embedding_size: Dimension of the speaker embedding layer.
        n_classes: Number of output classes of the classifier.

    Returns:
        The model, switched to evaluation mode, with the checkpoint's
        ``state_dict`` loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Without CUDA, remap GPU-saved tensors to the CPU; otherwise torch.load
    # tries to restore them onto a device that is not available.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    state_dict = checkpoint['state_dict']
    # A checkpoint saved through nn.DataParallel prefixes every key with
    # `module.`; drop the prefix so the keys match the bare model.
    prefix = 'module.'
    if state_dict and all(key.startswith(prefix) for key in state_dict):
        state_dict = {key[len(prefix):]: value
                      for key, value in state_dict.items()}
    model.load_state_dict(state_dict)
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature table found under ``dataroot_dir`` into two tables.

    Rows whose ``filename`` contains the literal text ``enroll.p`` form the
    enrollment table; rows containing ``test.p`` form the test table.

    Args:
        dataroot_dir: Directory handed to ``read_feats_structure``.

    Returns:
        Tuple ``(enroll_DB, test_DB)``, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False: the '.' has to match a literal dot, not "any character".
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
38 +
def load_enroll_embeddings(embedding_dir):
    """Load every saved speaker embedding stored in ``embedding_dir``.

    Args:
        embedding_dir: Directory with one serialized embedding per speaker.

    Returns:
        Dict mapping the speaker id (file name without a trailing ``.pth``)
        to whatever ``torch.load`` returns for that file.
    """
    suffix = '.pth'
    embeddings = {}
    for entry in os.listdir(embedding_dir):
        embedding_path = os.path.join(embedding_dir, entry)
        if not os.path.isfile(embedding_path):
            # Sub-directories cannot be deserialized; skip them.
            continue
        # Strip only the trailing extension so that an id which itself
        # contains '.pth' is not mangled.
        spk = entry[:-len(suffix)] if entry.endswith(suffix) else entry
        embeddings[spk] = torch.load(embedding_path)

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for the utterance in ``filename``.

    The features are cut into consecutive chunks of at most ``test_frames``
    frames. The model's first output is summed over all chunks and the sum
    is normalised with ``l2_norm``.

    Args:
        use_cuda: Move each chunk to the GPU before the forward pass.
        filename: Feature file handed to ``read_MFB``.
        model: Network whose call returns ``(embedding, class_scores)``.
        test_frames: Maximum number of frames per chunk.

    Returns:
        The normalised embedding; its leading dimension has size 1.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    # Per the original note the features are (n_frames, n_dims); the label
    # returned alongside them is not needed here.
    features, _ = read_MFB(filename)
    if len(features) == 0:
        raise ValueError('no feature frames found in %s' % filename)

    to_tensor = ToTensorTestInput()  # built once, reused for every chunk
    activation = 0
    with torch.no_grad():
        for start in range(0, len(features), test_frames):
            # Original note: the transform yields (1, 1, n_dims, n_frames).
            segment = to_tensor(features[start:start + test_frames])
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    return l2_norm(activation, 1)
70 +
def l2_norm(input, alpha):
    """Scale every row of ``input`` to L2 norm ``alpha``.

    Args:
        input: 2-D tensor; each row is normalised independently.
        alpha: Factor applied after normalisation. The original comment cites
            https://arxiv.org/pdf/1703.09507.pdf for ``alpha = 10``.

    Returns:
        Tensor with the same size as ``input``.
    """
    # The 1e-10 offset keeps the division finite for all-zero rows.
    row_norm = input.pow(2).sum(1).add(1e-10).sqrt()
    unit_rows = input.div(row_norm.view(-1, 1).expand_as(input))
    return unit_rows.view(input.size()) * alpha
81 +
def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    """Pick the enrolled speaker whose embedding best matches a test file.

    The test embedding is compared with the enrolled embedding of every
    speaker in ``spk_list`` by cosine similarity; the highest score wins.
    The outcome is printed together with the true speaker, which is read
    from the name of the directory containing ``test_filename``.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        embeddings: Dict mapping speaker id to enrolled embedding tensor.
        test_filename: Path of the test feature file.
        test_frames: Forwarded to ``get_embeddings``.
        spk_list: Speaker ids to score against.

    Returns:
        The id of the best-scoring speaker, or ``None`` if no score was
        comparable.

    Raises:
        KeyError: If a speaker in ``spk_list`` has no entry in ``embeddings``.
    """
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = float('-inf')
    best_spk = None
    for spk in spk_list:
        # Enrolled embeddings may have been saved from another device; bring
        # them next to the test embedding before comparing.
        enrolled = embeddings[spk].to(test_embedding.device)
        score = F.cosine_similarity(test_embedding, enrolled).item()
        if score > max_score:
            max_score = score
            best_spk = spk

    # Use os.path so the parent directory is found regardless of which
    # separator the path was joined with.
    true_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk
97 +
def main():
    """Identify one hard-coded test speaker with the ``model3`` checkpoint."""
    log_dir = 'new_model3'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings3'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    use_cuda = torch.cuda.is_available()  # Use the GPU only when one exists
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 11  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB
    # NOTE(review): neither table is used below - confirm whether this call
    # is still needed.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Speakers to score against (the last two ids were appended to the
    # original ten-speaker test list).
    spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
                '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
                '777M7777', '778M8777']

    # Set the test speaker
    test_speaker = '233F4013'

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
135 +
136 +if __name__ == '__main__':
137 + main()
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model4 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: Move the model to the GPU when true.
        log_dir: Directory holding the ``checkpoint_N.pth`` files.
        cp_num: Number ``N`` of the checkpoint to load.
        embedding_size: Dimension of the speaker embedding layer.
        n_classes: Number of output classes of the classifier.

    Returns:
        The model, switched to evaluation mode, with the checkpoint's
        ``state_dict`` loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Without CUDA, remap GPU-saved tensors to the CPU; otherwise torch.load
    # tries to restore them onto a device that is not available.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    state_dict = checkpoint['state_dict']
    # A checkpoint saved through nn.DataParallel prefixes every key with
    # `module.`; drop the prefix so the keys match the bare model.
    prefix = 'module.'
    if state_dict and all(key.startswith(prefix) for key in state_dict):
        state_dict = {key[len(prefix):]: value
                      for key, value in state_dict.items()}
    model.load_state_dict(state_dict)
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature table found under ``dataroot_dir`` into two tables.

    Rows whose ``filename`` contains the literal text ``enroll.p`` form the
    enrollment table; rows containing ``test.p`` form the test table.

    Args:
        dataroot_dir: Directory handed to ``read_feats_structure``.

    Returns:
        Tuple ``(enroll_DB, test_DB)``, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False: the '.' has to match a literal dot, not "any character".
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
38 +
def load_enroll_embeddings(embedding_dir):
    """Load every saved speaker embedding stored in ``embedding_dir``.

    Args:
        embedding_dir: Directory with one serialized embedding per speaker.

    Returns:
        Dict mapping the speaker id (file name without a trailing ``.pth``)
        to whatever ``torch.load`` returns for that file.
    """
    suffix = '.pth'
    embeddings = {}
    for entry in os.listdir(embedding_dir):
        embedding_path = os.path.join(embedding_dir, entry)
        if not os.path.isfile(embedding_path):
            # Sub-directories cannot be deserialized; skip them.
            continue
        # Strip only the trailing extension so that an id which itself
        # contains '.pth' is not mangled.
        spk = entry[:-len(suffix)] if entry.endswith(suffix) else entry
        embeddings[spk] = torch.load(embedding_path)

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for the utterance in ``filename``.

    The features are cut into consecutive chunks of at most ``test_frames``
    frames. The model's first output is summed over all chunks and the sum
    is normalised with ``l2_norm``.

    Args:
        use_cuda: Move each chunk to the GPU before the forward pass.
        filename: Feature file handed to ``read_MFB``.
        model: Network whose call returns ``(embedding, class_scores)``.
        test_frames: Maximum number of frames per chunk.

    Returns:
        The normalised embedding; its leading dimension has size 1.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    # Per the original note the features are (n_frames, n_dims); the label
    # returned alongside them is not needed here.
    features, _ = read_MFB(filename)
    if len(features) == 0:
        raise ValueError('no feature frames found in %s' % filename)

    to_tensor = ToTensorTestInput()  # built once, reused for every chunk
    activation = 0
    with torch.no_grad():
        for start in range(0, len(features), test_frames):
            # Original note: the transform yields (1, 1, n_dims, n_frames).
            segment = to_tensor(features[start:start + test_frames])
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    return l2_norm(activation, 1)
70 +
def l2_norm(input, alpha):
    """Normalise each row of ``input`` to unit L2 length, then scale it.

    Args:
        input: 2-D tensor whose rows are normalised one by one.
        alpha: Scale applied to the unit-length rows. The original comment
            cites https://arxiv.org/pdf/1703.09507.pdf for ``alpha = 10``.

    Returns:
        Tensor of the same size as ``input``.
    """
    original_size = input.size()
    squared_sum = torch.sum(input * input, 1)
    # A tiny offset avoids dividing by zero for all-zero rows.
    row_norm = torch.sqrt(squared_sum + 1e-10)
    scaled = input / row_norm.view(-1, 1).expand_as(input)
    return scaled.view(original_size) * alpha
81 +
def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    """Pick the enrolled speaker whose embedding best matches a test file.

    The test embedding is compared with the enrolled embedding of every
    speaker in ``spk_list`` by cosine similarity; the highest score wins.
    The outcome is printed together with the true speaker, which is read
    from the name of the directory containing ``test_filename``.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        embeddings: Dict mapping speaker id to enrolled embedding tensor.
        test_filename: Path of the test feature file.
        test_frames: Forwarded to ``get_embeddings``.
        spk_list: Speaker ids to score against.

    Returns:
        The id of the best-scoring speaker, or ``None`` if no score was
        comparable.

    Raises:
        KeyError: If a speaker in ``spk_list`` has no entry in ``embeddings``.
    """
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = float('-inf')
    best_spk = None
    for spk in spk_list:
        # Enrolled embeddings may have been saved from another device; bring
        # them next to the test embedding before comparing.
        enrolled = embeddings[spk].to(test_embedding.device)
        score = F.cosine_similarity(test_embedding, enrolled).item()
        if score > max_score:
            max_score = score
            best_spk = spk

    # Use os.path so the parent directory is found regardless of which
    # separator the path was joined with.
    true_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk
97 +
def main():
    """Identify one hard-coded test speaker with the ``model4`` checkpoint."""
    log_dir = 'new_model4'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings4'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    use_cuda = torch.cuda.is_available()  # Use the GPU only when one exists
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 25  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB
    # NOTE(review): neither table is used below - confirm whether this call
    # is still needed.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Speakers to score against (the last two ids were appended to the
    # original ten-speaker test list).
    spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
                '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
                '777M7777', '778M8777']

    # Set the test speaker
    test_speaker = '207F2088'

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
135 +
136 +if __name__ == '__main__':
137 + main()
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model5 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: Move the model to the GPU when true.
        log_dir: Directory holding the ``checkpoint_N.pth`` files.
        cp_num: Number ``N`` of the checkpoint to load.
        embedding_size: Dimension of the speaker embedding layer.
        n_classes: Number of output classes of the classifier.

    Returns:
        The model, switched to evaluation mode, with the checkpoint's
        ``state_dict`` loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Without CUDA, remap GPU-saved tensors to the CPU; otherwise torch.load
    # tries to restore them onto a device that is not available.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    state_dict = checkpoint['state_dict']
    # A checkpoint saved through nn.DataParallel prefixes every key with
    # `module.`; drop the prefix so the keys match the bare model.
    prefix = 'module.'
    if state_dict and all(key.startswith(prefix) for key in state_dict):
        state_dict = {key[len(prefix):]: value
                      for key, value in state_dict.items()}
    model.load_state_dict(state_dict)
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature table found under ``dataroot_dir`` into two tables.

    Rows whose ``filename`` contains the literal text ``enroll.p`` form the
    enrollment table; rows containing ``test.p`` form the test table.

    Args:
        dataroot_dir: Directory handed to ``read_feats_structure``.

    Returns:
        Tuple ``(enroll_DB, test_DB)``, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False: the '.' has to match a literal dot, not "any character".
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
38 +
def load_enroll_embeddings(embedding_dir):
    """Load every saved speaker embedding stored in ``embedding_dir``.

    Args:
        embedding_dir: Directory with one serialized embedding per speaker.

    Returns:
        Dict mapping the speaker id (file name without a trailing ``.pth``)
        to whatever ``torch.load`` returns for that file.
    """
    suffix = '.pth'
    embeddings = {}
    for entry in os.listdir(embedding_dir):
        embedding_path = os.path.join(embedding_dir, entry)
        if not os.path.isfile(embedding_path):
            # Sub-directories cannot be deserialized; skip them.
            continue
        # Strip only the trailing extension so that an id which itself
        # contains '.pth' is not mangled.
        spk = entry[:-len(suffix)] if entry.endswith(suffix) else entry
        embeddings[spk] = torch.load(embedding_path)

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for the utterance in ``filename``.

    The features are cut into consecutive chunks of at most ``test_frames``
    frames. The model's first output is summed over all chunks and the sum
    is normalised with ``l2_norm``.

    Args:
        use_cuda: Move each chunk to the GPU before the forward pass.
        filename: Feature file handed to ``read_MFB``.
        model: Network whose call returns ``(embedding, class_scores)``.
        test_frames: Maximum number of frames per chunk.

    Returns:
        The normalised embedding; its leading dimension has size 1.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    # Per the original note the features are (n_frames, n_dims); the label
    # returned alongside them is not needed here.
    features, _ = read_MFB(filename)
    if len(features) == 0:
        raise ValueError('no feature frames found in %s' % filename)

    to_tensor = ToTensorTestInput()  # built once, reused for every chunk
    activation = 0
    with torch.no_grad():
        for start in range(0, len(features), test_frames):
            # Original note: the transform yields (1, 1, n_dims, n_frames).
            segment = to_tensor(features[start:start + test_frames])
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    return l2_norm(activation, 1)
70 +
def l2_norm(input, alpha):
    """Return ``input`` with every row rescaled to L2 norm ``alpha``.

    Args:
        input: 2-D tensor; normalisation runs along dimension 1.
        alpha: Multiplier applied after normalisation. The original comment
            cites https://arxiv.org/pdf/1703.09507.pdf for ``alpha = 10``.

    Returns:
        Tensor with the size of ``input``.
    """
    size = input.size()
    # Squared row length, nudged away from zero so the division stays finite.
    squared_length = torch.pow(input, 2).sum(1) + 1e-10
    divisor = squared_length.sqrt().view(-1, 1).expand_as(input)
    normalised = torch.div(input, divisor).view(size)
    return normalised * alpha
81 +
def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    """Pick the enrolled speaker whose embedding best matches a test file.

    The test embedding is compared with the enrolled embedding of every
    speaker in ``spk_list`` by cosine similarity; the highest score wins.
    The outcome is printed together with the true speaker, which is read
    from the name of the directory containing ``test_filename``.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        embeddings: Dict mapping speaker id to enrolled embedding tensor.
        test_filename: Path of the test feature file.
        test_frames: Forwarded to ``get_embeddings``.
        spk_list: Speaker ids to score against.

    Returns:
        The id of the best-scoring speaker, or ``None`` if no score was
        comparable.

    Raises:
        KeyError: If a speaker in ``spk_list`` has no entry in ``embeddings``.
    """
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = float('-inf')
    best_spk = None
    for spk in spk_list:
        # Enrolled embeddings may have been saved from another device; bring
        # them next to the test embedding before comparing.
        enrolled = embeddings[spk].to(test_embedding.device)
        score = F.cosine_similarity(test_embedding, enrolled).item()
        if score > max_score:
            max_score = score
            best_spk = spk

    # Use os.path so the parent directory is found regardless of which
    # separator the path was joined with.
    true_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk
97 +
def main():
    """Identify one hard-coded test speaker with the ``model5`` checkpoint."""
    log_dir = 'new_model5'  # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings5'  # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Settings
    use_cuda = torch.cuda.is_available()  # Use the GPU only when one exists
    embedding_size = 128  # Dimension of speaker embeddings
    cp_num = 30  # Which checkpoint to use?
    n_classes = 241  # How many speakers in training data?
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB
    # NOTE(review): neither table is used below - confirm whether this call
    # is still needed.
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Speakers to score against (the last two ids were appended to the
    # original ten-speaker test list).
    spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
                '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
                '777M7777', '778M8777']

    # Set the test speaker
    test_speaker = '207F2088'

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
135 +
136 +if __name__ == '__main__':
137 + main()
1 +import torch
2 +import torch.nn as nn
3 +import torch.nn.functional as F
4 +from torch.autograd import Function
5 +import model.resnet1 as resnet
6 +
7 +
class background_resnet(nn.Module):
    """ResNet trunk followed by an embedding layer and a classifier.

    ``forward`` returns the embedding (the raw output of ``fc0``) together
    with the class scores computed from it.
    """

    # Backbone names accepted by the constructor; each is the name of a
    # factory function in the project's ``resnet`` module.
    _BACKBONES = ('resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152')

    def __init__(self, embedding_size, num_classes, backbone='resnet18'):
        """Create the trunk and the two linear heads.

        Args:
            embedding_size: Output width of the embedding layer ``fc0``.
            num_classes: Output width of the classifier ``last``.
            backbone: Name of the trunk; one of ``_BACKBONES``.

        Raises:
            RuntimeError: If ``backbone`` is not a supported name.
        """
        super(background_resnet, self).__init__()
        self.backbone = backbone
        if backbone not in self._BACKBONES:
            raise RuntimeError('unknown backbone: {}'.format(backbone))
        # No pretrained weights are requested; the trunk starts untrained.
        self.pretrained = getattr(resnet, backbone)(pretrained=False)

        # fc0 expects 256 pooled channels from the trunk's last stage.
        self.fc0 = nn.Linear(256, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        """Return ``(spk_embedding, class_scores)`` for a batch ``x``.

        Per the original note, ``x`` is minibatch x 1 x 40 x 40.
        """
        trunk = self.pretrained
        stages = (trunk.conv1, trunk.bn1, trunk.relu, trunk.layer1,
                  trunk.layer2, trunk.layer3, trunk.layer4, trunk.layer5)
        for stage in stages:
            x = stage(x)

        # Global average pooling leaves one value per channel; flatten it
        # to (n_batch, n_channels) for the linear layer.
        pooled = F.adaptive_avg_pool2d(x, 1).view(x.size(0), -1)
        spk_embedding = self.fc0(pooled)
        scores = self.last(F.relu(self.bn0(spk_embedding)))

        return spk_embedding, scores
...\ No newline at end of file ...\ No newline at end of file
1 +import torch
2 +import torch.nn as nn
3 +import torch.nn.functional as F
4 +from torch.autograd import Function
5 +import model.resnet1 as resnet
6 +
7 +
class background_resnet(nn.Module):
    """ResNet trunk (default ``resnet34``) with embedding and classifier heads.

    ``forward`` returns the embedding (the raw output of ``fc0``) together
    with the class scores computed from it.
    """

    # Backbone names accepted by the constructor; each is the name of a
    # factory function in the project's ``resnet`` module.
    _BACKBONES = ('resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152')

    def __init__(self, embedding_size, num_classes, backbone='resnet34'):
        """Create the trunk and the two linear heads.

        Args:
            embedding_size: Output width of the embedding layer ``fc0``.
            num_classes: Output width of the classifier ``last``.
            backbone: Name of the trunk; one of ``_BACKBONES``.

        Raises:
            RuntimeError: If ``backbone`` is not a supported name.
        """
        super(background_resnet, self).__init__()
        self.backbone = backbone
        if backbone not in self._BACKBONES:
            raise RuntimeError('unknown backbone: {}'.format(backbone))
        # No pretrained weights are requested; the trunk starts untrained.
        self.pretrained = getattr(resnet, backbone)(pretrained=False)

        # Original author's note: changing 256 to e.g. 512 allows the
        # (unmodified) resnet to be used.
        self.fc0 = nn.Linear(256, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        """Return ``(spk_embedding, class_scores)`` for a batch ``x``.

        Per the original note, ``x`` is minibatch x 1 x 40 x 40.
        """
        trunk = self.pretrained
        stages = (trunk.conv1, trunk.bn1, trunk.relu, trunk.layer1,
                  trunk.layer2, trunk.layer3, trunk.layer4, trunk.layer5)
        for stage in stages:
            x = stage(x)

        # Global average pooling leaves one value per channel; flatten it
        # to (n_batch, n_channels) for the linear layer.
        pooled = F.adaptive_avg_pool2d(x, 1).view(x.size(0), -1)
        spk_embedding = self.fc0(pooled)
        scores = self.last(F.relu(self.bn0(spk_embedding)))

        return spk_embedding, scores
...\ No newline at end of file ...\ No newline at end of file
1 +import torch
2 +import torch.nn as nn
3 +import torch.nn.functional as F
4 +from torch.autograd import Function
5 +import model.resnet1 as resnet
6 +
7 +
class background_resnet(nn.Module):
    """ResNet trunk (default ``resnet50``) with embedding and classifier heads.

    ``forward`` returns the embedding (the raw output of ``fc0``) together
    with the class scores computed from it.
    """

    # Backbone names accepted by the constructor; each is the name of a
    # factory function in the project's ``resnet`` module.
    _BACKBONES = ('resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152')

    def __init__(self, embedding_size, num_classes, backbone='resnet50'):
        """Create the trunk and the two linear heads.

        Args:
            embedding_size: Output width of the embedding layer ``fc0``.
            num_classes: Output width of the classifier ``last``.
            backbone: Name of the trunk; one of ``_BACKBONES``.

        Raises:
            RuntimeError: If ``backbone`` is not a supported name.
        """
        super(background_resnet, self).__init__()
        self.backbone = backbone
        if backbone not in self._BACKBONES:
            raise RuntimeError('unknown backbone: {}'.format(backbone))
        # No pretrained weights are requested; the trunk starts untrained.
        self.pretrained = getattr(resnet, backbone)(pretrained=False)

        # Original author's note: changing the input width to e.g. 512
        # allows the (unmodified) resnet to be used.
        self.fc0 = nn.Linear(512, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        """Return ``(spk_embedding, class_scores)`` for a batch ``x``.

        Per the original note, ``x`` is minibatch x 1 x 40 x 40.
        """
        trunk = self.pretrained
        stages = (trunk.conv1, trunk.bn1, trunk.relu, trunk.layer1,
                  trunk.layer2, trunk.layer3, trunk.layer4)
        for stage in stages:
            x = stage(x)

        # Global average pooling leaves one value per channel; flatten it
        # to (n_batch, n_channels) for the linear layer.
        pooled = F.adaptive_avg_pool2d(x, 1).view(x.size(0), -1)
        spk_embedding = self.fc0(pooled)
        scores = self.last(F.relu(self.bn0(spk_embedding)))

        return spk_embedding, scores
...\ No newline at end of file ...\ No newline at end of file
1 +import torch
2 +import torch.nn as nn
3 +import torch.optim as optim
4 +import torchvision.transforms as transforms
5 +
6 +import time
7 +import os
8 +import numpy as np
9 +import configure as c
10 +import pandas as pd
11 +from DB_wav_reader import read_feats_structure
12 +from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
13 +from model.model3 import background_resnet
14 +import matplotlib as mpl
15 +mpl.use('Agg')
16 +import matplotlib.pyplot as plt
17 +
18 +import pandas as pd
def load_dataset(val_ratio):
    """Build the training and validation datasets.

    Args:
        val_ratio: Percentage of the utterances reserved for validation
            (forwarded to ``split_train_dev``).

    Returns:
        Tuple ``(train_dataset, valid_dataset, n_classes)`` where
        ``n_classes`` is the number of distinct speakers in the training
        split.
    """
    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)

    # Training samples are truncated and converted to tensors; validation
    # samples only go through the dev-input tensor conversion.
    train_transform = transforms.Compose([TruncatedInputfromMFB(), ToTensorInput()])
    valid_transform = ToTensorDevInput()

    # Class indices follow the sorted speaker ids of the training split.
    speaker_list = sorted(set(train_DB['speaker_id']))
    spk_to_idx = dict(zip(speaker_list, range(len(speaker_list))))

    train_dataset = DvectorDataset(DB=train_DB, loader=read_MFB,
                                   transform=train_transform, spk_to_idx=spk_to_idx)
    valid_dataset = DvectorDataset(DB=valid_DB, loader=read_MFB,
                                   transform=valid_transform, spk_to_idx=spk_to_idx)

    return train_dataset, valid_dataset, len(speaker_list)
43 +
def split_train_dev(train_feat_dir, valid_ratio):
    """Randomly split the feature table into training and validation parts.

    Args:
        train_feat_dir: Directory handed to ``read_feats_structure``.
        valid_ratio: Percentage (0-100) of rows that go to validation.

    Returns:
        Tuple ``(train_DB, valid_DB)``, each re-indexed from 0.
    """
    full_DB = read_feats_structure(train_feat_dir)
    total_len = len(full_DB)
    valid_len = int(total_len * valid_ratio / 100.)
    train_len = total_len - valid_len

    # Shuffle all rows, then cut: the head trains, the tail validates.
    shuffled = full_DB.sample(frac=1).reset_index(drop=True)
    train_DB = shuffled.iloc[:train_len].reset_index(drop=True)
    valid_DB = shuffled.iloc[train_len:].reset_index(drop=True)

    report = (('\nTraining set %d utts (%0.1f%%)', train_len),
              ('Validation set %d utts (%0.1f%%)', valid_len))
    for template, count in report:
        print(template % (count, (count / total_len) * 100))
    print('Total %d utts' % (total_len))

    return train_DB, valid_DB
61 +
def main():
    """Train the ``model3`` speaker classifier and checkpoint every epoch.

    Each epoch runs a training pass and a validation pass, steps the
    plateau scheduler on the validation loss and saves a checkpoint to
    ``new_model3``. After the last epoch the loss curves are plotted.
    """
    # Set hyperparameters
    use_cuda = torch.cuda.is_available()  # use the GPU only when one exists
    val_ratio = 10  # Percentage of validation set
    embedding_size = 128
    start = 1  # Start epoch
    n_epochs = 30  # How many epochs?
    end = start + n_epochs  # Last epoch

    lr = 1e-1  # Initial learning rate
    wd = 1e-4  # Weight decay (L2 penalty)
    optimizer_type = 'sgd'  # ex) sgd, adam, adagrad

    batch_size = 64  # Batch size for training
    valid_batch_size = 16  # Batch size for validation
    use_shuffle = True  # Shuffle for training or not

    # Load dataset
    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)

    # print the experiment configuration
    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))

    log_dir = 'new_model3'  # where to save checkpoints
    os.makedirs(log_dir, exist_ok=True)

    # instantiate model and initialize weights
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)

    if use_cuda:
        model.cuda()

    # define loss function (criterion), optimizer and scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = create_optimizer(optimizer_type, model, lr, wd)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=use_shuffle)
    # Validation batches go through the project's padding collate function.
    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                               batch_size=valid_batch_size,
                                               shuffle=False,
                                               collate_fn=collate_fn_feat_padded)

    # average training / validation loss per epoch
    avg_train_losses = []
    avg_valid_losses = []

    for epoch in range(start, end):
        # train for one epoch
        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)

        # evaluate on validation set
        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)

        # The scheduler counts epochs itself; passing the epoch explicitly
        # is deprecated.
        scheduler.step(valid_loss)

        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)

        # do checkpointing
        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()},
                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))

    # find position of lowest validation loss
    minposs = avg_valid_losses.index(min(avg_valid_losses)) + 1
    print('Lowest validation loss at epoch %d' % minposs)

    # visualize the loss as the network trained
    visualize_the_losses(avg_train_losses, avg_valid_losses)
139 +
140 +
def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
    """Train ``model`` for one epoch.

    Args:
        train_loader: Iterable yielding ``(inputs, targets)`` batches.
        model: Network whose call returns ``(embedding, class_scores)``.
        criterion: Loss applied to ``(class_scores, targets)``.
        optimizer: Optimizer stepped once per batch.
        use_cuda: Move each batch to the GPU before the forward pass.
        epoch: Epoch number, used only in the progress log.
        n_classes: Unused; kept so existing callers keep working.

    Returns:
        The sample-weighted mean training loss of the epoch.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    train_acc = AverageMeter()

    log_interval = 84
    # switch to train mode
    model.train()

    end = time.time()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        targets = targets.view(-1)  # flatten to (batch size,)
        current_sample = inputs.size(0)  # batch size

        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        _, output = model(inputs)  # class scores, (batch size, #classes)

        # Feed the meter the accuracy of THIS batch: its sample-weighted
        # average is then the accuracy over all samples seen so far.
        # Feeding it the cumulative accuracy would average running averages.
        predictions = torch.max(output, 1)[1].long().view(targets.size())
        n_correct = (predictions == targets).sum().item()
        train_acc.update(100. * n_correct / current_sample, current_sample)

        loss = criterion(output, targets)
        losses.update(loss.item(), current_sample)

        # compute gradient and do the optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % log_interval == 0:
            print(
                'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Loss {loss.avg:.4f}\t'
                'Acc {train_acc.avg:.4f}'.format(
                    epoch, batch_idx * len(inputs), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    batch_time=batch_time, loss=losses, train_acc=train_acc))
    return losses.avg
191 +
def validate(val_loader, model, criterion, use_cuda, epoch):
    """Evaluate ``model`` on the validation set without tracking gradients.

    Args:
        val_loader: Iterable yielding ``(inputs, targets)`` batches.
        model: Network whose call returns ``(embedding, class_scores)``.
        criterion: Loss applied to ``(class_scores, targets)``.
        use_cuda: Move each batch to the GPU before the forward pass.
        epoch: Unused; kept so existing callers keep working.

    Returns:
        The sample-weighted mean validation loss.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    val_acc = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for inputs, targets in val_loader:
            current_sample = inputs.size(0)  # batch size

            if use_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            # compute output
            _, output = model(inputs)

            # Feed the meter the accuracy of THIS batch: its sample-weighted
            # average is then the accuracy over the whole validation set.
            # Feeding it the cumulative accuracy would average running
            # averages.
            predictions = torch.max(output, 1)[1].long().view(targets.size())
            n_correct = (predictions == targets).sum().item()
            val_acc.update(100. * n_correct / current_sample, current_sample)

            loss = criterion(output, targets)
            losses.update(loss.item(), current_sample)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        print(' * Validation: '
              'Loss {loss.avg:.4f}\t'
              'Acc {val_acc.avg:.4f}'.format(
                  loss=losses, val_acc=val_acc))

    return losses.avg
233 +
class AverageMeter(object):
    """Keep the latest value and the weighted running mean of a series.

    Attributes set by ``reset`` and maintained by ``update``:
    ``val`` (last value), ``sum`` (weighted sum), ``count`` (total weight)
    and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
248 +
def create_optimizer(optimizer, model, new_lr, wd):
    """Create the optimizer named by ``optimizer`` for ``model``.

    Args:
        optimizer: One of ``'sgd'``, ``'adam'`` or ``'adagrad'``.
        model: Module whose parameters are optimized.
        new_lr: Learning rate.
        wd: Weight decay (L2 penalty).

    Returns:
        The configured ``torch.optim`` optimizer. SGD uses momentum 0.9
        and no dampening.

    Raises:
        ValueError: If ``optimizer`` is not a supported name. Without this
            check the name string itself would be returned and fail much
            later, far from the cause.
    """
    params = model.parameters()
    if optimizer == 'sgd':
        return optim.SGD(params, lr=new_lr,
                         momentum=0.9, dampening=0,
                         weight_decay=wd)
    if optimizer == 'adam':
        return optim.Adam(params, lr=new_lr, weight_decay=wd)
    if optimizer == 'adagrad':
        return optim.Adagrad(params, lr=new_lr, weight_decay=wd)
    raise ValueError(
        "unsupported optimizer: {!r} (expected 'sgd', 'adam' or 'adagrad')"
        .format(optimizer))
263 +
def visualize_the_losses(train_loss, valid_loss):
    """Plot the per-epoch losses and save the figure as ``train3.png``.

    A dashed vertical line marks the epoch with the lowest validation loss.

    Args:
        train_loss: List of training losses, one per epoch.
        valid_loss: List of validation losses, one per epoch.

    Raises:
        ValueError: If ``valid_loss`` is empty (raised by ``min``).
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        ax.plot(range(1, len(train_loss) + 1), train_loss, label='Training Loss')
        ax.plot(range(1, len(valid_loss) + 1), valid_loss, label='Validation Loss')

        # find position of lowest validation loss (epochs are 1-based)
        minposs = valid_loss.index(min(valid_loss)) + 1
        ax.axvline(minposs, linestyle='--', color='r', label='Early Stopping Checkpoint')

        ax.set_xlabel('epochs')
        ax.set_ylabel('loss')
        ax.set_ylim(0, 3.5)  # consistent scale
        ax.set_xlim(0, len(train_loss) + 1)  # consistent scale
        ax.grid(True)
        ax.legend()
        fig.tight_layout()
        fig.savefig('train3.png', bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
282 +
283 +if __name__ == '__main__':
284 + main()
1 +import torch
2 +import torch.nn as nn
3 +import torch.optim as optim
4 +import torchvision.transforms as transforms
5 +
6 +import time
7 +import os
8 +import numpy as np
9 +import configure as c
10 +import pandas as pd
11 +from DB_wav_reader import read_feats_structure
12 +from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
13 +from model.model4 import background_resnet
14 +import matplotlib as mpl
15 +mpl.use('Agg')
16 +import matplotlib.pyplot as plt
17 +import pandas as pd
def load_dataset(val_ratio):
    """Create the training and validation datasets.

    Splits the features under ``c.TRAIN_FEAT_DIR`` with ``split_train_dev``
    and wraps both parts in ``DvectorDataset``. Speaker labels are indexed
    from the sorted speaker ids found in the training part.

    Args:
        val_ratio: Percentage of utterances held out for validation.

    Returns:
        Tuple ``(train_dataset, valid_dataset, n_classes)`` where
        ``n_classes`` is the number of distinct training speakers.
    """
    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)

    # Training inputs are truncated and then converted to tensors; validation
    # inputs only go through the dev tensor conversion.
    train_transform = transforms.Compose([
        TruncatedInputfromMFB(),
        ToTensorInput(),
    ])
    valid_transform = ToTensorDevInput()

    speakers = sorted(set(train_DB['speaker_id']))
    spk_to_idx = dict(zip(speakers, range(len(speakers))))

    train_dataset = DvectorDataset(DB=train_DB, loader=read_MFB,
                                   transform=train_transform, spk_to_idx=spk_to_idx)
    valid_dataset = DvectorDataset(DB=valid_DB, loader=read_MFB,
                                   transform=valid_transform, spk_to_idx=spk_to_idx)

    return train_dataset, valid_dataset, len(speakers)
42 +
def split_train_dev(train_feat_dir, valid_ratio, random_state=None):
    """Randomly split the feature DB into training and validation parts.

    Args:
        train_feat_dir: Directory handed to ``read_feats_structure``.
        valid_ratio: Percentage of utterances to hold out for validation.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            split can be reproduced. ``None`` keeps the unseeded shuffle.

    Returns:
        Tuple ``(train_DB, valid_DB)``, each re-indexed from zero.

    Raises:
        ValueError: If no utterances were found (previously this surfaced as
            a ZeroDivisionError while printing the percentages).
    """
    train_valid_DB = read_feats_structure(train_feat_dir)
    total_len = len(train_valid_DB)
    if total_len == 0:
        raise ValueError('No utterances found in {}'.format(train_feat_dir))

    valid_len = int(total_len * valid_ratio / 100.)
    train_len = total_len - valid_len

    shuffled = train_valid_DB.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # Split the DB into train and valid set, then reset each index
    train_DB = shuffled.iloc[:train_len].reset_index(drop=True)
    valid_DB = shuffled.iloc[train_len:].reset_index(drop=True)

    print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
    print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
    print('Total %d utts' %(total_len))

    return train_DB, valid_DB
60 +
def main():
    """Train ``background_resnet`` as a speaker classifier.

    Loads the datasets, trains with cross-entropy loss, lowers the learning
    rate when the validation loss plateaus, writes one checkpoint per epoch
    under ``log_dir`` and finally plots the loss curves.
    """
    # Set hyperparameters
    # Use the GPU only when one is present; a hard-coded True made the
    # .cuda() calls fail on CPU-only machines.
    use_cuda = torch.cuda.is_available()
    val_ratio = 10  # Percentage of validation set
    embedding_size = 128
    start = 1  # Start epoch
    n_epochs = 30  # How many epochs?
    end = start + n_epochs  # Last epoch (exclusive bound of the loop)

    lr = 1e-1  # Initial learning rate
    wd = 1e-4  # Weight decay (L2 penalty)
    optimizer_type = 'sgd'  # ex) sgd, adam, adagrad

    batch_size = 64  # Batch size for training
    valid_batch_size = 16  # Batch size for validation
    use_shuffle = True  # Shuffle for training or not

    # Load dataset
    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)

    # print the experiment configuration
    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))

    log_dir = 'new_model4'  # where to save checkpoints
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(log_dir, exist_ok=True)

    # instantiate model and initialize weights
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()

    # define loss function (criterion), optimizer and scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = create_optimizer(optimizer_type, model, lr, wd)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=use_shuffle)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                               batch_size=valid_batch_size,
                                               shuffle=False,
                                               collate_fn=collate_fn_feat_padded)

    # average training / validation loss per epoch
    avg_train_losses = []
    avg_valid_losses = []

    for epoch in range(start, end):
        # train for one epoch
        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)

        # evaluate on validation set
        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)

        # The explicit epoch argument of step() is deprecated; the scheduler
        # only needs the monitored metric.
        scheduler.step(valid_loss)

        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)

        # do checkpointing
        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()},
                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))

    # find position of lowest validation loss
    minposs = avg_valid_losses.index(min(avg_valid_losses))+1
    print('Lowest validation loss at epoch %d' %minposs)

    # visualize the losses recorded while the network trained
    visualize_the_losses(avg_train_losses, avg_valid_losses)
138 +
139 +
def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
    """Run one training epoch and return the sample-weighted average loss.

    Args:
        train_loader: Iterable of ``(inputs, targets)`` batches; its
            ``dataset`` length is used in the progress line.
        model: Network whose forward pass returns a pair; the second item is
            used as the class scores.
        criterion: Loss callable applied to ``(output, targets)``.
        optimizer: Optimizer stepped once per batch.
        use_cuda: When true, each batch is moved to the GPU.
        epoch: Epoch number, used only for logging.
        n_classes: Unused; kept so existing callers keep working.

    Returns:
        The average loss over all samples seen in this epoch.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    train_acc = AverageMeter()

    log_interval = 84  # log every this many batches
    # switch to train mode
    model.train()

    end = time.time()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        targets = targets.view(-1)  # flatten labels to one per sample
        current_sample = inputs.size(0)  # batch size

        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        _, output = model(inputs)

        # Feed the meter this batch's own accuracy weighted by its size, so
        # train_acc.avg is the true running accuracy. The earlier code fed
        # the cumulative accuracy, yielding an average of running averages.
        predicted = torch.max(output, 1)[1].long().view(targets.size())
        batch_correct = (predicted == targets).sum().item()
        train_acc.update(100. * batch_correct / current_sample, current_sample)

        loss = criterion(output, targets)
        losses.update(loss.item(), current_sample)

        # compute gradient and do the optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % log_interval == 0:
            print(
                'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Loss {loss.avg:.4f}\t'
                'Acc {train_acc.avg:.4f}'.format(
                    epoch, batch_idx * len(inputs), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    batch_time=batch_time, loss=losses, train_acc=train_acc))
    return losses.avg
190 +
def validate(val_loader, model, criterion, use_cuda, epoch):
    """Evaluate the model on the validation loader without gradients.

    Args:
        val_loader: Iterable of ``(inputs, targets)`` batches.
        model: Network whose forward pass returns a pair; the second item is
            used as the class scores.
        criterion: Loss callable applied to ``(output, targets)``.
        use_cuda: When true, each batch is moved to the GPU.
        epoch: Unused; kept so existing callers keep working.

    Returns:
        The sample-weighted average validation loss.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    val_acc = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for inputs, targets in val_loader:
            current_sample = inputs.size(0)  # batch size

            if use_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            # compute output
            _, output = model(inputs)

            # Update the meter with this batch's accuracy weighted by its
            # size so val_acc.avg is the overall accuracy, not an average of
            # cumulative accuracies as before.
            predicted = torch.max(output, 1)[1].long().view(targets.size())
            batch_correct = (predicted == targets).sum().item()
            val_acc.update(100. * batch_correct / current_sample, current_sample)

            loss = criterion(output, targets)
            losses.update(loss.item(), current_sample)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

    print(' * Validation: '
          'Loss {loss.avg:.4f}\t'
          'Acc {val_acc.avg:.4f}'.format(
              loss=losses, val_acc=val_acc))

    return losses.avg
232 +
class AverageMeter(object):
    """Track the latest value and the weighted running average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the last value, the average, the sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count
247 +
def create_optimizer(optimizer, model, new_lr, wd):
    """Build the optimizer selected by name for ``model``'s parameters.

    Args:
        optimizer: Optimizer name; one of 'sgd', 'adam' or 'adagrad'.
        model: Module whose ``parameters()`` are optimized.
        new_lr: Learning rate passed to the optimizer.
        wd: Weight decay passed to the optimizer.

    Returns:
        The constructed ``torch.optim`` optimizer.

    Raises:
        ValueError: If ``optimizer`` is not a supported name. Previously the
            name string itself was returned, which only failed later.
    """
    params = model.parameters()
    if optimizer == 'sgd':
        return optim.SGD(params, lr=new_lr,
                         momentum=0.9, dampening=0,
                         weight_decay=wd)
    if optimizer == 'adam':
        return optim.Adam(params, lr=new_lr, weight_decay=wd)
    if optimizer == 'adagrad':
        return optim.Adagrad(params, lr=new_lr, weight_decay=wd)
    raise ValueError(
        "Unsupported optimizer {!r}; expected 'sgd', 'adam' or 'adagrad'".format(optimizer))
262 +
def visualize_the_losses(train_loss, valid_loss, save_path='train4.png'):
    """Plot training and validation loss curves and save them as an image.

    Args:
        train_loss: Sequence of per-epoch training losses.
        valid_loss: List of per-epoch validation losses.
        save_path: Output image path; defaults to the previously hard-coded
            'train4.png'.
    """
    fig = plt.figure(figsize=(10, 8))
    try:
        plt.plot(range(1, len(train_loss) + 1), train_loss, label='Training Loss')
        plt.plot(range(1, len(valid_loss) + 1), valid_loss, label='Validation Loss')

        # Mark the epoch with the lowest validation loss; skipped when no
        # validation loss was recorded (min() of an empty list would raise).
        if len(valid_loss) > 0:
            minposs = valid_loss.index(min(valid_loss)) + 1
            plt.axvline(minposs, linestyle='--', color='r', label='Early Stopping Checkpoint')

        plt.xlabel('epochs')
        plt.ylabel('loss')
        plt.ylim(0, 3.5)  # consistent scale
        plt.xlim(0, len(train_loss) + 1)  # consistent scale
        plt.grid(True)
        plt.legend()
        plt.tight_layout()
        fig.savefig(save_path, bbox_inches='tight')
    finally:
        # pyplot keeps figures alive until they are closed explicitly.
        plt.close(fig)
281 +
# Entry point: start training only when the file is run as a script.
if __name__ == '__main__':
    main()
1 +import torch
2 +import torch.nn as nn
3 +import torch.optim as optim
4 +import torchvision.transforms as transforms
5 +
6 +import time
7 +import os
8 +import numpy as np
9 +import configure1_merge as c
10 +import pandas as pd
11 +from DB_wav_reader import read_feats_structure
12 +from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
13 +from model.model4 import background_resnet
14 +import matplotlib as mpl
15 +mpl.use('Agg')
16 +import matplotlib.pyplot as plt
17 +
18 +import pandas as pd
def load_dataset(val_ratio):
    """Create the training and validation datasets.

    Splits the features under ``c.TRAIN_FEAT_DIR`` with ``split_train_dev``
    and wraps both parts in ``DvectorDataset``. Speaker labels are indexed
    from the sorted speaker ids found in the training part.

    Args:
        val_ratio: Percentage of utterances held out for validation.

    Returns:
        Tuple ``(train_dataset, valid_dataset, n_classes)`` where
        ``n_classes`` is the number of distinct training speakers.
    """
    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)

    # Training inputs are truncated and then converted to tensors; validation
    # inputs only go through the dev tensor conversion.
    train_transform = transforms.Compose([
        TruncatedInputfromMFB(),
        ToTensorInput(),
    ])
    valid_transform = ToTensorDevInput()

    speakers = sorted(set(train_DB['speaker_id']))
    spk_to_idx = dict(zip(speakers, range(len(speakers))))

    train_dataset = DvectorDataset(DB=train_DB, loader=read_MFB,
                                   transform=train_transform, spk_to_idx=spk_to_idx)
    valid_dataset = DvectorDataset(DB=valid_DB, loader=read_MFB,
                                   transform=valid_transform, spk_to_idx=spk_to_idx)

    return train_dataset, valid_dataset, len(speakers)
43 +
def split_train_dev(train_feat_dir, valid_ratio, random_state=None):
    """Randomly split the feature DB into training and validation parts.

    Args:
        train_feat_dir: Directory handed to ``read_feats_structure``.
        valid_ratio: Percentage of utterances to hold out for validation.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            split can be reproduced. ``None`` keeps the unseeded shuffle.

    Returns:
        Tuple ``(train_DB, valid_DB)``, each re-indexed from zero.

    Raises:
        ValueError: If no utterances were found (previously this surfaced as
            a ZeroDivisionError while printing the percentages).
    """
    train_valid_DB = read_feats_structure(train_feat_dir)
    total_len = len(train_valid_DB)
    if total_len == 0:
        raise ValueError('No utterances found in {}'.format(train_feat_dir))

    valid_len = int(total_len * valid_ratio / 100.)
    train_len = total_len - valid_len

    shuffled = train_valid_DB.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # Split the DB into train and valid set, then reset each index
    train_DB = shuffled.iloc[:train_len].reset_index(drop=True)
    valid_DB = shuffled.iloc[train_len:].reset_index(drop=True)

    print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
    print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
    print('Total %d utts' %(total_len))

    return train_DB, valid_DB
61 +
def main():
    """Train ``background_resnet`` as a speaker classifier.

    Loads the datasets, trains with cross-entropy loss, lowers the learning
    rate when the validation loss plateaus, writes one checkpoint per epoch
    under ``log_dir`` and finally plots the loss curves.
    """
    # Set hyperparameters
    # Use the GPU only when one is present; a hard-coded True made the
    # .cuda() calls fail on CPU-only machines.
    use_cuda = torch.cuda.is_available()
    val_ratio = 10  # Percentage of validation set
    embedding_size = 128
    start = 1  # Start epoch
    n_epochs = 60  # How many epochs?
    end = start + n_epochs  # Last epoch (exclusive bound of the loop)

    lr = 1e-1  # Initial learning rate
    wd = 1e-4  # Weight decay (L2 penalty)
    optimizer_type = 'sgd'  # ex) sgd, adam, adagrad

    batch_size = 64  # Batch size for training
    valid_batch_size = 16  # Batch size for validation
    use_shuffle = True  # Shuffle for training or not

    # Load dataset
    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)

    # print the experiment configuration
    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))

    log_dir = 'new_model4_merge'  # where to save checkpoints
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(log_dir, exist_ok=True)

    # instantiate model and initialize weights
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()

    # define loss function (criterion), optimizer and scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = create_optimizer(optimizer_type, model, lr, wd)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=use_shuffle)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                               batch_size=valid_batch_size,
                                               shuffle=False,
                                               collate_fn=collate_fn_feat_padded)

    # average training / validation loss per epoch
    avg_train_losses = []
    avg_valid_losses = []

    for epoch in range(start, end):
        # train for one epoch
        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)

        # evaluate on validation set
        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)

        # The explicit epoch argument of step() is deprecated; the scheduler
        # only needs the monitored metric.
        scheduler.step(valid_loss)

        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)

        # do checkpointing
        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()},
                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))

    # find position of lowest validation loss
    minposs = avg_valid_losses.index(min(avg_valid_losses))+1
    print('Lowest validation loss at epoch %d' %minposs)

    # visualize the losses recorded while the network trained
    visualize_the_losses(avg_train_losses, avg_valid_losses)
139 +
140 +
def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
    """Run one training epoch and return the sample-weighted average loss.

    Args:
        train_loader: Iterable of ``(inputs, targets)`` batches; its
            ``dataset`` length is used in the progress line.
        model: Network whose forward pass returns a pair; the second item is
            used as the class scores.
        criterion: Loss callable applied to ``(output, targets)``.
        optimizer: Optimizer stepped once per batch.
        use_cuda: When true, each batch is moved to the GPU.
        epoch: Epoch number, used only for logging.
        n_classes: Unused; kept so existing callers keep working.

    Returns:
        The average loss over all samples seen in this epoch.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    train_acc = AverageMeter()

    log_interval = 84  # log every this many batches
    # switch to train mode
    model.train()

    end = time.time()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        targets = targets.view(-1)  # flatten labels to one per sample
        current_sample = inputs.size(0)  # batch size

        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        _, output = model(inputs)

        # Feed the meter this batch's own accuracy weighted by its size, so
        # train_acc.avg is the true running accuracy. The earlier code fed
        # the cumulative accuracy, yielding an average of running averages.
        predicted = torch.max(output, 1)[1].long().view(targets.size())
        batch_correct = (predicted == targets).sum().item()
        train_acc.update(100. * batch_correct / current_sample, current_sample)

        loss = criterion(output, targets)
        losses.update(loss.item(), current_sample)

        # compute gradient and do the optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % log_interval == 0:
            print(
                'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Loss {loss.avg:.4f}\t'
                'Acc {train_acc.avg:.4f}'.format(
                    epoch, batch_idx * len(inputs), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    batch_time=batch_time, loss=losses, train_acc=train_acc))
    return losses.avg
191 +
def validate(val_loader, model, criterion, use_cuda, epoch):
    """Evaluate the model on the validation loader without gradients.

    Args:
        val_loader: Iterable of ``(inputs, targets)`` batches.
        model: Network whose forward pass returns a pair; the second item is
            used as the class scores.
        criterion: Loss callable applied to ``(output, targets)``.
        use_cuda: When true, each batch is moved to the GPU.
        epoch: Unused; kept so existing callers keep working.

    Returns:
        The sample-weighted average validation loss.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    val_acc = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for inputs, targets in val_loader:
            current_sample = inputs.size(0)  # batch size

            if use_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            # compute output
            _, output = model(inputs)

            # Update the meter with this batch's accuracy weighted by its
            # size so val_acc.avg is the overall accuracy, not an average of
            # cumulative accuracies as before.
            predicted = torch.max(output, 1)[1].long().view(targets.size())
            batch_correct = (predicted == targets).sum().item()
            val_acc.update(100. * batch_correct / current_sample, current_sample)

            loss = criterion(output, targets)
            losses.update(loss.item(), current_sample)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

    print(' * Validation: '
          'Loss {loss.avg:.4f}\t'
          'Acc {val_acc.avg:.4f}'.format(
              loss=losses, val_acc=val_acc))

    return losses.avg
233 +
class AverageMeter(object):
    """Track the latest value and the weighted running average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the last value, the average, the sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count
248 +
def create_optimizer(optimizer, model, new_lr, wd):
    """Build the optimizer selected by name for ``model``'s parameters.

    Args:
        optimizer: Optimizer name; one of 'sgd', 'adam' or 'adagrad'.
        model: Module whose ``parameters()`` are optimized.
        new_lr: Learning rate passed to the optimizer.
        wd: Weight decay passed to the optimizer.

    Returns:
        The constructed ``torch.optim`` optimizer.

    Raises:
        ValueError: If ``optimizer`` is not a supported name. Previously the
            name string itself was returned, which only failed later.
    """
    params = model.parameters()
    if optimizer == 'sgd':
        return optim.SGD(params, lr=new_lr,
                         momentum=0.9, dampening=0,
                         weight_decay=wd)
    if optimizer == 'adam':
        return optim.Adam(params, lr=new_lr, weight_decay=wd)
    if optimizer == 'adagrad':
        return optim.Adagrad(params, lr=new_lr, weight_decay=wd)
    raise ValueError(
        "Unsupported optimizer {!r}; expected 'sgd', 'adam' or 'adagrad'".format(optimizer))
263 +
def visualize_the_losses(train_loss, valid_loss, save_path='train4_merge.png'):
    """Plot training and validation loss curves and save them as an image.

    Args:
        train_loss: Sequence of per-epoch training losses.
        valid_loss: List of per-epoch validation losses.
        save_path: Output image path; defaults to the previously hard-coded
            'train4_merge.png'.
    """
    fig = plt.figure(figsize=(10, 8))
    try:
        plt.plot(range(1, len(train_loss) + 1), train_loss, label='Training Loss')
        plt.plot(range(1, len(valid_loss) + 1), valid_loss, label='Validation Loss')

        # Mark the epoch with the lowest validation loss; skipped when no
        # validation loss was recorded (min() of an empty list would raise).
        if len(valid_loss) > 0:
            minposs = valid_loss.index(min(valid_loss)) + 1
            plt.axvline(minposs, linestyle='--', color='r', label='Early Stopping Checkpoint')

        plt.xlabel('epochs')
        plt.ylabel('loss')
        plt.ylim(0, 3.5)  # consistent scale
        plt.xlim(0, len(train_loss) + 1)  # consistent scale
        plt.grid(True)
        plt.legend()
        plt.tight_layout()
        fig.savefig(save_path, bbox_inches='tight')
    finally:
        # pyplot keeps figures alive until they are closed explicitly.
        plt.close(fig)
282 +
# Entry point: start training only when the file is run as a script.
if __name__ == '__main__':
    main()
1 +import torch
2 +import torch.nn as nn
3 +import torch.optim as optim
4 +import torchvision.transforms as transforms
5 +
6 +import time
7 +import os
8 +import numpy as np
9 +import configure1_zeroth as c
10 +import pandas as pd
11 +from DB_wav_reader import read_feats_structure
12 +from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
13 +from model.model4 import background_resnet
14 +
15 +import matplotlib as mpl
16 +mpl.use('Agg')
17 +import matplotlib.pyplot as plt
18 +
19 +import pandas as pd
def load_dataset(val_ratio):
    """Create the training and validation datasets.

    Splits the features under ``c.TRAIN_FEAT_DIR`` with ``split_train_dev``
    and wraps both parts in ``DvectorDataset``. Speaker labels are indexed
    from the sorted speaker ids found in the training part.

    Args:
        val_ratio: Percentage of utterances held out for validation.

    Returns:
        Tuple ``(train_dataset, valid_dataset, n_classes)`` where
        ``n_classes`` is the number of distinct training speakers.
    """
    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)

    # Training inputs are truncated and then converted to tensors; validation
    # inputs only go through the dev tensor conversion.
    train_transform = transforms.Compose([
        TruncatedInputfromMFB(),
        ToTensorInput(),
    ])
    valid_transform = ToTensorDevInput()

    speakers = sorted(set(train_DB['speaker_id']))
    spk_to_idx = dict(zip(speakers, range(len(speakers))))

    train_dataset = DvectorDataset(DB=train_DB, loader=read_MFB,
                                   transform=train_transform, spk_to_idx=spk_to_idx)
    valid_dataset = DvectorDataset(DB=valid_DB, loader=read_MFB,
                                   transform=valid_transform, spk_to_idx=spk_to_idx)

    return train_dataset, valid_dataset, len(speakers)
44 +
def split_train_dev(train_feat_dir, valid_ratio, random_state=None):
    """Randomly split the feature DB into training and validation parts.

    Args:
        train_feat_dir: Directory handed to ``read_feats_structure``.
        valid_ratio: Percentage of utterances to hold out for validation.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            split can be reproduced. ``None`` keeps the unseeded shuffle.

    Returns:
        Tuple ``(train_DB, valid_DB)``, each re-indexed from zero.

    Raises:
        ValueError: If no utterances were found (previously this surfaced as
            a ZeroDivisionError while printing the percentages).
    """
    train_valid_DB = read_feats_structure(train_feat_dir)
    total_len = len(train_valid_DB)
    if total_len == 0:
        raise ValueError('No utterances found in {}'.format(train_feat_dir))

    valid_len = int(total_len * valid_ratio / 100.)
    train_len = total_len - valid_len

    shuffled = train_valid_DB.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # Split the DB into train and valid set, then reset each index
    train_DB = shuffled.iloc[:train_len].reset_index(drop=True)
    valid_DB = shuffled.iloc[train_len:].reset_index(drop=True)

    print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
    print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
    print('Total %d utts' %(total_len))

    return train_DB, valid_DB
62 +
def main():
    """Train ``background_resnet`` as a speaker classifier.

    Loads the datasets, trains with cross-entropy loss, lowers the learning
    rate when the validation loss plateaus, writes one checkpoint per epoch
    under ``log_dir`` and finally plots the loss curves.
    """
    # Set hyperparameters
    # Use the GPU only when one is present; a hard-coded True made the
    # .cuda() calls fail on CPU-only machines.
    use_cuda = torch.cuda.is_available()
    val_ratio = 10  # Percentage of validation set
    embedding_size = 128
    start = 1  # Start epoch
    n_epochs = 30  # How many epochs?
    end = start + n_epochs  # Last epoch (exclusive bound of the loop)

    lr = 1e-1  # Initial learning rate
    wd = 1e-4  # Weight decay (L2 penalty)
    optimizer_type = 'sgd'  # ex) sgd, adam, adagrad

    batch_size = 64  # Batch size for training
    valid_batch_size = 16  # Batch size for validation
    use_shuffle = True  # Shuffle for training or not

    # Load dataset
    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)

    # print the experiment configuration
    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))

    log_dir = 'new_model4_zeroth'  # where to save checkpoints
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(log_dir, exist_ok=True)

    # instantiate model and initialize weights
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()

    # define loss function (criterion), optimizer and scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = create_optimizer(optimizer_type, model, lr, wd)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=use_shuffle)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                               batch_size=valid_batch_size,
                                               shuffle=False,
                                               collate_fn=collate_fn_feat_padded)

    # average training / validation loss per epoch
    avg_train_losses = []
    avg_valid_losses = []

    for epoch in range(start, end):
        # train for one epoch
        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)

        # evaluate on validation set
        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)

        # The explicit epoch argument of step() is deprecated; the scheduler
        # only needs the monitored metric.
        scheduler.step(valid_loss)

        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)

        # do checkpointing
        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()},
                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))

    # find position of lowest validation loss
    minposs = avg_valid_losses.index(min(avg_valid_losses))+1
    print('Lowest validation loss at epoch %d' %minposs)

    # visualize the losses recorded while the network trained
    visualize_the_losses(avg_train_losses, avg_valid_losses)
140 +
141 +
def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
    """Run one training epoch and return the sample-weighted average loss.

    Args:
        train_loader: Iterable of ``(inputs, targets)`` batches; its
            ``dataset`` length is used in the progress line.
        model: Network whose forward pass returns a pair; the second item is
            used as the class scores.
        criterion: Loss callable applied to ``(output, targets)``.
        optimizer: Optimizer stepped once per batch.
        use_cuda: When true, each batch is moved to the GPU.
        epoch: Epoch number, used only for logging.
        n_classes: Unused; kept so existing callers keep working.

    Returns:
        The average loss over all samples seen in this epoch.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    train_acc = AverageMeter()

    log_interval = 84  # log every this many batches
    # switch to train mode
    model.train()

    end = time.time()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        targets = targets.view(-1)  # flatten labels to one per sample
        current_sample = inputs.size(0)  # batch size

        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        _, output = model(inputs)

        # Feed the meter this batch's own accuracy weighted by its size, so
        # train_acc.avg is the true running accuracy. The earlier code fed
        # the cumulative accuracy, yielding an average of running averages.
        predicted = torch.max(output, 1)[1].long().view(targets.size())
        batch_correct = (predicted == targets).sum().item()
        train_acc.update(100. * batch_correct / current_sample, current_sample)

        loss = criterion(output, targets)
        losses.update(loss.item(), current_sample)

        # compute gradient and do the optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % log_interval == 0:
            print(
                'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Loss {loss.avg:.4f}\t'
                'Acc {train_acc.avg:.4f}'.format(
                    epoch, batch_idx * len(inputs), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    batch_time=batch_time, loss=losses, train_acc=train_acc))
    return losses.avg
192 +
def validate(val_loader, model, criterion, use_cuda, epoch):
    """Evaluate the model on the validation loader without gradients.

    Args:
        val_loader: Iterable of ``(inputs, targets)`` batches.
        model: Network whose forward pass returns a pair; the second item is
            used as the class scores.
        criterion: Loss callable applied to ``(output, targets)``.
        use_cuda: When true, each batch is moved to the GPU.
        epoch: Unused; kept so existing callers keep working.

    Returns:
        The sample-weighted average validation loss.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    val_acc = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for inputs, targets in val_loader:
            current_sample = inputs.size(0)  # batch size

            if use_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            # compute output
            _, output = model(inputs)

            # Update the meter with this batch's accuracy weighted by its
            # size so val_acc.avg is the overall accuracy, not an average of
            # cumulative accuracies as before.
            predicted = torch.max(output, 1)[1].long().view(targets.size())
            batch_correct = (predicted == targets).sum().item()
            val_acc.update(100. * batch_correct / current_sample, current_sample)

            loss = criterion(output, targets)
            losses.update(loss.item(), current_sample)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

    print(' * Validation: '
          'Loss {loss.avg:.4f}\t'
          'Acc {val_acc.avg:.4f}'.format(
              loss=losses, val_acc=val_acc))

    return losses.avg
234 +
class AverageMeter(object):
    """Track the latest value and the weighted running average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the last value, the average, the sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count
249 +
def create_optimizer(optimizer, model, new_lr, wd):
    """Build the optimizer selected by name for ``model``'s parameters.

    Args:
        optimizer: Optimizer name; one of 'sgd', 'adam' or 'adagrad'.
        model: Module whose ``parameters()`` are optimized.
        new_lr: Learning rate passed to the optimizer.
        wd: Weight decay passed to the optimizer.

    Returns:
        The constructed ``torch.optim`` optimizer.

    Raises:
        ValueError: If ``optimizer`` is not a supported name. Previously the
            name string itself was returned, which only failed later.
    """
    params = model.parameters()
    if optimizer == 'sgd':
        return optim.SGD(params, lr=new_lr,
                         momentum=0.9, dampening=0,
                         weight_decay=wd)
    if optimizer == 'adam':
        return optim.Adam(params, lr=new_lr, weight_decay=wd)
    if optimizer == 'adagrad':
        return optim.Adagrad(params, lr=new_lr, weight_decay=wd)
    raise ValueError(
        "Unsupported optimizer {!r}; expected 'sgd', 'adam' or 'adagrad'".format(optimizer))
264 +
def visualize_the_losses(train_loss, valid_loss, save_path='train4_zeroth.png'):
    """Plot training and validation loss curves and save them as an image.

    Args:
        train_loss: Sequence of per-epoch training losses.
        valid_loss: List of per-epoch validation losses.
        save_path: Output image path; defaults to the previously hard-coded
            'train4_zeroth.png'.
    """
    fig = plt.figure(figsize=(10, 8))
    try:
        plt.plot(range(1, len(train_loss) + 1), train_loss, label='Training Loss')
        plt.plot(range(1, len(valid_loss) + 1), valid_loss, label='Validation Loss')

        # Mark the epoch with the lowest validation loss; skipped when no
        # validation loss was recorded (min() of an empty list would raise).
        if len(valid_loss) > 0:
            minposs = valid_loss.index(min(valid_loss)) + 1
            plt.axvline(minposs, linestyle='--', color='r', label='Early Stopping Checkpoint')

        plt.xlabel('epochs')
        plt.ylabel('loss')
        plt.ylim(0, 3.5)  # consistent scale
        plt.xlim(0, len(train_loss) + 1)  # consistent scale
        plt.grid(True)
        plt.legend()
        plt.tight_layout()
        fig.savefig(save_path, bbox_inches='tight')
    finally:
        # pyplot keeps figures alive until they are closed explicitly.
        plt.close(fig)
283 +
# Entry point: start training only when the file is run as a script.
if __name__ == '__main__':
    main()
1 +import torch
2 +import torch.nn as nn
3 +import torch.optim as optim
4 +import torchvision.transforms as transforms
5 +
6 +import time
7 +import os
8 +import numpy as np
9 +import configure as c
10 +import pandas as pd
11 +from DB_wav_reader import read_feats_structure
12 +from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
13 +from model.model5 import background_resnet
14 +import matplotlib as mpl
15 +mpl.use('Agg')
16 +import matplotlib.pyplot as plt
17 +
18 +import pandas as pd
def load_dataset(val_ratio):
    """Create the training and validation datasets.

    Splits the features under ``c.TRAIN_FEAT_DIR`` with ``split_train_dev``
    and wraps both parts in ``DvectorDataset``. Speaker labels are indexed
    from the sorted speaker ids found in the training part.

    Args:
        val_ratio: Percentage of utterances held out for validation.

    Returns:
        Tuple ``(train_dataset, valid_dataset, n_classes)`` where
        ``n_classes`` is the number of distinct training speakers.
    """
    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)

    # Training inputs are truncated and then converted to tensors; validation
    # inputs only go through the dev tensor conversion.
    train_transform = transforms.Compose([
        TruncatedInputfromMFB(),
        ToTensorInput(),
    ])
    valid_transform = ToTensorDevInput()

    speakers = sorted(set(train_DB['speaker_id']))
    spk_to_idx = dict(zip(speakers, range(len(speakers))))

    train_dataset = DvectorDataset(DB=train_DB, loader=read_MFB,
                                   transform=train_transform, spk_to_idx=spk_to_idx)
    valid_dataset = DvectorDataset(DB=valid_DB, loader=read_MFB,
                                   transform=valid_transform, spk_to_idx=spk_to_idx)

    return train_dataset, valid_dataset, len(speakers)
43 +
def split_train_dev(train_feat_dir, valid_ratio, random_state=None):
    """Randomly split the feature DB into training and validation parts.

    Args:
        train_feat_dir: Directory handed to ``read_feats_structure``.
        valid_ratio: Percentage of utterances to hold out for validation.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            split can be reproduced. ``None`` keeps the unseeded shuffle.

    Returns:
        Tuple ``(train_DB, valid_DB)``, each re-indexed from zero.

    Raises:
        ValueError: If no utterances were found (previously this surfaced as
            a ZeroDivisionError while printing the percentages).
    """
    train_valid_DB = read_feats_structure(train_feat_dir)
    total_len = len(train_valid_DB)
    if total_len == 0:
        raise ValueError('No utterances found in {}'.format(train_feat_dir))

    valid_len = int(total_len * valid_ratio / 100.)
    train_len = total_len - valid_len

    shuffled = train_valid_DB.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # Split the DB into train and valid set, then reset each index
    train_DB = shuffled.iloc[:train_len].reset_index(drop=True)
    valid_DB = shuffled.iloc[train_len:].reset_index(drop=True)

    print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
    print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
    print('Total %d utts' %(total_len))

    return train_DB, valid_DB
61 +
def main():
    """Train the speaker classifier and save one checkpoint per epoch.

    All hyperparameters are fixed inside the function. After the last epoch
    the epoch with the lowest validation loss is printed and the loss curves
    are drawn by ``visualize_the_losses``.
    """
    # Set hyperparameters
    use_cuda = True # use gpu or cpu
    if use_cuda and not torch.cuda.is_available():
        # model.cuda() would raise on a machine without CUDA
        print('CUDA is not available, training on CPU instead')
        use_cuda = False
    val_ratio = 10 # Percentage of validation set
    embedding_size = 128
    start = 1 # Start epoch
    n_epochs = 30 # How many epochs?
    end = start + n_epochs # Last epoch

    lr = 1e-1 # Initial learning rate
    wd = 1e-4 # Weight decay (L2 penalty)
    optimizer_type = 'sgd' # ex) sgd, adam, adagrad

    batch_size = 64 # Batch size for training
    valid_batch_size = 16 # Batch size for validation
    use_shuffle = True # Shuffle for training or not

    # Load dataset
    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)

    # print the experiment configuration
    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))

    log_dir = 'new_model5' # where to save checkpoints

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # instantiate model and initialize weights
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)

    if use_cuda:
        model.cuda()

    # define loss function (criterion), optimizer and scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = create_optimizer(optimizer_type, model, lr, wd)
    # NOTE(review): `verbose` is deprecated in recent PyTorch releases -
    # confirm against the installed version.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=use_shuffle)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                               batch_size=valid_batch_size,
                                               shuffle=False,
                                               collate_fn=collate_fn_feat_padded)

    # to track the average training loss per epoch as the model trains
    avg_train_losses = []
    # to track the average validation loss per epoch as the model trains
    avg_valid_losses = []

    for epoch in range(start, end):

        # train for one epoch
        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)

        # evaluate on validation set
        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)

        # The explicit epoch argument of step() is deprecated; the scheduler
        # keeps its own epoch counter.
        scheduler.step(valid_loss)

        # calculate average loss over an epoch
        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)
        # do checkpointing
        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()},
                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))

    # find position of lowest validation loss; entry 0 of the list belongs to
    # epoch `start`, so offset by it to match the checkpoint file numbering
    minposs = avg_valid_losses.index(min(avg_valid_losses)) + start
    print('Lowest validation loss at epoch %d' %minposs)

    # visualize the loss and learning rate as the network trained
    visualize_the_losses(avg_train_losses, avg_valid_losses)
139 +
140 +
def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
    """Run one training epoch and return the sample-weighted mean loss.

    Args:
        train_loader: Iterable yielding ``(inputs, targets)`` batches.
        model: Network whose forward pass returns a pair; the second element
            is used as the class scores.
        criterion: Loss applied to ``(scores, targets)``.
        optimizer: Optimizer stepped once per batch.
        use_cuda: Move each batch to the GPU when true.
        epoch: Epoch number, used only in the progress line.
        n_classes: Unused here; kept for the existing call signature.

    Returns:
        Average loss over all samples seen in this epoch.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    train_acc = AverageMeter()

    log_interval = 84
    # switch to train mode
    model.train()

    end = time.time()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        targets = targets.view(-1) # target size:(batch size)
        current_sample = inputs.size(0) # batch size

        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        _, output = model(inputs) # out size:(batch size, #classes), for softmax

        # Feed the meter the accuracy of THIS batch, weighted by its size, so
        # that train_acc.avg is the true accuracy over all samples so far
        # (feeding the cumulative accuracy would average running averages).
        predictions = torch.max(output, 1)[1].long().view(targets.size())
        batch_correct = (predictions == targets).sum().item()
        train_acc.update(100. * batch_correct / current_sample, current_sample)

        loss = criterion(output, targets)
        losses.update(loss.item(), current_sample)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % log_interval == 0:
            print(
                'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Loss {loss.avg:.4f}\t'
                'Acc {train_acc.avg:.4f}'.format(
                    epoch, batch_idx * len(inputs), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    batch_time=batch_time, loss=losses, train_acc=train_acc))
    return losses.avg
191 +
def validate(val_loader, model, criterion, use_cuda, epoch):
    """Evaluate the model on the validation set and return the mean loss.

    Args:
        val_loader: Iterable yielding ``(inputs, targets)`` batches.
        model: Network whose forward pass returns a pair; the second element
            is used as the class scores.
        criterion: Loss applied to ``(scores, targets)``.
        use_cuda: Move each batch to the GPU when true.
        epoch: Unused here; kept for the existing call signature.

    Returns:
        Average loss over all validation samples.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    val_acc = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (inputs, targets) in enumerate(val_loader):
            current_sample = inputs.size(0) # batch size

            if use_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            # compute output
            _, output = model(inputs)

            # Feed the meter the accuracy of THIS batch, weighted by its
            # size, so that val_acc.avg is the true overall accuracy
            # (feeding the cumulative accuracy would average running averages).
            predictions = torch.max(output, 1)[1].long().view(targets.size())
            batch_correct = (predictions == targets).sum().item()
            val_acc.update(100. * batch_correct / current_sample, current_sample)

            loss = criterion(output, targets)
            losses.update(loss.item(), current_sample)
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

    print(' * Validation: '
          'Loss {loss.avg:.4f}\t'
          'Acc {val_acc.avg:.4f}'.format(
              loss=losses, val_acc=val_acc))

    return losses.avg
233 +
class AverageMeter(object):
    """Keep the latest value and the weighted running mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count
248 +
def create_optimizer(optimizer, model, new_lr, wd):
    """Create the optimizer selected by name for the model's parameters.

    Args:
        optimizer: One of ``'sgd'``, ``'adam'`` or ``'adagrad'``.
        model: Module whose ``parameters()`` are optimized.
        new_lr: Learning rate.
        wd: Weight decay.

    Returns:
        The constructed ``torch.optim`` optimizer.

    Raises:
        ValueError: If ``optimizer`` is not a supported name. Without this
            check the name itself would be returned and fail later in a
            much less obvious place.
    """
    if optimizer == 'sgd':
        return optim.SGD(model.parameters(), lr=new_lr,
                         momentum=0.9, dampening=0,
                         weight_decay=wd)
    if optimizer == 'adam':
        return optim.Adam(model.parameters(), lr=new_lr,
                          weight_decay=wd)
    if optimizer == 'adagrad':
        return optim.Adagrad(model.parameters(),
                             lr=new_lr,
                             weight_decay=wd)
    raise ValueError('Unsupported optimizer type: %r' % (optimizer,))
263 +
def visualize_the_losses(train_loss, valid_loss):
    """Plot per-epoch training and validation loss and save it as 'train5.png'.

    Args:
        train_loss: List of training losses, one entry per epoch.
        valid_loss: List of validation losses, one entry per epoch.
    """
    fig = plt.figure(figsize=(10,8))
    plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss')
    plt.plot(range(1,len(valid_loss)+1),valid_loss, label='Validation Loss')

    # find position of lowest validation loss
    minposs = valid_loss.index(min(valid_loss))+1
    plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint')

    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.ylim(0, 3.5) # consistent scale
    plt.xlim(0, len(train_loss)+1) # consistent scale
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    fig.savefig('train5.png', bbox_inches='tight')
    # pyplot keeps every figure alive until it is closed explicitly
    plt.close(fig)
282 +
283 +if __name__ == '__main__':
284 + main()
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model3 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build the network and restore its weights from a saved checkpoint.

    Args:
        use_cuda: Move the model to the GPU when true.
        log_dir: Directory that holds ``checkpoint_<cp_num>.pth``.
        cp_num: Number of the checkpoint to load.
        embedding_size: Forwarded to ``background_resnet``.
        n_classes: Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model with restored weights, switched to evaluation mode.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
    # Tensors saved from a GPU are restored onto that GPU unless a
    # map_location is given, which fails when running without CUDA.
    map_location = None if use_cuda else 'cpu'
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature index into enrollment and test entries.

    Args:
        dataroot_dir: Location handed to ``read_feats_structure``.

    Returns:
        Tuple ``(enroll_DB, test_DB)`` of DataFrames with fresh 0-based
        indices, selected by whether the ``filename`` column contains
        ``'enroll.p'`` or ``'test.p'``.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False: match the literal names; by default the pattern is a
    # regular expression in which '.' matches any character.
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
38 +
def load_enroll_embeddings(embedding_dir):
    """Load every ``<speaker>.pth`` file of a directory into a dict.

    Args:
        embedding_dir: Directory containing one ``.pth`` file per speaker.

    Returns:
        Dict mapping the file name without its ``.pth`` extension to the
        object returned by ``torch.load`` for that file.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk, ext = os.path.splitext(f)
        # Skip anything that is not an embedding file (e.g. hidden or
        # temporary files), which torch.load could not read.
        if ext != '.pth':
            continue
        embeddings[spk] = torch.load(os.path.join(embedding_dir, f))

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalized embedding for a whole utterance.

    The utterance is cut into consecutive segments of ``test_frames`` frames
    (the last one may be shorter); the model's first output is summed over
    all segments and the sum is normalized with ``l2_norm``.

    Args:
        use_cuda: Move each segment to the GPU when true.
        filename: Feature file handed to ``read_MFB``.
        model: Network whose forward pass returns a pair; the first element
            is used as the activation.
        test_frames: Number of frames per segment.

    Returns:
        The normalized activation tensor.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    feats, _ = read_MFB(filename) # feats size:(n_frames, n_dims)

    tot_segments = math.ceil(len(feats)/test_frames) # total number of segments with 'test_frames'
    if tot_segments == 0:
        # Otherwise the integer 0 below would reach l2_norm and fail there
        raise ValueError('No frames found in %s' % filename)

    to_tensor = ToTensorTestInput()
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            segment = feats[i*test_frames:i*test_frames+test_frames]
            segment = to_tensor(segment) # size:(1, 1, n_dims, n_frames)

            if use_cuda:
                segment = segment.cuda()
            temp_activation, _ = model(segment)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation
70 +
def l2_norm(input, alpha):
    """Scale every row of a 2-D tensor to (near) unit L2 norm, times ``alpha``.

    Args:
        input: Tensor of size (n_rows, dim).
        alpha: Factor applied after normalization.

    Returns:
        Tensor of the same size as ``input``.
    """
    # 1e-10 keeps the division finite for all-zero rows
    row_norm = torch.sqrt(torch.pow(input, 2).sum(1).add_(1e-10))
    normalized = torch.div(input, row_norm.view(-1, 1).expand_as(input))
    return normalized.view(input.size()) * alpha
81 +
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Score one test utterance against an enrolled speaker and print the verdict.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        embeddings: Dict mapping speaker id to the enrolled embedding.
        enroll_speaker: Key of the enrolled speaker in ``embeddings``.
        test_filename: '/'-separated path of the test feature file; the
            part of its parent directory name before the first '_' is
            printed as the claimed speaker.
        test_frames: Forwarded to ``get_embeddings``.
        thres: The trial is accepted when the cosine similarity is strictly
            greater than this value.

    Raises:
        KeyError: If ``enroll_speaker`` is not in ``embeddings``.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    # The stored embedding may live on a different device than the one just
    # computed; cosine_similarity needs both on the same device.
    enroll_embedding = enroll_embedding.to(test_embedding.device)

    # .item() yields a plain Python float for the comparison and the '%f'
    # formatting below, instead of a 1-element NumPy array.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()

    result = 'Accept' if score > thres else 'Reject'

    test_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
98 +
def main():
    """Run one hard-coded speaker-verification trial and print the outcome."""
    # Trial: enrolled (true) speaker, claimed speaker and decision threshold.
    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    enroll_speaker = '103F3021'
    test_speaker = '207F2088'
    thres = 0.95

    # Model settings
    use_cuda = True        # Use cuda or not
    embedding_size = 128   # Dimension of speaker embeddings
    cp_num = 11            # Which checkpoint to use?
    n_classes = 241        # How many speakers in training data?
    test_frames = 100      # Split the test utterance

    # Locations of checkpoints, enrolled embeddings and test features
    log_dir = 'new_model3'
    embedding_dir = 'enroll_embeddings3'
    test_dir = 'feat_logfbank_nfilt40/test/'
    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframes for the test DB (not used further in this function)
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)
139 +
140 +if __name__ == '__main__':
141 + main()
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model4 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build the network and restore its weights from a saved checkpoint.

    Args:
        use_cuda: Move the model to the GPU when true.
        log_dir: Directory that holds ``checkpoint_<cp_num>.pth``.
        cp_num: Number of the checkpoint to load.
        embedding_size: Forwarded to ``background_resnet``.
        n_classes: Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model with restored weights, switched to evaluation mode.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
    # Tensors saved from a GPU are restored onto that GPU unless a
    # map_location is given, which fails when running without CUDA.
    map_location = None if use_cuda else 'cpu'
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature index into enrollment and test entries.

    Args:
        dataroot_dir: Location handed to ``read_feats_structure``.

    Returns:
        Tuple ``(enroll_DB, test_DB)`` of DataFrames with fresh 0-based
        indices, selected by whether the ``filename`` column contains
        ``'enroll.p'`` or ``'test.p'``.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False: match the literal names; by default the pattern is a
    # regular expression in which '.' matches any character.
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
38 +
def load_enroll_embeddings(embedding_dir):
    """Load every ``<speaker>.pth`` file of a directory into a dict.

    Args:
        embedding_dir: Directory containing one ``.pth`` file per speaker.

    Returns:
        Dict mapping the file name without its ``.pth`` extension to the
        object returned by ``torch.load`` for that file.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk, ext = os.path.splitext(f)
        # Skip anything that is not an embedding file (e.g. hidden or
        # temporary files), which torch.load could not read.
        if ext != '.pth':
            continue
        embeddings[spk] = torch.load(os.path.join(embedding_dir, f))

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalized embedding for a whole utterance.

    The utterance is cut into consecutive segments of ``test_frames`` frames
    (the last one may be shorter); the model's first output is summed over
    all segments and the sum is normalized with ``l2_norm``.

    Args:
        use_cuda: Move each segment to the GPU when true.
        filename: Feature file handed to ``read_MFB``.
        model: Network whose forward pass returns a pair; the first element
            is used as the activation.
        test_frames: Number of frames per segment.

    Returns:
        The normalized activation tensor.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    feats, _ = read_MFB(filename) # feats size:(n_frames, n_dims)

    tot_segments = math.ceil(len(feats)/test_frames) # total number of segments with 'test_frames'
    if tot_segments == 0:
        # Otherwise the integer 0 below would reach l2_norm and fail there
        raise ValueError('No frames found in %s' % filename)

    to_tensor = ToTensorTestInput()
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            segment = feats[i*test_frames:i*test_frames+test_frames]
            segment = to_tensor(segment) # size:(1, 1, n_dims, n_frames)

            if use_cuda:
                segment = segment.cuda()
            temp_activation, _ = model(segment)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation
70 +
def l2_norm(input, alpha):
    """Scale every row of a 2-D tensor to (near) unit L2 norm, times ``alpha``.

    Args:
        input: Tensor of size (n_rows, dim).
        alpha: Factor applied after normalization.

    Returns:
        Tensor of the same size as ``input``.
    """
    # 1e-10 keeps the division finite for all-zero rows
    row_norm = torch.sqrt(torch.pow(input, 2).sum(1).add_(1e-10))
    normalized = torch.div(input, row_norm.view(-1, 1).expand_as(input))
    return normalized.view(input.size()) * alpha
81 +
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Score one test utterance against an enrolled speaker and print the verdict.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        embeddings: Dict mapping speaker id to the enrolled embedding.
        enroll_speaker: Key of the enrolled speaker in ``embeddings``.
        test_filename: '/'-separated path of the test feature file; the
            part of its parent directory name before the first '_' is
            printed as the claimed speaker.
        test_frames: Forwarded to ``get_embeddings``.
        thres: The trial is accepted when the cosine similarity is strictly
            greater than this value.

    Raises:
        KeyError: If ``enroll_speaker`` is not in ``embeddings``.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    # The stored embedding may live on a different device than the one just
    # computed; cosine_similarity needs both on the same device.
    enroll_embedding = enroll_embedding.to(test_embedding.device)

    # .item() yields a plain Python float for the comparison and the '%f'
    # formatting below, instead of a 1-element NumPy array.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()

    result = 'Accept' if score > thres else 'Reject'

    test_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
98 +
def main():
    """Run one hard-coded speaker-verification trial and print the outcome."""
    # Trial: enrolled (true) speaker, claimed speaker and decision threshold.
    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    enroll_speaker = '229M2031'
    test_speaker = 'sunghwan1'
    thres = 0.95

    # Model settings
    use_cuda = True        # Use cuda or not
    embedding_size = 128   # Dimension of speaker embeddings
    cp_num = 25            # Which checkpoint to use?
    n_classes = 241        # How many speakers in training data?
    test_frames = 100      # Split the test utterance

    # Locations of checkpoints, enrolled embeddings and test features
    log_dir = 'new_model4'
    embedding_dir = 'enroll_embeddings4'
    test_dir = 'feat_logfbank_nfilt40/test/'
    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframes for the test DB (not used further in this function)
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)
139 +
140 +if __name__ == '__main__':
141 + main()
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model4 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build the network and restore its weights from a saved checkpoint.

    Args:
        use_cuda: Move the model to the GPU when true.
        log_dir: Directory that holds ``checkpoint_<cp_num>.pth``.
        cp_num: Number of the checkpoint to load.
        embedding_size: Forwarded to ``background_resnet``.
        n_classes: Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model with restored weights, switched to evaluation mode.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
    # Tensors saved from a GPU are restored onto that GPU unless a
    # map_location is given, which fails when running without CUDA.
    map_location = None if use_cuda else 'cpu'
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature index into enrollment and test entries.

    Args:
        dataroot_dir: Location handed to ``read_feats_structure``.

    Returns:
        Tuple ``(enroll_DB, test_DB)`` of DataFrames with fresh 0-based
        indices, selected by whether the ``filename`` column contains
        ``'enroll.p'`` or ``'test.p'``.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False: match the literal names; by default the pattern is a
    # regular expression in which '.' matches any character.
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
38 +
def load_enroll_embeddings(embedding_dir):
    """Load every ``<speaker>.pth`` file of a directory into a dict.

    Args:
        embedding_dir: Directory containing one ``.pth`` file per speaker.

    Returns:
        Dict mapping the file name without its ``.pth`` extension to the
        object returned by ``torch.load`` for that file.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk, ext = os.path.splitext(f)
        # Skip anything that is not an embedding file (e.g. hidden or
        # temporary files), which torch.load could not read.
        if ext != '.pth':
            continue
        embeddings[spk] = torch.load(os.path.join(embedding_dir, f))

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalized embedding for a whole utterance.

    The utterance is cut into consecutive segments of ``test_frames`` frames
    (the last one may be shorter); the model's first output is summed over
    all segments and the sum is normalized with ``l2_norm``.

    Args:
        use_cuda: Move each segment to the GPU when true.
        filename: Feature file handed to ``read_MFB``.
        model: Network whose forward pass returns a pair; the first element
            is used as the activation.
        test_frames: Number of frames per segment.

    Returns:
        The normalized activation tensor.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    feats, _ = read_MFB(filename) # feats size:(n_frames, n_dims)

    tot_segments = math.ceil(len(feats)/test_frames) # total number of segments with 'test_frames'
    if tot_segments == 0:
        # Otherwise the integer 0 below would reach l2_norm and fail there
        raise ValueError('No frames found in %s' % filename)

    to_tensor = ToTensorTestInput()
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            segment = feats[i*test_frames:i*test_frames+test_frames]
            segment = to_tensor(segment) # size:(1, 1, n_dims, n_frames)

            if use_cuda:
                segment = segment.cuda()
            temp_activation, _ = model(segment)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation
70 +
def l2_norm(input, alpha):
    """Scale every row of a 2-D tensor to (near) unit L2 norm, times ``alpha``.

    Args:
        input: Tensor of size (n_rows, dim).
        alpha: Factor applied after normalization.

    Returns:
        Tensor of the same size as ``input``.
    """
    # 1e-10 keeps the division finite for all-zero rows
    row_norm = torch.sqrt(torch.pow(input, 2).sum(1).add_(1e-10))
    normalized = torch.div(input, row_norm.view(-1, 1).expand_as(input))
    return normalized.view(input.size()) * alpha
81 +
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Score one test utterance against an enrolled speaker and print the verdict.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        embeddings: Dict mapping speaker id to the enrolled embedding.
        enroll_speaker: Key of the enrolled speaker in ``embeddings``.
        test_filename: '/'-separated path of the test feature file; the
            part of its parent directory name before the first '_' is
            printed as the claimed speaker.
        test_frames: Forwarded to ``get_embeddings``.
        thres: The trial is accepted when the cosine similarity is strictly
            greater than this value.

    Raises:
        KeyError: If ``enroll_speaker`` is not in ``embeddings``.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    # The stored embedding may live on a different device than the one just
    # computed; cosine_similarity needs both on the same device.
    enroll_embedding = enroll_embedding.to(test_embedding.device)

    # .item() yields a plain Python float for the comparison and the '%f'
    # formatting below, instead of a 1-element NumPy array.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()

    result = 'Accept' if score > thres else 'Reject'

    test_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
98 +
def main():
    """Run one hard-coded speaker-verification trial and print the outcome."""
    # Trial: enrolled (true) speaker, claimed speaker and decision threshold.
    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    enroll_speaker = '213F5100'
    test_speaker = '207F2088'
    thres = 0.95

    # Model settings
    use_cuda = True        # Use cuda or not
    embedding_size = 128   # Dimension of speaker embeddings
    cp_num = 50            # Which checkpoint to use?
    n_classes = 348        # How many speakers in training data?
    test_frames = 100      # Split the test utterance

    # Locations of checkpoints, enrolled embeddings and test features
    log_dir = 'new_model4_merge'
    embedding_dir = 'enroll_embeddings4_merge'
    test_dir = 'feat_logfbank_nfilt40/test/'
    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframes for the test DB (not used further in this function)
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)
139 +
140 +if __name__ == '__main__':
141 + main()
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model4 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build the network and restore its weights from a saved checkpoint.

    Args:
        use_cuda: Move the model to the GPU when true.
        log_dir: Directory that holds ``checkpoint_<cp_num>.pth``.
        cp_num: Number of the checkpoint to load.
        embedding_size: Forwarded to ``background_resnet``.
        n_classes: Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model with restored weights, switched to evaluation mode.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
    # Tensors saved from a GPU are restored onto that GPU unless a
    # map_location is given, which fails when running without CUDA.
    map_location = None if use_cuda else 'cpu'
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature index into enrollment and test entries.

    Args:
        dataroot_dir: Location handed to ``read_feats_structure``.

    Returns:
        Tuple ``(enroll_DB, test_DB)`` of DataFrames with fresh 0-based
        indices, selected by whether the ``filename`` column contains
        ``'enroll.p'`` or ``'test.p'``.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False: match the literal names; by default the pattern is a
    # regular expression in which '.' matches any character.
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
38 +
def load_enroll_embeddings(embedding_dir):
    """Load every ``<speaker>.pth`` file of a directory into a dict.

    Args:
        embedding_dir: Directory containing one ``.pth`` file per speaker.

    Returns:
        Dict mapping the file name without its ``.pth`` extension to the
        object returned by ``torch.load`` for that file.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk, ext = os.path.splitext(f)
        # Skip anything that is not an embedding file (e.g. hidden or
        # temporary files), which torch.load could not read.
        if ext != '.pth':
            continue
        embeddings[spk] = torch.load(os.path.join(embedding_dir, f))

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalized embedding for a whole utterance.

    The utterance is cut into consecutive segments of ``test_frames`` frames
    (the last one may be shorter); the model's first output is summed over
    all segments and the sum is normalized with ``l2_norm``.

    Args:
        use_cuda: Move each segment to the GPU when true.
        filename: Feature file handed to ``read_MFB``.
        model: Network whose forward pass returns a pair; the first element
            is used as the activation.
        test_frames: Number of frames per segment.

    Returns:
        The normalized activation tensor.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    feats, _ = read_MFB(filename) # feats size:(n_frames, n_dims)

    tot_segments = math.ceil(len(feats)/test_frames) # total number of segments with 'test_frames'
    if tot_segments == 0:
        # Otherwise the integer 0 below would reach l2_norm and fail there
        raise ValueError('No frames found in %s' % filename)

    to_tensor = ToTensorTestInput()
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            segment = feats[i*test_frames:i*test_frames+test_frames]
            segment = to_tensor(segment) # size:(1, 1, n_dims, n_frames)

            if use_cuda:
                segment = segment.cuda()
            temp_activation, _ = model(segment)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation
70 +
def l2_norm(input, alpha):
    """Scale every row of a 2-D tensor to (near) unit L2 norm, times ``alpha``.

    Args:
        input: Tensor of size (n_rows, dim).
        alpha: Factor applied after normalization.

    Returns:
        Tensor of the same size as ``input``.
    """
    # 1e-10 keeps the division finite for all-zero rows
    row_norm = torch.sqrt(torch.pow(input, 2).sum(1).add_(1e-10))
    normalized = torch.div(input, row_norm.view(-1, 1).expand_as(input))
    return normalized.view(input.size()) * alpha
81 +
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Score one test utterance against an enrolled speaker and print the verdict.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        embeddings: Dict mapping speaker id to the enrolled embedding.
        enroll_speaker: Key of the enrolled speaker in ``embeddings``.
        test_filename: '/'-separated path of the test feature file; the
            part of its parent directory name before the first '_' is
            printed as the claimed speaker.
        test_frames: Forwarded to ``get_embeddings``.
        thres: The trial is accepted when the cosine similarity is strictly
            greater than this value.

    Raises:
        KeyError: If ``enroll_speaker`` is not in ``embeddings``.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    # The stored embedding may live on a different device than the one just
    # computed; cosine_similarity needs both on the same device.
    enroll_embedding = enroll_embedding.to(test_embedding.device)

    # .item() yields a plain Python float for the comparison and the '%f'
    # formatting below, instead of a 1-element NumPy array.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()

    result = 'Accept' if score > thres else 'Reject'

    test_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
98 +
def main():
    """Run one hard-coded speaker-verification trial and print the outcome."""
    # Trial: enrolled (true) speaker, claimed speaker and decision threshold.
    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    enroll_speaker = '777M7777'
    test_speaker = '103F3021'
    thres = 0.95

    # Model settings
    use_cuda = True        # Use cuda or not
    embedding_size = 128   # Dimension of speaker embeddings
    cp_num = 30            # Which checkpoint to use?
    n_classes = 105        # How many speakers in training data?
    test_frames = 100      # Split the test utterance

    # Locations of checkpoints, enrolled embeddings and test features
    log_dir = 'new_model4_zeroth'
    embedding_dir = 'enroll_embeddings4_zeroth'
    test_dir = 'feat_logfbank_nfilt40/test/'
    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframes for the test DB (not used further in this function)
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)
139 +
140 +if __name__ == '__main__':
141 + main()
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model5 import background_resnet
13 +
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build the network and restore its weights from a saved checkpoint.

    Args:
        use_cuda: Move the model to the GPU when true.
        log_dir: Directory that holds ``checkpoint_<cp_num>.pth``.
        cp_num: Number of the checkpoint to load.
        embedding_size: Forwarded to ``background_resnet``.
        n_classes: Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model with restored weights, switched to evaluation mode.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
    # Tensors saved from a GPU are restored onto that GPU unless a
    # map_location is given, which fails when running without CUDA.
    map_location = None if use_cuda else 'cpu'
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
25 +
def split_enroll_and_test(dataroot_dir):
    """Split the feature index into enrollment and test entries.

    Args:
        dataroot_dir: Location handed to ``read_feats_structure``.

    Returns:
        Tuple ``(enroll_DB, test_DB)`` of DataFrames with fresh 0-based
        indices, selected by whether the ``filename`` column contains
        ``'enroll.p'`` or ``'test.p'``.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False: match the literal names; by default the pattern is a
    # regular expression in which '.' matches any character.
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
38 +
def load_enroll_embeddings(embedding_dir):
    """Load every ``<speaker>.pth`` file of a directory into a dict.

    Args:
        embedding_dir: Directory containing one ``.pth`` file per speaker.

    Returns:
        Dict mapping the file name without its ``.pth`` extension to the
        object returned by ``torch.load`` for that file.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk, ext = os.path.splitext(f)
        # Skip anything that is not an embedding file (e.g. hidden or
        # temporary files), which torch.load could not read.
        if ext != '.pth':
            continue
        embeddings[spk] = torch.load(os.path.join(embedding_dir, f))

    return embeddings
49 +
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalized embedding for a whole utterance.

    The utterance is cut into consecutive segments of ``test_frames`` frames
    (the last one may be shorter); the model's first output is summed over
    all segments and the sum is normalized with ``l2_norm``.

    Args:
        use_cuda: Move each segment to the GPU when true.
        filename: Feature file handed to ``read_MFB``.
        model: Network whose forward pass returns a pair; the first element
            is used as the activation.
        test_frames: Number of frames per segment.

    Returns:
        The normalized activation tensor.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    feats, _ = read_MFB(filename) # feats size:(n_frames, n_dims)

    tot_segments = math.ceil(len(feats)/test_frames) # total number of segments with 'test_frames'
    if tot_segments == 0:
        # Otherwise the integer 0 below would reach l2_norm and fail there
        raise ValueError('No frames found in %s' % filename)

    to_tensor = ToTensorTestInput()
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            segment = feats[i*test_frames:i*test_frames+test_frames]
            segment = to_tensor(segment) # size:(1, 1, n_dims, n_frames)

            if use_cuda:
                segment = segment.cuda()
            temp_activation, _ = model(segment)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation
70 +
def l2_norm(input, alpha):
    """Scale every row of a 2-D tensor to (near) unit L2 norm, times ``alpha``.

    Args:
        input: Tensor of size (n_rows, dim).
        alpha: Factor applied after normalization.

    Returns:
        Tensor of the same size as ``input``.
    """
    # 1e-10 keeps the division finite for all-zero rows
    row_norm = torch.sqrt(torch.pow(input, 2).sum(1).add_(1e-10))
    normalized = torch.div(input, row_norm.view(-1, 1).expand_as(input))
    return normalized.view(input.size()) * alpha
81 +
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Score one test utterance against an enrolled speaker and print the verdict.

    Args:
        use_cuda: Forwarded to ``get_embeddings``.
        model: Forwarded to ``get_embeddings``.
        embeddings: Dict mapping speaker id to the enrolled embedding.
        enroll_speaker: Key of the enrolled speaker in ``embeddings``.
        test_filename: '/'-separated path of the test feature file; the
            part of its parent directory name before the first '_' is
            printed as the claimed speaker.
        test_frames: Forwarded to ``get_embeddings``.
        thres: The trial is accepted when the cosine similarity is strictly
            greater than this value.

    Raises:
        KeyError: If ``enroll_speaker`` is not in ``embeddings``.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    # The stored embedding may live on a different device than the one just
    # computed; cosine_similarity needs both on the same device.
    enroll_embedding = enroll_embedding.to(test_embedding.device)

    # .item() yields a plain Python float for the comparison and the '%f'
    # formatting below, instead of a 1-element NumPy array.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()

    result = 'Accept' if score > thres else 'Reject'

    test_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
98 +
def main():
    """Run one hard-coded speaker-verification trial and print the outcome."""
    # Trial: enrolled (true) speaker, claimed speaker and decision threshold.
    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    enroll_speaker = '777M7777'
    test_speaker = 'sunghwan1'
    thres = 0.95

    # Model settings
    use_cuda = True        # Use cuda or not
    embedding_size = 128   # Dimension of speaker embeddings
    cp_num = 30            # Which checkpoint to use?
    n_classes = 241        # How many speakers in training data?
    test_frames = 100      # Split the test utterance

    # Locations of checkpoints, enrolled embeddings and test features
    log_dir = 'new_model5'
    embedding_dir = 'enroll_embeddings5'
    test_dir = 'feat_logfbank_nfilt40/test/'
    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframes for the test DB (not used further in this function)
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)
139 +
140 +if __name__ == '__main__':
141 + main()