김건

Model2 - resnet34 commit
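
This commit adds four files: the enrollment script, the identification script, the model definition (model2, ResNet-34 backbone), and the verification script.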

import torch

import math
import os
import configure as c

from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model2 import background_resnet

def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    # Load the checkpoint saved at epoch `cp_num`
    checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

def split_enroll_and_test(dataroot_dir):
    DB_all = read_feats_structure(dataroot_dir)

    # Pickled feature files are named 'enroll.p' and 'test.p'
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p')]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB

def get_embeddings(use_cuda, filename, model, test_frames):
    input, label = read_MFB(filename)  # input size: (n_frames, n_dims)

    tot_segments = math.ceil(len(input) / test_frames)  # number of segments of length 'test_frames'
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            temp_input = input[i * test_frames:i * test_frames + test_frames]

            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size: (1, 1, n_dims, n_frames)

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation

def l2_norm(input, alpha):
    input_size = input.size()  # size: (n_frames, dim)
    buffer = torch.pow(input, 2)  # elementwise square, size: (n_frames, dim)
    normp = torch.sum(buffer, 1).add_(1e-10)  # size: (n_frames,)
    norm = torch.sqrt(normp)  # size: (n_frames,)
    _output = torch.div(input, norm.view(-1, 1).expand_as(input))
    output = _output.view(input_size)
    # Scale by alpha; https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10,
    # but the callers in this repo pass alpha = 1 (plain L2 normalization).
    output = output * alpha
    return output

def enroll_per_spk(use_cuda, test_frames, model, DB, embedding_dir):
    """
    Output the summed (L2-normalized, per utterance) d-vector for each
    enrollment speaker. Return a dictionary of length n_spk.
    """
    n_files = len(DB)  # number of enrollment utterances
    enroll_speaker_list = sorted(set(DB['speaker_id']))

    embeddings = {}

    # Aggregate all the activations
    print("Start to aggregate all the d-vectors per enroll speaker")

    for i in range(n_files):
        filename = DB['filename'][i]
        spk = DB['speaker_id'][i]

        activation = get_embeddings(use_cuda, filename, model, test_frames)
        if spk in embeddings:
            embeddings[spk] += activation
        else:
            embeddings[spk] = activation

        print("Aggregated the activation (spk : %s)" % spk)

    if not os.path.exists(embedding_dir):
        os.makedirs(embedding_dir)

    # Save the embeddings
    for spk_index in enroll_speaker_list:
        embedding_path = os.path.join(embedding_dir, spk_index + '.pth')
        torch.save(embeddings[spk_index], embedding_path)
        print("Saved the embeddings for %s" % spk_index)
    return embeddings

def main():
    # Settings
    use_cuda = True
    log_dir = 'new_model2'
    embedding_size = 128
    cp_num = 24  # which checkpoint to use
    n_classes = 241
    test_frames = 200

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for the enroll DB
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Where to save embeddings
    embedding_dir = 'enroll_embeddings2'

    # Perform the enrollment and save the results
    enroll_per_spk(use_cuda, test_frames, model, enroll_DB, embedding_dir)

    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """

if __name__ == '__main__':
    main()
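
For reference, a quick sanity check of the pooling logic above: get_embeddings sums per-segment activations and L2-normalizes the result, and enroll_per_spk then sums (rather than averages) the per-utterance d-vectors. Under the cosine scoring used by the scripts below this is equivalent to averaging, since cosine similarity is scale-invariant. A minimal standalone sketch (the tensors are dummies, not repo data):

import torch
import torch.nn.functional as F

v = torch.randn(1, 128)                    # dummy 128-dim d-vector
summed = v * 3                             # sum of three identical segment vectors
mean = summed / 3                          # their average
print(F.cosine_similarity(summed, mean))   # tensor([1.]): cosine ignores scale
print((summed / summed.norm()).norm())     # ~1.0: what l2_norm(..., alpha=1) produces

The next file is the identification script.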
import torch
import torch.nn.functional as F

import math
import os
import configure as c

from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model2 import background_resnet

def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    # Load the checkpoint saved at epoch `cp_num`
    checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

def split_enroll_and_test(dataroot_dir):
    DB_all = read_feats_structure(dataroot_dir)

    # Pickled feature files are named 'enroll.p' and 'test.p'
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p')]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB

def load_enroll_embeddings(embedding_dir):
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk = f.replace('.pth', '')
        # Load the saved d-vector for each enrolled speaker
        embedding_path = os.path.join(embedding_dir, f)
        tmp_embeddings = torch.load(embedding_path)
        embeddings[spk] = tmp_embeddings

    return embeddings

def get_embeddings(use_cuda, filename, model, test_frames):
    input, label = read_MFB(filename)  # input size: (n_frames, n_dims)

    tot_segments = math.ceil(len(input) / test_frames)  # number of segments of length 'test_frames'
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            temp_input = input[i * test_frames:i * test_frames + test_frames]

            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size: (1, 1, n_dims, n_frames)

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation

def l2_norm(input, alpha):
    input_size = input.size()  # size: (n_frames, dim)
    buffer = torch.pow(input, 2)  # elementwise square, size: (n_frames, dim)
    normp = torch.sum(buffer, 1).add_(1e-10)  # size: (n_frames,)
    norm = torch.sqrt(normp)  # size: (n_frames,)
    _output = torch.div(input, norm.view(-1, 1).expand_as(input))
    output = _output.view(input_size)
    # Scale by alpha; https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10,
    # but the callers in this repo pass alpha = 1 (plain L2 normalization).
    output = output * alpha
    return output

def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = -float('inf')
    best_spk = None
    for spk in spk_list:
        score = F.cosine_similarity(test_embedding, embeddings[spk])
        score = score.item()  # extract the scalar cosine score
        if score > max_score:
            max_score = score
            best_spk = spk
    # The true speaker ID is the parent directory name of the test file
    true_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" % (true_spk, best_spk, true_spk == best_spk))
    return best_spk

def main():
    log_dir = 'new_model2'                    # where the checkpoints are saved
    embedding_dir = 'enroll_embeddings2'      # where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # where test features are saved

    # Settings
    use_cuda = True       # use CUDA or not
    embedding_size = 128  # dimension of speaker embeddings
    cp_num = 30           # which checkpoint to use
    n_classes = 241       # number of speakers in the training data
    test_frames = 100     # segment length used to split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for the test DB
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """

    spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
                '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
                '777M7777', '778M8777']

    # Set the test speaker
    test_speaker = '236M3043'

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)

if __name__ == '__main__':
    main()
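
The loop in perform_identification scores the test d-vector against each enrolled speaker one at a time. An equivalent vectorized form stacks the enrolled embeddings into one matrix and scores them in a single call; a sketch with hypothetical random embeddings standing in for the dict returned by load_enroll_embeddings (assumes a PyTorch version where F.cosine_similarity broadcasts):

import torch
import torch.nn.functional as F

# Hypothetical stand-ins for the (1, 128) tensors saved during enrollment.
embeddings = {spk: torch.randn(1, 128) for spk in ['103F3021', '207F2088', '213F5100']}
test_embedding = torch.randn(1, 128)

spk_list = list(embeddings)
enrolled = torch.cat([embeddings[s] for s in spk_list], dim=0)  # (n_spk, 128)
scores = F.cosine_similarity(test_embedding, enrolled)          # (n_spk,), broadcast over rows
best_spk = spk_list[scores.argmax().item()]
print(best_spk, scores.max().item())

The next file defines the model itself.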
import torch
import torch.nn as nn
import torch.nn.functional as F
import model.resnet as resnet


class background_resnet(nn.Module):
    def __init__(self, embedding_size, num_classes, backbone='resnet34'):
        super(background_resnet, self).__init__()
        self.backbone = backbone
        # Instantiate the chosen backbone (repo-local ResNet variants)
        if backbone == 'resnet50':
            self.pretrained = resnet.resnet50(pretrained=False)
        elif backbone == 'resnet101':
            self.pretrained = resnet.resnet101(pretrained=False)
        elif backbone == 'resnet152':
            self.pretrained = resnet.resnet152(pretrained=False)
        elif backbone == 'resnet18':
            self.pretrained = resnet.resnet18(pretrained=False)
        elif backbone == 'resnet34':
            self.pretrained = resnet.resnet34(pretrained=False)
        else:
            raise RuntimeError('unknown backbone: {}'.format(backbone))

        self.fc0 = nn.Linear(128, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        # input x: (minibatch, 1, n_filts, n_frames)
        x = self.pretrained.conv1(x)
        x = self.pretrained.bn1(x)
        x = self.pretrained.relu(x)

        x = self.pretrained.layer1(x)
        x = self.pretrained.layer2(x)
        x = self.pretrained.layer3(x)
        x = self.pretrained.layer4(x)

        out = F.adaptive_avg_pool2d(x, 1)  # (batch, 128, 1, 1)
        out = torch.squeeze(out)  # drops the singleton dims (and the batch dim when batch == 1)
        # Flatten explicitly so the fully connected layer always sees (batch, 128)
        out = out.view(x.size(0), -1)
        spk_embedding = self.fc0(out)
        out = F.relu(self.bn0(spk_embedding))  # (batch, n_embed)
        out = self.last(out)  # (batch, num_classes) logits

        return spk_embedding, out
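
A quick shape check of the forward pass, as a sketch. It assumes the repo-local model.resnet, whose layer4 evidently outputs 128 channels (hence nn.Linear(128, embedding_size) above; torchvision's resnet34 would give 512), and a 40-filter log-filterbank input:

import torch
from model.model2 import background_resnet

model = background_resnet(embedding_size=128, num_classes=241)
model.eval()
with torch.no_grad():
    x = torch.randn(2, 1, 40, 200)        # (batch, channel, n_filts, n_frames)
    spk_embedding, logits = model(x)
print(spk_embedding.shape, logits.shape)  # expected: (2, 128) and (2, 241)

The last file is the verification script.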
import torch
import torch.nn.functional as F

import math
import os
import configure as c

from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model2 import background_resnet

def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    # Load the checkpoint saved at epoch `cp_num`
    checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

def split_enroll_and_test(dataroot_dir):
    DB_all = read_feats_structure(dataroot_dir)

    # Pickled feature files are named 'enroll.p' and 'test.p'
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p')]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB

def load_enroll_embeddings(embedding_dir):
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk = f.replace('.pth', '')
        # Load the saved d-vector for each enrolled speaker
        embedding_path = os.path.join(embedding_dir, f)
        tmp_embeddings = torch.load(embedding_path)
        embeddings[spk] = tmp_embeddings

    return embeddings

def get_embeddings(use_cuda, filename, model, test_frames):
    input, label = read_MFB(filename)  # input size: (n_frames, n_dims)

    tot_segments = math.ceil(len(input) / test_frames)  # number of segments of length 'test_frames'
    activation = 0
    with torch.no_grad():
        for i in range(tot_segments):
            temp_input = input[i * test_frames:i * test_frames + test_frames]

            TT = ToTensorTestInput()
            temp_input = TT(temp_input)  # size: (1, 1, n_dims, n_frames)

            if use_cuda:
                temp_input = temp_input.cuda()
            temp_activation, _ = model(temp_input)
            activation += torch.sum(temp_activation, dim=0, keepdim=True)

    activation = l2_norm(activation, 1)

    return activation

def l2_norm(input, alpha):
    input_size = input.size()  # size: (n_frames, dim)
    buffer = torch.pow(input, 2)  # elementwise square, size: (n_frames, dim)
    normp = torch.sum(buffer, 1).add_(1e-10)  # size: (n_frames,)
    norm = torch.sqrt(normp)  # size: (n_frames,)
    _output = torch.div(input, norm.view(-1, 1).expand_as(input))
    output = _output.view(input_size)
    # Scale by alpha; https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10,
    # but the callers in this repo pass alpha = 1 (plain L2 normalization).
    output = output * alpha
    return output

def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    score = F.cosine_similarity(test_embedding, enroll_embedding)
    score = score.item()  # extract the scalar cosine score

    if score > thres:
        result = 'Accept'
    else:
        result = 'Reject'

    # The speaker who actually produced the test utterance
    test_spk = test_filename.split('/')[-2].split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker : %s\nClaimed speaker : %s\n\nResult : %s\n" % (test_spk, enroll_speaker, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" % (score, thres))

def main():
    log_dir = 'new_model2'                    # where the checkpoints are saved
    embedding_dir = 'enroll_embeddings2'      # where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # where test features are saved

    # Settings
    use_cuda = True       # use CUDA or not
    embedding_size = 128  # dimension of speaker embeddings
    cp_num = 30           # which checkpoint to use
    n_classes = 241       # number of speakers in the training data
    test_frames = 100     # segment length used to split the test utterance

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for the test DB
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    """ Test speaker list
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
    """

    # Set the enrolled (claimed) speaker
    enroll_speaker = '103F3021'

    # Set the test speaker (the true identity of the test utterance)
    test_speaker = '207F2088'

    # Threshold for accepting the claim
    thres = 0.95

    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)

if __name__ == '__main__':
    main()
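
The threshold of 0.95 above is a hard-coded operating point; in practice it would be tuned on held-out genuine and impostor trials. A minimal sketch of choosing the threshold where false accepts and false rejects balance (the equal-error-rate point); the score lists here are hypothetical, not from this repo:

import numpy as np

genuine = np.array([0.97, 0.99, 0.96, 0.93, 0.98])   # same-speaker trial scores
impostor = np.array([0.41, 0.77, 0.52, 0.96, 0.63])  # different-speaker trial scores

def far(t):  # false accept rate at threshold t
    return np.mean(impostor >= t)

def frr(t):  # false reject rate at threshold t
    return np.mean(genuine < t)

candidates = np.sort(np.concatenate([genuine, impostor]))
thres = min(candidates, key=lambda t: abs(far(t) - frr(t)))
print("threshold %.2f : FAR %.2f, FRR %.2f" % (thres, far(thres), frr(thres)))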