Showing
16 changed files
with
591 additions
and
0 deletions
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
source/server/app.py
0 → 100644
1 | +from flask import Flask, request, send_file | ||
2 | +from extract_feature4 import extract | ||
3 | +from verification4_merge import load_model, load_enroll_embeddings,perform_verification | ||
4 | +from identification4 import perform_identification | ||
5 | +from enroll4_merge import split_enroll_and_test,enroll_per_spk | ||
6 | +import os | ||
7 | +import shutil | ||
8 | + | ||
# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)

# Filesystem locations, relative to the server's working directory.
log_dir = '../new_model4_merge'  # Where the checkpoints are saved
embedding_dir = '../enroll_embeddings4_merge'  # Where embeddings are saved
test_dir = '../feat_logfbank_nfilt40/test/'  # Where test features are saved

# Settings
use_cuda = True  # Use cuda or not
embedding_size = 128  # Dimension of speaker embeddings
cp_num = 50  # Which checkpoint to use?
n_classes = 348  # How many speakers in training data?
test_frames = 100  # Split the test utterance

# The model and the enrolled embeddings are loaded once, at import time.
model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
embeddings = load_enroll_embeddings(embedding_dir)

# Feature file that the request handlers have `extract` produce for each upload.
test_path = './test.p'

# Speakers considered by identification; `/enroll` appends to this list.
spk_list = [
    '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
]
26 | + | ||
27 | + | ||
def enrollment():
    """Re-enroll every speaker found under `test_dir` and refresh the cache.

    Rebuilds the per-speaker embeddings from the `enroll.p` feature files,
    saves them to `embedding_dir`, then reloads the module-level
    `embeddings` dictionary from disk.

    Raises:
        Exception: any error from the enrollment pipeline is printed and
            re-raised so the caller can report the failure instead of
            silently treating the enrollment as successful.
    """
    global embeddings
    try:
        # Only the enrollment half of the split is needed here.
        enroll_DB, _ = split_enroll_and_test(test_dir)
        enroll_per_spk(use_cuda, test_frames, model, enroll_DB, embedding_dir)
        embeddings = load_enroll_embeddings(embedding_dir)
    except Exception as e:
        print(e)
        raise
37 | + | ||
def verification(enroll_speaker, thres=0.95):
    """Verify the most recently extracted utterance against one enrolled speaker.

    Args:
        enroll_speaker: key of the enrolled speaker in `embeddings`.
        thres: cosine-similarity threshold forwarded to
            `perform_verification` (defaults to the previous fixed 0.95).

    Returns:
        Whatever `perform_verification` returns for the features stored at
        `test_path`.
    """
    # Perform the test
    return perform_verification(use_cuda, model, embeddings, enroll_speaker,
                                test_path, test_frames, thres)
43 | + | ||
def identification():
    """Return the speaker from `spk_list` that best matches the features at `test_path`."""
    return perform_identification(use_cuda, model, embeddings, test_path,
                                  test_frames, spk_list)
47 | + | ||
48 | + | ||
49 | + | ||
50 | + | ||
@app.route('/enroll', methods=['POST', "GET"])
def enroll_controller():
    """Enroll a new speaker from an uploaded recording.

    POST expects a multipart form with a `file` upload and an
    `enroll_speaker` field. The recording is saved, features are extracted,
    the feature file is moved into the speaker's directory and enrollment is
    re-run.

    Returns:
        'enroll_complete' on success, 'failed' when the speaker id is not a
        plain directory name or enrollment raises, and 'get' for GET requests.
    """
    if request.method != 'POST':
        return 'get'

    f = request.files['file']
    enroll_speaker = request.form['enroll_speaker']

    # The speaker id is used as a directory name, so refuse anything that
    # could point outside the feature directory.
    if (not enroll_speaker
            or enroll_speaker in ('.', '..')
            or '/' in enroll_speaker
            or '\\' in enroll_speaker):
        return 'failed'

    print(f.name)
    f.save('./myrequest_enroll.wav')
    extract('./myrequest_enroll.wav', enroll_speaker)

    new_path = os.path.join('../feat_logfbank_nfilt40/test/', enroll_speaker)
    # exist_ok lets an already-enrolled speaker be enrolled again.
    os.makedirs(new_path, exist_ok=True)
    shutil.move('./enroll.p', os.path.join(new_path, 'enroll.p'))

    try:
        enrollment()
    except Exception:
        return 'failed'

    # Keep the identification candidates free of duplicates.
    if enroll_speaker not in spk_list:
        spk_list.append(enroll_speaker)
    return 'enroll_complete'
73 | + | ||
74 | + | ||
75 | + | ||
@app.route('/verification', methods=['POST', "GET"])
def verfication_controller():
    """Score an uploaded recording against a claimed enrolled speaker.

    POST expects a multipart form with a `file` upload and an
    `enroll_speaker` field.

    Returns:
        The score returned by `verification` for POST, a 404 response when
        the lookup raises KeyError (presumably an unenrolled speaker id), and
        'get' for GET requests.
    """
    if request.method != 'POST':
        return 'get'

    f = request.files['file']
    enroll_speaker = request.form['enroll_speaker']
    print(f.name)
    f.save('./myrequest.wav')
    extract('./myrequest.wav')
    try:
        _, score = verification(enroll_speaker)
    except KeyError:
        # The embedding lookup is keyed by speaker id; answer with a client
        # error rather than an unhandled exception.
        return 'unknown speaker', 404
    return score
89 | + | ||
@app.route('/identification', methods=['POST', "GET"])
def identification_controller():
    """Identify the speaker of an uploaded recording.

    POST expects a multipart form with a `file` upload; the recording is
    saved, features are extracted and the result of `identification` is
    returned. GET requests return 'get'.
    """
    if request.method != 'POST':
        return 'get'

    upload = request.files['file']
    print(upload.name)
    upload.save('./myrequest.wav')
    extract('./myrequest.wav')
    return identification()
102 | + | ||
@app.route('/debugger', methods=['GET'])
def debugger():
    """Placeholder for the former crash-on-request endpoint.

    The previous body returned an undefined name, so every request raised
    NameError; when the app runs in debug mode that error page exposes the
    interactive Werkzeug console. The route is kept but now answers with a
    plain 404.
    """
    return 'debugger disabled', 404
106 | + | ||
@app.route('/robots.txt',methods=['GET'])
def antirobot():
    """Serve the `robots.txt` file through Flask's `send_file`."""
    robots_file = 'robots.txt'
    return send_file(robots_file)
110 | + | ||
if __name__ == '__main__':
    # The Werkzeug debugger allows arbitrary code execution, and this server
    # listens on every interface, so debug mode is opt-in via FLASK_DEBUG
    # rather than always on.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(host='0.0.0.0', port=7777, debug=debug_enabled)
113 | + |
source/server/enroll4_merge.py
0 → 100755
1 | +import torch | ||
2 | +import torch.nn.functional as F | ||
3 | +from torch.autograd import Variable | ||
4 | + | ||
5 | +import pandas as pd | ||
6 | +import math | ||
7 | +import os | ||
8 | +import sys | ||
9 | +sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) | ||
10 | + | ||
11 | + | ||
12 | +import configure as c | ||
13 | + | ||
14 | +from DB_wav_reader import read_feats_structure | ||
15 | +from SR_Dataset import read_MFB, ToTensorTestInput | ||
16 | +from model.model4 import background_resnet | ||
17 | + | ||
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build the background ResNet and restore it from a saved checkpoint.

    Args:
        use_cuda: move the model to the GPU when true; when false the
            checkpoint tensors are mapped to the CPU while loading.
        log_dir: directory containing `checkpoint_<cp_num>.pth`.
        cp_num: number of the checkpoint to load.
        embedding_size: forwarded to `background_resnet`.
        n_classes: forwarded to `background_resnet` as `num_classes`.

    Returns:
        The model in evaluation mode with the checkpoint's `state_dict` loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
    # Without map_location, tensors saved from a GPU cannot be restored on a
    # CPU-only host.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    # The state dict is loaded as saved; no key renaming is performed.
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
29 | + | ||
def split_enroll_and_test(dataroot_dir):
    """Split the feature files under `dataroot_dir` into enroll and test sets.

    Args:
        dataroot_dir: directory passed to `read_feats_structure`.

    Returns:
        `(enroll_DB, test_DB)`: the rows whose `filename` contains the
        literal text `enroll.p` and `test.p` respectively, each with a fresh
        0-based index.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False: as a regular expression the dot matches any character, so
    # 'test.p' would also match a path segment such as 'test/p...'.
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
42 | + | ||
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for a feature file.

    The features are cut into consecutive segments of at most `test_frames`
    frames; the model's first output for each segment is summed and the sum
    is L2-normalised.

    Args:
        use_cuda: move each segment to the GPU before the forward pass.
        filename: feature file read with `read_MFB`.
        model: network returning a pair whose first item is the activation.
        test_frames: maximum number of frames per segment.

    Returns:
        The normalised, summed activation.

    Raises:
        ValueError: if the feature file contains no frames.
    """
    # Per the original author's note the features are (n_frames, n_dims).
    features, _ = read_MFB(filename)
    n_frames = len(features)
    if n_frames == 0:
        # Otherwise the integer accumulator below would reach l2_norm.
        raise ValueError('no feature frames found in %s' % filename)

    to_tensor = ToTensorTestInput()
    activation = 0
    with torch.no_grad():
        for start in range(0, n_frames, test_frames):
            segment = to_tensor(features[start:start + test_frames])
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    return l2_norm(activation, 1)
63 | + | ||
def l2_norm(input, alpha):
    """Scale every row of a 2-D tensor to unit L2 norm, then multiply by `alpha`.

    A small epsilon (1e-10) is added to the squared norm before the square
    root so an all-zero row does not divide by zero.
    """
    original_shape = input.size()
    squared_norm = input.pow(2).sum(1)
    squared_norm.add_(1e-10)
    row_norm = squared_norm.sqrt().view(-1, 1)
    normalised = input / row_norm.expand_as(input)
    return normalised.view(original_shape) * alpha
74 | + | ||
def enroll_per_spk(use_cuda, test_frames, model, DB, embedding_dir):
    """
    Build one d-vector per enrolled speaker and save each to disk.

    The embeddings of all files belonging to a speaker are summed (they are
    not divided by the file count) and written to
    `<embedding_dir>/<speaker_id>.pth`.

    Args:
        use_cuda, test_frames, model: forwarded to `get_embeddings`.
        DB: table with `filename` and `speaker_id` columns.
        embedding_dir: output directory, created if missing.

    Returns:
        Dictionary mapping speaker id to its accumulated embedding.
    """
    embeddings = {}

    # Aggregates all the activations
    print("Start to aggregate all the d-vectors per enroll speaker")

    # Iterating the columns directly does not depend on DB having a 0..n-1 index.
    for filename, spk in zip(DB['filename'], DB['speaker_id']):
        activation = get_embeddings(use_cuda, filename, model, test_frames)
        if spk in embeddings:
            embeddings[spk] += activation
        else:
            embeddings[spk] = activation

        print("Aggregates the activation (spk : %s)" % (spk))

    os.makedirs(embedding_dir, exist_ok=True)

    # Save the embeddings
    for spk_index in sorted(embeddings):
        embedding_path = os.path.join(embedding_dir, spk_index + '.pth')
        torch.save(embeddings[spk_index], embedding_path)
        print("Save the embeddings for %s" % (spk_index))
    return embeddings
109 | + | ||
def main():
    """Enroll every speaker under `c.TEST_FEAT_DIR` with checkpoint 50."""
    use_cuda = True
    test_frames = 200

    # Load model from checkpoint
    model = load_model(
        use_cuda,
        log_dir='new_model4_merge',
        cp_num=50,
        embedding_size=128,
        n_classes=348,
    )

    # Only the enrollment half of the split is used here.
    enroll_DB, _ = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Perform the enrollment and save the results
    enroll_per_spk(use_cuda, test_frames, model, enroll_DB,
                   'enroll_embeddings4_merge')

    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'


if __name__ == '__main__':
    main()
source/server/extract_feature.py
0 → 100755
1 | +import librosa | ||
2 | +import numpy as np | ||
3 | +from python_speech_features import fbank | ||
4 | +import pickle | ||
5 | +sample_rate=16000 | ||
6 | +#filename='./sunghwan/8sec2.wav' | ||
7 | + | ||
def normalize_frames(m,Scale=True):
    """Subtract the column-wise mean of `m`; when `Scale` is true also divide by the column-wise standard deviation."""
    centered = m - np.mean(m, axis=0)
    if not Scale:
        return centered
    # The small constant keeps constant columns from dividing by zero.
    return centered / (np.std(m, axis=0) + 2e-12)
13 | + | ||
def extract(filename,savename='test.p'):
    """Extract mean-normalised log filter-bank features and pickle them.

    Args:
        filename: audio file loaded with librosa at `sample_rate`, mono.
        savename: output pickle path; the text before the first dot of its
            base name is stored as the label.

    The pickle holds a dict with keys 'feat' and 'label'.
    """
    import os

    audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    filter_banks, _ = fbank(audio, samplerate=sample_rate, nfilt=40, winlen=0.025)
    filter_banks = 20 * np.log10(np.maximum(filter_banks, 1e-5))
    feature = normalize_frames(filter_banks, Scale=False)
    # Take the label from the base name so a path such as './test.p' does not
    # yield an empty label.
    label = os.path.basename(savename).split('.')[0]
    todump = {'feat': feature, 'label': label}
    with open(savename, 'wb') as f:
        pickle.dump(todump, f)
23 | + | ||
24 | + |
source/server/extract_feature4.py
0 → 100644
1 | +import librosa | ||
2 | +import numpy as np | ||
3 | +from python_speech_features import fbank | ||
4 | +import pickle | ||
5 | +sample_rate = 16000 | ||
6 | +#filename='./sunghwan/8sec2.wav' | ||
7 | + | ||
8 | + | ||
def normalize_frames(m, Scale=True):
    """Centre each column of `m` on zero, optionally scaling to unit standard deviation."""
    offset = np.mean(m, axis=0)
    if Scale:
        # The small constant guards against zero standard deviation.
        spread = np.std(m, axis=0) + 2e-12
        return (m - offset) / spread
    return m - offset
14 | + | ||
15 | + | ||
def extract(filename, label='test.p'):
    """Extract mean-normalised log filter-bank features and pickle them.

    With the default label the features are written to 'test.p'; any other
    label writes them to 'enroll.p'. The pickle holds a dict with keys
    'feat' and 'label'.
    """
    audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    filter_banks, _ = fbank(audio, samplerate=sample_rate, nfilt=40, winlen=0.025)
    log_banks = 20 * np.log10(np.maximum(filter_banks, 1e-5))
    feature = normalize_frames(log_banks, Scale=False)

    savename = 'test.p' if label == 'test.p' else 'enroll.p'
    with open(savename, 'wb') as out:
        pickle.dump({'feat': feature, 'label': label}, out)
30 | + |
source/server/identification4.py
0 → 100755
1 | +import torch | ||
2 | +import torch.nn.functional as F | ||
3 | +from torch.autograd import Variable | ||
4 | + | ||
5 | +import pandas as pd | ||
6 | +import math | ||
7 | +import os | ||
8 | +import sys | ||
9 | +sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) | ||
10 | + | ||
11 | +import configure as c | ||
12 | + | ||
13 | +from DB_wav_reader import read_feats_structure | ||
14 | +from SR_Dataset import read_MFB, ToTensorTestInput | ||
15 | +from model.model4 import background_resnet | ||
16 | + | ||
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build the background ResNet and restore it from a saved checkpoint.

    Args:
        use_cuda: move the model to the GPU when true; when false the
            checkpoint tensors are mapped to the CPU while loading.
        log_dir: directory containing `checkpoint_<cp_num>.pth`.
        cp_num: number of the checkpoint to load.
        embedding_size: forwarded to `background_resnet`.
        n_classes: forwarded to `background_resnet` as `num_classes`.

    Returns:
        The model in evaluation mode with the checkpoint's `state_dict` loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
    # Without map_location, tensors saved from a GPU cannot be restored on a
    # CPU-only host.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    # The state dict is loaded as saved; no key renaming is performed.
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
28 | + | ||
def split_enroll_and_test(dataroot_dir):
    """Split the feature files under `dataroot_dir` into enroll and test sets.

    Args:
        dataroot_dir: directory passed to `read_feats_structure`.

    Returns:
        `(enroll_DB, test_DB)`: the rows whose `filename` contains the
        literal text `enroll.p` and `test.p` respectively, each with a fresh
        0-based index.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False: as a regular expression the dot matches any character, so
    # 'test.p' would also match a path segment such as 'test/p...'.
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
41 | + | ||
def load_enroll_embeddings(embedding_dir, map_location=None):
    """Load every `<speaker>.pth` embedding stored in `embedding_dir`.

    Args:
        embedding_dir: directory containing one `.pth` file per speaker.
        map_location: forwarded to `torch.load`; pass 'cpu' to load
            GPU-saved embeddings on a CPU-only host.

    Returns:
        Dictionary mapping speaker id (file name without the `.pth` suffix)
        to the loaded object.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk, ext = os.path.splitext(f)
        if ext != '.pth':
            # Stray files (editor backups, OS metadata) are not embeddings.
            continue
        embedding_path = os.path.join(embedding_dir, f)
        embeddings[spk] = torch.load(embedding_path, map_location=map_location)

    return embeddings
52 | + | ||
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for a feature file.

    The features are cut into consecutive segments of at most `test_frames`
    frames; the model's first output for each segment is summed and the sum
    is L2-normalised.

    Args:
        use_cuda: move each segment to the GPU before the forward pass.
        filename: feature file read with `read_MFB`.
        model: network returning a pair whose first item is the activation.
        test_frames: maximum number of frames per segment.

    Returns:
        The normalised, summed activation.

    Raises:
        ValueError: if the feature file contains no frames.
    """
    # Per the original author's note the features are (n_frames, n_dims).
    features, _ = read_MFB(filename)
    n_frames = len(features)
    if n_frames == 0:
        # Otherwise the integer accumulator below would reach l2_norm.
        raise ValueError('no feature frames found in %s' % filename)

    to_tensor = ToTensorTestInput()
    activation = 0
    with torch.no_grad():
        for start in range(0, n_frames, test_frames):
            segment = to_tensor(features[start:start + test_frames])
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    return l2_norm(activation, 1)
73 | + | ||
def l2_norm(input, alpha):
    """Normalise each row of a 2-D tensor to unit L2 length and scale by `alpha`.

    1e-10 is added to the squared row norm before the square root to avoid
    division by zero.
    """
    shape = input.size()
    sum_of_squares = torch.sum(input * input, 1)
    sum_of_squares.add_(1e-10)
    norms = torch.sqrt(sum_of_squares).view(-1, 1).expand_as(input)
    unit_rows = torch.div(input, norms).view(shape)
    return unit_rows * alpha
84 | + | ||
def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    """Pick the enrolled speaker whose embedding is closest to the test utterance.

    Args:
        use_cuda, model, test_frames: forwarded to `get_embeddings`.
        embeddings: dictionary mapping speaker id to enrolled embedding.
        test_filename: feature file of the utterance to identify.
        spk_list: candidate speaker ids; ids without an entry in
            `embeddings` are reported and skipped.

    Returns:
        The candidate with the highest cosine similarity, or None when no
        candidate has an embedding.
    """
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
    max_score = float('-inf')
    best_spk = None
    for spk in spk_list:
        if spk not in embeddings:
            print("No enrolled embedding for %s; skipping" % spk)
            continue
        # .item() gives a plain float instead of a one-element array.
        score = F.cosine_similarity(test_embedding, embeddings[spk]).item()
        if score > max_score:
            max_score = score
            best_spk = spk

    # Name of the directory holding the file, up to the first underscore;
    # this also works when the path has no directory component.
    true_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk
100 | + | ||
def main():
    """Identify the speaker of one hard-coded test utterance and print the result."""
    use_cuda = True  # Use cuda or not
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(
        use_cuda,
        log_dir='new_model4',  # Where the checkpoints are saved
        cp_num=25,  # Which checkpoint to use?
        embedding_size=128,  # Dimension of speaker embeddings
        n_classes=241,  # How many speakers in training data?
    )

    # Get the dataframe for test DB (the result is not used afterwards).
    split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings('enroll_embeddings4')

    spk_list = [
        '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
        '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
        '777M7777', '778M8777', 'sunghwan1',
    ]

    # Set the test speaker
    test_path = os.path.join('feat_logfbank_nfilt40/test/', '207F2088', 'test.p')

    # Perform the test
    perform_identification(use_cuda, model, embeddings, test_path,
                           test_frames, spk_list)


if __name__ == '__main__':
    main()
source/server/myrequest.wav
0 → 100644
No preview for this file type
source/server/myrequest_enroll.wav
0 → 100644
No preview for this file type
source/server/robots.txt
0 → 100644
source/server/test.p
0 → 100644
No preview for this file type
source/server/verification4_merge.py
0 → 100755
1 | +import torch | ||
2 | +import torch.nn.functional as F | ||
3 | +from torch.autograd import Variable | ||
4 | + | ||
5 | +import pandas as pd | ||
6 | +import math | ||
7 | +import os | ||
8 | +import sys | ||
9 | +sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) | ||
10 | +import configure as c | ||
11 | + | ||
12 | +from DB_wav_reader import read_feats_structure | ||
13 | +from SR_Dataset import read_MFB, ToTensorTestInput | ||
14 | +from model.model4 import background_resnet | ||
15 | + | ||
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build the background ResNet and restore it from a saved checkpoint.

    Args:
        use_cuda: move the model to the GPU when true; when false the
            checkpoint tensors are mapped to the CPU while loading.
        log_dir: directory containing `checkpoint_<cp_num>.pth`.
        cp_num: number of the checkpoint to load.
        embedding_size: forwarded to `background_resnet`.
        n_classes: forwarded to `background_resnet` as `num_classes`.

    Returns:
        The model in evaluation mode with the checkpoint's `state_dict` loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()
    print('=> loading checkpoint')
    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
    # Without map_location, tensors saved from a GPU cannot be restored on a
    # CPU-only host.
    checkpoint = torch.load(checkpoint_path,
                            map_location=None if use_cuda else 'cpu')
    # The state dict is loaded as saved; no key renaming is performed.
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
27 | + | ||
def split_enroll_and_test(dataroot_dir):
    """Split the feature files under `dataroot_dir` into enroll and test sets.

    Args:
        dataroot_dir: directory passed to `read_feats_structure`.

    Returns:
        `(enroll_DB, test_DB)`: the rows whose `filename` contains the
        literal text `enroll.p` and `test.p` respectively, each with a fresh
        0-based index.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']

    # regex=False: as a regular expression the dot matches any character, so
    # 'test.p' would also match a path segment such as 'test/p...'.
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]

    # Reset the index
    enroll_DB = enroll_DB.reset_index(drop=True)
    test_DB = test_DB.reset_index(drop=True)
    return enroll_DB, test_DB
40 | + | ||
def load_enroll_embeddings(embedding_dir, map_location=None):
    """Load every `<speaker>.pth` embedding stored in `embedding_dir`.

    Args:
        embedding_dir: directory containing one `.pth` file per speaker.
        map_location: forwarded to `torch.load`; pass 'cpu' to load
            GPU-saved embeddings on a CPU-only host.

    Returns:
        Dictionary mapping speaker id (file name without the `.pth` suffix)
        to the loaded object.
    """
    embeddings = {}
    for f in os.listdir(embedding_dir):
        spk, ext = os.path.splitext(f)
        if ext != '.pth':
            # Stray files (editor backups, OS metadata) are not embeddings.
            continue
        embedding_path = os.path.join(embedding_dir, f)
        embeddings[spk] = torch.load(embedding_path, map_location=map_location)

    return embeddings
51 | + | ||
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for a feature file.

    The features are cut into consecutive segments of at most `test_frames`
    frames; the model's first output for each segment is summed and the sum
    is L2-normalised.

    Args:
        use_cuda: move each segment to the GPU before the forward pass.
        filename: feature file read with `read_MFB`.
        model: network returning a pair whose first item is the activation.
        test_frames: maximum number of frames per segment.

    Returns:
        The normalised, summed activation.

    Raises:
        ValueError: if the feature file contains no frames.
    """
    # Per the original author's note the features are (n_frames, n_dims).
    features, _ = read_MFB(filename)
    n_frames = len(features)
    if n_frames == 0:
        # Otherwise the integer accumulator below would reach l2_norm.
        raise ValueError('no feature frames found in %s' % filename)

    to_tensor = ToTensorTestInput()
    activation = 0
    with torch.no_grad():
        for start in range(0, n_frames, test_frames):
            segment = to_tensor(features[start:start + test_frames])
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)

    return l2_norm(activation, 1)
72 | + | ||
def l2_norm(input, alpha):
    """Return `input` with each row divided by its L2 norm, multiplied by `alpha`.

    Expects a 2-D tensor; 1e-10 is added under the square root so zero rows
    stay finite.
    """
    size = input.size()
    norm_sq = torch.pow(input, 2).sum(dim=1)
    norm_sq.add_(1e-10)
    divisor = norm_sq.sqrt().view(-1, 1).expand_as(input)
    scaled = (input / divisor).view(size)
    return scaled * alpha
83 | + | ||
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Compare a test utterance with one enrolled speaker and print the verdict.

    Args:
        use_cuda, model, test_frames: forwarded to `get_embeddings`.
        embeddings: dictionary mapping speaker id to enrolled embedding.
        enroll_speaker: id of the enrolled speaker to compare against.
        test_filename: feature file of the test utterance.
        thres: the utterance is accepted when the cosine similarity is
            strictly greater than this value.

    Returns:
        `(enroll_speaker, score)` with the score formatted to four decimals.

    Raises:
        KeyError: if `enroll_speaker` has no entry in `embeddings`.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    # .item() yields a plain float, so the comparison and the %-formatting
    # below do not rely on implicit array-to-scalar conversion.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()

    result = 'Accept' if score > thres else 'Reject'

    # Name of the directory holding the file, up to the first underscore;
    # this also works when the path has no directory component.
    test_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
    return (enroll_speaker,'%0.4f'%score)
101 | + | ||
def main():
    """Verify one hard-coded test utterance against one hard-coded enrolled speaker."""
    use_cuda = True  # Use cuda or not
    test_frames = 100  # Split the test utterance

    # Load model from checkpoint
    model = load_model(
        use_cuda,
        log_dir='new_model4_merge',  # Where the checkpoints are saved
        cp_num=50,  # Which checkpoint to use?
        embedding_size=128,  # Dimension of speaker embeddings
        n_classes=348,  # How many speakers in training data?
    )

    # Get the dataframe for test DB (the result is not used afterwards).
    split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings('enroll_embeddings4_merge')

    # Test speaker list:
    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'

    enroll_speaker = '213F5100'  # the true speaker
    test_speaker = '207F2088'  # the claimed speaker
    thres = 0.95  # Threshold

    test_path = os.path.join('feat_logfbank_nfilt40/test/', test_speaker, 'test.p')

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker,
                         test_path, test_frames, thres)


if __name__ == '__main__':
    main()
-
Please register or login to post a comment