김건

Speaker Recognition V1 (ResNet-18)

# ===== DB_wav_reader.py : build the DB DataFrame and read wav files =====

"""
Modified from 'DBspeech_wav_reader.py' in philipperemy's deep-speaker.
Runs on Python 3.
Input : DB path
Output: 1) a DB structure as a pd.DataFrame with three columns
           (file path, speaker id, dataset id) => 'read_DB_structure'
        2) a wav file read from that DB structure => 'read_audio'
"""
import logging
import os
import sys
from glob import glob
import librosa
import numpy as np
import pandas as pd
from configure import SAMPLE_RATE
np.set_printoptions(threshold=sys.maxsize)  # np.nan is no longer accepted as a threshold
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)
def find_wavs(directory, pattern='**/*.wav'):
"""Recursively finds all files matching the pattern."""
return glob(os.path.join(directory, pattern), recursive=True)
def find_feats(directory, pattern='**/*.p'):
"""Recursively finds all files matching the pattern."""
return glob(os.path.join(directory, pattern), recursive=True)
def read_audio(filename, sample_rate=SAMPLE_RATE):
audio, sr = librosa.load(filename, sr=sample_rate, mono=True)
audio = audio.flatten()
return audio
def read_DB_structure(directory):
DB = pd.DataFrame()
DB['filename'] = find_wavs(directory) # filename
DB['filename'] = DB['filename'].apply(lambda x: x.replace('\\', '/')) # normalize windows paths
DB['speaker_id'] = DB['filename'].apply(lambda x: x.split('/')[-2]) # speaker folder name
DB['dataset_id'] = DB['filename'].apply(lambda x: x.split('/')[-3]) # dataset folder name
num_speakers = len(DB['speaker_id'].unique())
logging.info('Found {} files with {} different speakers.'.format(str(len(DB)).zfill(7), str(num_speakers).zfill(5)))
logging.info(DB.head(10))
return DB
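# Usage sketch (hypothetical paths), assuming wavs are laid out as
# <root>/<dataset_id>/<speaker_id>/<utterance>.wav:
#   DB = read_DB_structure('/data/speech')
#   # filename                             speaker_id  dataset_id
#   # /data/speech/train/103F3021/a01.wav  103F3021    train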
def read_feats_structure(directory):
DB = pd.DataFrame()
DB['filename'] = find_feats(directory) # filename
DB['filename'] = DB['filename'].apply(lambda x: x.replace('\\', '/')) # normalize windows paths
DB['speaker_id'] = DB['filename'].apply(lambda x: x.split('/')[-2]) # speaker folder name
DB['dataset_id'] = DB['filename'].apply(lambda x: x.split('/')[-3]) # dataset folder name
num_speakers = len(DB['speaker_id'].unique())
logging.info('Found {} files with {} different speakers.'.format(str(len(DB)).zfill(7), str(num_speakers).zfill(5)))
logging.info(DB.head(10))
return DB
def test():
DB_dir = '/home/administrator/Desktop/DB/Speaker_robot_train_DB'
DB = read_DB_structure(DB_dir)
test_wav = read_audio(DB[0:1]['filename'].values[0])
return DB, test_wav
if __name__ == '__main__':
DB, test_wav = test()

# ===== SR_Dataset.py : dataset, input transforms, and the padding collate_fn =====
import torch
import torch.utils.data as data
import torchvision.transforms as transforms
import random
import os
import pickle # For python3
import numpy as np
import configure as c
from DB_wav_reader import read_DB_structure, read_feats_structure
def read_MFB(filename):
with open(filename, 'rb') as f:
feat_and_label = pickle.load(f)
feature = feat_and_label['feat'] # size : (n_frames, dim=40)
label = feat_and_label['label']
"""
VAD
"""
start_sec, end_sec = 0.5, 0.5
start_frame = int(start_sec / 0.01)
end_frame = len(feature) - int(end_sec / 0.01)
ori_feat = feature
feature = feature[start_frame:end_frame,:]
assert len(feature) > 40, (
'length is too short. len:%s, ori_len:%s, file:%s' % (len(feature), len(ori_feat), filename))
return feature, label
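def _write_MFB_example(wav_path, out_path):
    """Illustrative sketch only, not part of the original pipeline: one way a
    pickle that read_MFB can load could be produced. Assumes the
    python_speech_features package; the actual extraction script is not shown."""
    import librosa
    from python_speech_features import logfbank
    audio, _sr = librosa.load(wav_path, sr=c.SAMPLE_RATE, mono=True)
    # (n_frames, nfilt) log filterbank energies, 25 ms window / 10 ms shift
    feat = logfbank(audio, samplerate=c.SAMPLE_RATE, nfilt=c.FILTER_BANK)
    label = wav_path.replace('\\', '/').split('/')[-2]  # speaker folder name
    with open(out_path, 'wb') as f:
        pickle.dump({'feat': feat, 'label': label}, f)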
class TruncatedInputfromMFB(object):
"""
input size : (n_frames, dim=40)
output size : (1, n_win=40, dim=40) => one context window is chosen randomly
"""
def __init__(self, input_per_file=1):
super(TruncatedInputfromMFB, self).__init__()
self.input_per_file = input_per_file
def __call__(self, frames_features):
network_inputs = []
num_frames = len(frames_features)
win_size = c.NUM_WIN_SIZE
half_win_size = int(win_size/2)
        # Repeat the utterance until at least one full window fits
        while num_frames - half_win_size <= half_win_size:
            frames_features = np.append(frames_features, frames_features[:num_frames, :], axis=0)
            num_frames = len(frames_features)
        for i in range(self.input_per_file):
            # j is drawn from [half_win_size, num_frames - half_win_size), so a
            # full window around j always fits and j is never zero
            j = random.randrange(half_win_size, num_frames - half_win_size)
            frames_slice = frames_features[j - half_win_size:j + half_win_size]
network_inputs.append(frames_slice)
return np.array(network_inputs)
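# Usage sketch: one random NUM_WIN_SIZE-frame window from a whole utterance.
#   trunc = TruncatedInputfromMFB()
#   window = trunc(np.random.randn(500, c.FILTER_BANK))
#   window.shape == (1, c.NUM_WIN_SIZE, c.FILTER_BANK)  # i.e. (1, 100, 40)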
class TruncatedInputfromMFB_test(object):
def __init__(self, input_per_file=1):
super(TruncatedInputfromMFB_test, self).__init__()
self.input_per_file = input_per_file
def __call__(self, frames_features):
network_inputs = []
num_frames = len(frames_features)
for i in range(self.input_per_file):
for j in range(c.NUM_PREVIOUS_FRAME, num_frames - c.NUM_NEXT_FRAME):
frames_slice = frames_features[j - c.NUM_PREVIOUS_FRAME:j + c.NUM_NEXT_FRAME]
# network_inputs.append(np.reshape(frames_slice, (32, 20, 3)))
network_inputs.append(frames_slice)
return np.array(network_inputs)
class TruncatedInputfromMFB_CNN_test(object):
def __init__(self, input_per_file=1):
super(TruncatedInputfromMFB_CNN_test, self).__init__()
self.input_per_file = input_per_file
def __call__(self, frames_features):
network_inputs = []
num_frames = len(frames_features)
for i in range(self.input_per_file):
for j in range(c.NUM_PREVIOUS_FRAME, num_frames - c.NUM_NEXT_FRAME):
frames_slice = frames_features[j - c.NUM_PREVIOUS_FRAME:j + c.NUM_NEXT_FRAME]
#network_inputs.append(np.reshape(frames_slice, (-1, c.NUM_PREVIOUS_FRAME+c.NUM_NEXT_FRAME, c.FILTER_BANK)))
network_inputs.append(frames_slice)
        network_inputs = np.expand_dims(np.array(network_inputs), axis=1)  # (n_slices, 1, n_win, n_dims)
        assert network_inputs.ndim == 4, 'Data is not a 4D tensor. size:%s' % (np.shape(network_inputs),)
        return network_inputs
class ToTensorInput(object):
"""Convert ndarrays in sample to Tensors."""
def __call__(self, np_feature):
"""
Args:
feature (numpy.ndarray): feature to be converted to tensor.
Returns:
Tensor: Converted feature.
"""
if isinstance(np_feature, np.ndarray):
# handle numpy array
ten_feature = torch.from_numpy(np_feature.transpose((0,2,1))).float() # output type => torch.FloatTensor, fast
# input size : (1, n_win=200, dim=40)
# output size : (1, dim=40, n_win=200)
return ten_feature
class ToTensorDevInput(object):
"""Convert ndarrays in sample to Tensors."""
def __call__(self, np_feature):
"""
Args:
feature (numpy.ndarray): feature to be converted to tensor.
Returns:
Tensor: Converted feature.
"""
if isinstance(np_feature, np.ndarray):
# handle numpy array
np_feature = np.expand_dims(np_feature, axis=0)
assert np_feature.ndim == 3, 'Data is not a 3D tensor. size:%s' %(np.shape(np_feature),)
ten_feature = torch.from_numpy(np_feature.transpose((0,2,1))).float() # output type => torch.FloatTensor, fast
# input size : (1, n_win=40, dim=40)
# output size : (1, dim=40, n_win=40)
return ten_feature
class ToTensorTestInput(object):
"""Convert ndarrays in sample to Tensors."""
def __call__(self, np_feature):
"""
Args:
feature (numpy.ndarray): feature to be converted to tensor.
Returns:
Tensor: Converted feature.
"""
if isinstance(np_feature, np.ndarray):
# handle numpy array
np_feature = np.expand_dims(np_feature, axis=0)
np_feature = np.expand_dims(np_feature, axis=1)
assert np_feature.ndim == 4, 'Data is not a 4D tensor. size:%s' %(np.shape(np_feature),)
ten_feature = torch.from_numpy(np_feature.transpose((0,1,3,2))).float() # output type => torch.FloatTensor, fast
# input size : (1, 1, n_win=200, dim=40)
# output size : (1, 1, dim=40, n_win=200)
return ten_feature
def collate_fn_feat_padded(batch):
"""
Sort a data list by frame length (descending order)
batch : list of tuple (feature, label). len(batch) = batch_size
- feature : torch tensor of shape [1, 40, 80] ; variable size of frames
- labels : torch tensor of shape (1)
ex) samples = collate_fn([batch])
batch = [dataset[i] for i in batch_indices]. ex) [Dvector_train_dataset[i] for i in [0,1,2,3,4]]
batch[0][0].shape = torch.Size([1,64,774]). "774" is the number of frames per utterance.
"""
batch.sort(key=lambda x: x[0].shape[2], reverse=True)
feats, labels = zip(*batch)
# Merge labels => torch.Size([batch_size,1])
labels = torch.stack(labels, 0)
labels = labels.view(-1)
# Merge frames
lengths = [feat.shape[2] for feat in feats] # in decreasing order
max_length = lengths[0]
    # padded_features.shape => torch.Size([batch_size, n_channel, dim, max(n_win)])
    padded_features = torch.zeros(len(feats), feats[0].shape[0], feats[0].shape[1], max_length).float()
for i, feat in enumerate(feats):
end = lengths[i]
num_frames = feat.shape[2]
while max_length > num_frames:
feat = torch.cat((feat, feat[:,:,:end]), 2)
num_frames = feat.shape[2]
padded_features[i, :, :, :] = feat[:,:,:max_length]
return padded_features, labels
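# Usage sketch (hypothetical shapes): shorter features in a batch are
# wrap-padded (repeated) up to the longest one.
#   batch = [(torch.randn(1, 40, 300), torch.tensor([3])),
#            (torch.randn(1, 40, 120), torch.tensor([7]))]
#   feats, labels = collate_fn_feat_padded(batch)
#   feats.shape == torch.Size([2, 1, 40, 300]); labels -> tensor([3, 7])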
class DvectorDataset(data.Dataset):
def __init__(self, DB, loader, spk_to_idx, transform=None, *arg, **kw):
self.DB = DB
self.len = len(DB)
self.transform = transform
self.loader = loader
self.spk_to_idx = spk_to_idx
def __getitem__(self, index):
feat_path = self.DB['filename'][index]
feature, label = self.loader(feat_path)
label = self.spk_to_idx[label]
label = torch.Tensor([label]).long()
if self.transform:
feature = self.transform(feature)
return feature, label
def __len__(self):
return self.len
def main():
    train_DB = read_feats_structure(c.TRAIN_FEAT_DIR)  # read_MFB loads .p feature files
    transform = transforms.Compose([
        TruncatedInputfromMFB(),
        ToTensorInput()
    ])
    file_loader = read_MFB
    speaker_list = sorted(set(train_DB['speaker_id']))
    spk_to_idx = {spk: i for i, spk in enumerate(speaker_list)}
    batch_size = 128
    Dvector_train_dataset = DvectorDataset(DB=train_DB, loader=file_loader, transform=transform, spk_to_idx=spk_to_idx)
Dvector_train_loader = torch.utils.data.DataLoader(dataset=Dvector_train_dataset,
batch_size=batch_size,
shuffle=False)
if __name__ == '__main__':
main()

# ===== configure.py : paths and feature settings =====
# Wave path
TRAIN_WAV_DIR = '/home/admin/Desktop/read_25h_2/train'
DEV_WAV_DIR = '/home/admin/Desktop/read_25h_2/dev'
TEST_WAV_DIR = 'test_wavs'
# Feature path
TRAIN_FEAT_DIR = 'feat_logfbank_nfilt40/train'
TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
# Context window size
NUM_WIN_SIZE = 100  # frames (10 ms shift => 1 s)
# Frames before/after the window center, required by the *_test transforms in
# SR_Dataset. The values below are an assumed even split of NUM_WIN_SIZE.
NUM_PREVIOUS_FRAME = 50
NUM_NEXT_FRAME = 50
# Settings for feature extraction
USE_LOGSCALE = True
USE_DELTA = False
USE_SCALE = False
SAMPLE_RATE = 16000
FILTER_BANK = 40
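# Expected on-disk layout (names illustrative), matching the path parsing in
# DB_wav_reader (speaker id = parent folder, dataset id = grandparent folder):
#   feat_logfbank_nfilt40/train/<speaker_id>/<utt>.p
#   feat_logfbank_nfilt40/test/<speaker_id>/enroll.p and test.p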

# ===== enrollment script : save an averaged d-vector per enrolled speaker =====
import torch
import torch.nn.functional as F
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
if use_cuda:
model.cuda()
    print('=> loading checkpoint')
    checkpoint = torch.load(os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth'))
    model.load_state_dict(checkpoint['state_dict'])
model.eval()
return model
def split_enroll_and_test(dataroot_dir):
DB_all = read_feats_structure(dataroot_dir)
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p', regex=False)]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p', regex=False)]
# Reset the index
enroll_DB = enroll_DB.reset_index(drop=True)
test_DB = test_DB.reset_index(drop=True)
return enroll_DB, test_DB
def get_embeddings(use_cuda, filename, model, test_frames):
input, label = read_MFB(filename) # input size:(n_frames, n_dims)
tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames'
activation = 0
with torch.no_grad():
for i in range(tot_segments):
temp_input = input[i*test_frames:i*test_frames+test_frames]
TT = ToTensorTestInput()
temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames)
if use_cuda:
temp_input = temp_input.cuda()
temp_activation,_ = model(temp_input)
activation += torch.sum(temp_activation, dim=0, keepdim=True)
activation = l2_norm(activation, 1)
return activation
def l2_norm(input, alpha):
input_size = input.size() # size:(n_frames, dim)
buffer = torch.pow(input, 2) # 2 denotes a squared operation. size:(n_frames, dim)
normp = torch.sum(buffer, 1).add_(1e-10) # size:(n_frames)
norm = torch.sqrt(normp) # size:(n_frames)
_output = torch.div(input, norm.view(-1, 1).expand_as(input))
output = _output.view(input_size)
    # Scale by alpha; https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10,
    # though callers in this repo pass alpha = 1
    output = output * alpha
return output
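# Note: up to the 1e-10 stabilizer, l2_norm(x, alpha) equals
# alpha * torch.nn.functional.normalize(x, p=2, dim=1), i.e. each row of x is
# scaled to unit L2 norm and then multiplied by alpha.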
def enroll_per_spk(use_cuda, test_frames, model, DB, embedding_dir):
"""
Output the averaged d-vector for each speaker (enrollment)
Return the dictionary (length of n_spk)
"""
    n_files = len(DB)  # number of enrollment files (10 here)
enroll_speaker_list = sorted(set(DB['speaker_id']))
embeddings = {}
# Aggregates all the activations
print("Start to aggregate all the d-vectors per enroll speaker")
for i in range(n_files):
filename = DB['filename'][i]
spk = DB['speaker_id'][i]
activation = get_embeddings(use_cuda, filename, model, test_frames)
if spk in embeddings:
embeddings[spk] += activation
else:
embeddings[spk] = activation
print("Aggregates the activation (spk : %s)" % (spk))
if not os.path.exists(embedding_dir):
os.makedirs(embedding_dir)
# Save the embeddings
for spk_index in enroll_speaker_list:
embedding_path = os.path.join(embedding_dir, spk_index+'.pth')
torch.save(embeddings[spk_index], embedding_path)
print("Save the embeddings for %s" % (spk_index))
return embeddings
def main():
# Settings
use_cuda = True
log_dir = 'model_saved'
embedding_size = 128
cp_num = 24 # Which checkpoint to use?
n_classes = 240
test_frames = 200
# Load model from checkpoint
model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
# Get the dataframe for enroll DB
enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
# Where to save embeddings
embedding_dir = 'enroll_embeddings'
# Perform the enrollment and save the results
enroll_per_spk(use_cuda, test_frames, model, enroll_DB, embedding_dir)
""" Test speaker list
'103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
'229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
"""
if __name__ == '__main__':
main()

# ===== identification script : closed-set speaker identification =====
import torch
import torch.nn.functional as F
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
if use_cuda:
model.cuda()
    print('=> loading checkpoint')
    checkpoint = torch.load(os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth'))
    model.load_state_dict(checkpoint['state_dict'])
model.eval()
return model
def split_enroll_and_test(dataroot_dir):
DB_all = read_feats_structure(dataroot_dir)
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p', regex=False)]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p', regex=False)]
# Reset the index
enroll_DB = enroll_DB.reset_index(drop=True)
test_DB = test_DB.reset_index(drop=True)
return enroll_DB, test_DB
def load_enroll_embeddings(embedding_dir):
embeddings = {}
for f in os.listdir(embedding_dir):
spk = f.replace('.pth','')
# Select the speakers who are in the 'enroll_spk_list'
embedding_path = os.path.join(embedding_dir, f)
tmp_embeddings = torch.load(embedding_path)
embeddings[spk] = tmp_embeddings
return embeddings
def get_embeddings(use_cuda, filename, model, test_frames):
input, label = read_MFB(filename) # input size:(n_frames, n_dims)
tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames'
activation = 0
with torch.no_grad():
for i in range(tot_segments):
temp_input = input[i*test_frames:i*test_frames+test_frames]
TT = ToTensorTestInput()
temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames)
if use_cuda:
temp_input = temp_input.cuda()
temp_activation,_ = model(temp_input)
activation += torch.sum(temp_activation, dim=0, keepdim=True)
activation = l2_norm(activation, 1)
return activation
def l2_norm(input, alpha):
input_size = input.size() # size:(n_frames, dim)
buffer = torch.pow(input, 2) # 2 denotes a squared operation. size:(n_frames, dim)
normp = torch.sum(buffer, 1).add_(1e-10) # size:(n_frames)
norm = torch.sqrt(normp) # size:(n_frames)
_output = torch.div(input, norm.view(-1, 1).expand_as(input))
output = _output.view(input_size)
    # Scale by alpha; https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10,
    # though callers in this repo pass alpha = 1
    output = output * alpha
return output
def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
max_score = -10**8
best_spk = None
for spk in spk_list:
score = F.cosine_similarity(test_embedding, embeddings[spk])
score = score.data.cpu().numpy()
if score > max_score:
max_score = score
best_spk = spk
#print("Speaker identification result : %s" %best_spk)
true_spk = test_filename.split('/')[-2].split('_')[0]
print("\n=== Speaker identification ===")
print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
return best_spk
def main():
log_dir = 'model_saved' # Where the checkpoints are saved
embedding_dir = 'enroll_embeddings' # Where embeddings are saved
test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
# Settings
use_cuda = True # Use cuda or not
embedding_size = 128 # Dimension of speaker embeddings
cp_num = 24 # Which checkpoint to use?
n_classes = 240 # How many speakers in training data?
test_frames = 100 # Split the test utterance
# Load model from checkpoint
model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
# Get the dataframe for test DB
enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
# Load enroll embeddings
embeddings = load_enroll_embeddings(embedding_dir)
""" Test speaker list
'103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
'229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
"""
spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',\
'229M2031', '230M4087', '233F4013', '236M3043', '240M3063']
# Set the test speaker
test_speaker = '230M4087'
test_path = os.path.join(test_dir, test_speaker, 'test.p')
# Perform the test
best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
if __name__ == '__main__':
main()

# ===== model/model.py : ResNet-based background model =====
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
import model.resnet as resnet
class background_resnet(nn.Module):
def __init__(self, embedding_size, num_classes, backbone='resnet18'):
super(background_resnet, self).__init__()
self.backbone = backbone
# copying modules from pretrained models
if backbone == 'resnet50':
self.pretrained = resnet.resnet50(pretrained=False)
elif backbone == 'resnet101':
self.pretrained = resnet.resnet101(pretrained=False)
elif backbone == 'resnet152':
self.pretrained = resnet.resnet152(pretrained=False)
elif backbone == 'resnet18':
self.pretrained = resnet.resnet18(pretrained=False)
elif backbone == 'resnet34':
self.pretrained = resnet.resnet34(pretrained=False)
else:
raise RuntimeError('unknown backbone: {}'.format(backbone))
self.fc0 = nn.Linear(128, embedding_size)
self.bn0 = nn.BatchNorm1d(embedding_size)
self.relu = nn.ReLU()
self.last = nn.Linear(embedding_size, num_classes)
def forward(self, x):
        # input x: (minibatch, 1, dim=40, n_win); note that the backbone's
        # maxpool layer is not applied here
x = self.pretrained.conv1(x)
x = self.pretrained.bn1(x)
x = self.pretrained.relu(x)
x = self.pretrained.layer1(x)
x = self.pretrained.layer2(x)
x = self.pretrained.layer3(x)
x = self.pretrained.layer4(x)
out = F.adaptive_avg_pool2d(x,1) # [batch, 128, 1, 1]
out = torch.squeeze(out) # [batch, n_embed]
# flatten the out so that the fully connected layer can be connected from here
out = out.view(x.size(0), -1) # (n_batch, n_embed)
spk_embedding = self.fc0(out)
out = F.relu(self.bn0(spk_embedding)) # [batch, n_embed]
out = self.last(out)
return spk_embedding, out
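if __name__ == '__main__':
    # Sanity-check sketch (sizes are illustrative, not from the original file):
    # push a dummy log-fbank window through the network and print the shapes of
    # the speaker embedding and the classification logits.
    net = background_resnet(embedding_size=128, num_classes=240)
    net.eval()
    x = torch.randn(2, 1, 40, 100)  # (batch, channel, n_filter, n_frames)
    emb, logits = net(x)
    print(emb.shape, logits.shape)  # torch.Size([2, 128]) torch.Size([2, 240])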

# ===== model/resnet.py : ResNet backbones =====
"""Imported from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
and added support for the 1x32x32 mel spectrogram for the speech recognition.
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun: Deep Residual Learning for Image Recognition
https://arxiv.org/abs/1512.03385
"""
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152']
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000, in_channels=1):
self.inplanes = 16
super(ResNet, self).__init__()
self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=7, stride=1, padding=3,
bias=False) # ori : stride = 2
self.bn1 = nn.BatchNorm2d(16)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 16, layers[0])
self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
self.layer4 = self._make_layer(block, 128, layers[3], stride=2)
self.avgpool = nn.AvgPool2d(1, stride=1)
self.fc = nn.Linear(128 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def resnet18(pretrained=False, **kwargs):
"""Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
return model
def resnet34(pretrained=False, **kwargs):
"""Constructs a ResNet-34 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
return model
def resnet50(pretrained=False, **kwargs):
"""Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
return model
def resnet101(pretrained=False, **kwargs):
"""Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
return model
def resnet152(pretrained=False, **kwargs):
"""Constructs a ResNet-152 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
return model
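# Usage note: model.py builds these backbones with pretrained=False. The channel
# widths here (16/32/64/128) and in_channels=1 differ from torchvision's
# ImageNet models, so pretrained=True would fail to load the downloaded weights.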

# ===== training script =====
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import time
import os
import numpy as np
import configure as c
import pandas as pd
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
from model.model import background_resnet
import matplotlib
matplotlib.use('Agg')  # render to file only; plt.show() is commented out below
import matplotlib.pyplot as plt
def load_dataset(val_ratio):
# Load training set and validation set
# Split training set into training set and validation set according to "val_ratio"
train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)
file_loader = read_MFB # numpy array:(n_frames, n_dims)
transform = transforms.Compose([
TruncatedInputfromMFB(), # numpy array:(1, n_frames, n_dims)
ToTensorInput() # torch tensor:(1, n_dims, n_frames)
])
transform_T = ToTensorDevInput()
speaker_list = sorted(set(train_DB['speaker_id'])) # len(speaker_list) == n_speakers
spk_to_idx = {spk: i for i, spk in enumerate(speaker_list)}
train_dataset = DvectorDataset(DB=train_DB, loader=file_loader, transform=transform, spk_to_idx=spk_to_idx)
valid_dataset = DvectorDataset(DB=valid_DB, loader=file_loader, transform=transform_T, spk_to_idx=spk_to_idx)
n_classes = len(speaker_list) # How many speakers? 240
return train_dataset, valid_dataset, n_classes
def split_train_dev(train_feat_dir, valid_ratio):
train_valid_DB = read_feats_structure(train_feat_dir)
total_len = len(train_valid_DB) # 148642
valid_len = int(total_len * valid_ratio/100.)
train_len = total_len - valid_len
shuffled_train_valid_DB = train_valid_DB.sample(frac=1).reset_index(drop=True)
# Split the DB into train and valid set
train_DB = shuffled_train_valid_DB.iloc[:train_len]
valid_DB = shuffled_train_valid_DB.iloc[train_len:]
# Reset the index
train_DB = train_DB.reset_index(drop=True)
valid_DB = valid_DB.reset_index(drop=True)
print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
print('Total %d utts' %(total_len))
return train_DB, valid_DB
def main():
# Set hyperparameters
use_cuda = True # use gpu or cpu
val_ratio = 10 # Percentage of validation set
embedding_size = 128
start = 1 # Start epoch
n_epochs = 30 # How many epochs?
end = start + n_epochs # Last epoch
lr = 1e-1 # Initial learning rate
wd = 1e-4 # Weight decay (L2 penalty)
optimizer_type = 'sgd' # ex) sgd, adam, adagrad
batch_size = 64 # Batch size for training
valid_batch_size = 16 # Batch size for validation
use_shuffle = True # Shuffle for training or not
# Load dataset
train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)
# print the experiment configuration
print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))
log_dir = 'model_saved' # where to save checkpoints
if not os.path.exists(log_dir):
os.makedirs(log_dir)
# instantiate model and initialize weights
model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
if use_cuda:
model.cuda()
# define loss function (criterion), optimizer and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = create_optimizer(optimizer_type, model, lr, wd)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=True)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=batch_size,
shuffle=use_shuffle)
valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
batch_size=valid_batch_size,
shuffle=False,
collate_fn = collate_fn_feat_padded)
# to track the average training loss per epoch as the model trains
avg_train_losses = []
# to track the average validation loss per epoch as the model trains
avg_valid_losses = []
for epoch in range(start, end):
# train for one epoch
train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)
# evaluate on validation set
valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)
        scheduler.step(valid_loss)
# calculate average loss over an epoch
avg_train_losses.append(train_loss)
avg_valid_losses.append(valid_loss)
# do checkpointing
torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
'optimizer': optimizer.state_dict()},
'{}/checkpoint_{}.pth'.format(log_dir, epoch))
# find position of lowest validation loss
minposs = avg_valid_losses.index(min(avg_valid_losses))+1
print('Lowest validation loss at epoch %d' %minposs)
# visualize the loss and learning rate as the network trained
visualize_the_losses(avg_train_losses, avg_valid_losses)
def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
batch_time = AverageMeter()
losses = AverageMeter()
train_acc = AverageMeter()
n_correct, n_total = 0, 0
log_interval = 84
# switch to train mode
model.train()
end = time.time()
# pbar = tqdm(enumerate(train_loader))
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # targets: (batch, 1) before view; inputs: (batch, 1, dim, n_win)
targets = targets.view(-1) # target size:(batch size)
current_sample = inputs.size(0) # batch size
if use_cuda:
inputs = inputs.cuda()
targets = targets.cuda()
_, output = model(inputs) # out size:(batch size, #classes), for softmax
# calculate accuracy of predictions in the current batch
n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item()
n_total += current_sample
train_acc_temp = 100. * n_correct / n_total
train_acc.update(train_acc_temp, inputs.size(0))
loss = criterion(output, targets)
losses.update(loss.item(), inputs.size(0))
# compute gradient and do SGD step
optimizer.zero_grad()
loss.backward()
optimizer.step()
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if batch_idx % log_interval == 0:
print(
'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
'Loss {loss.avg:.4f}\t'
'Acc {train_acc.avg:.4f}'.format(
epoch, batch_idx * len(inputs), len(train_loader.dataset),
100. * batch_idx / len(train_loader),
batch_time=batch_time, loss=losses, train_acc=train_acc))
return losses.avg
def validate(val_loader, model, criterion, use_cuda, epoch):
batch_time = AverageMeter()
losses = AverageMeter()
val_acc = AverageMeter()
n_correct, n_total = 0, 0
# switch to evaluate mode
model.eval()
with torch.no_grad():
end = time.time()
        for i, (inputs, targets) in enumerate(val_loader):
current_sample = inputs.size(0) # batch size
if use_cuda:
inputs = inputs.cuda()
targets = targets.cuda()
# compute output
_, output = model(inputs)
# measure accuracy and record loss
n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item()
n_total += current_sample
val_acc_temp = 100. * n_correct / n_total
val_acc.update(val_acc_temp, inputs.size(0))
loss = criterion(output, targets)
losses.update(loss.item(), inputs.size(0))
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
print(' * Validation: '
'Loss {loss.avg:.4f}\t'
'Acc {val_acc.avg:.4f}'.format(
loss=losses, val_acc=val_acc))
return losses.avg
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def create_optimizer(optimizer, model, new_lr, wd):
# setup optimizer
if optimizer == 'sgd':
optimizer = optim.SGD(model.parameters(), lr=new_lr,
momentum=0.9, dampening=0,
weight_decay=wd)
elif optimizer == 'adam':
optimizer = optim.Adam(model.parameters(), lr=new_lr,
weight_decay=wd)
    elif optimizer == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=new_lr,
                                  weight_decay=wd)
    else:
        raise ValueError('unknown optimizer type: %s' % optimizer)
    return optimizer
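def _resume_example(model, optimizer, ckpt_path):
    """Illustrative sketch (not called anywhere): how a run could be resumed
    from the checkpoints written in main(), which store the keys 'epoch',
    'state_dict' and 'optimizer'."""
    checkpoint = torch.load(ckpt_path)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['epoch']  # the next epoch to train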
def visualize_the_losses(train_loss, valid_loss):
# https://github.com/Bjarten/early-stopping-pytorch/blob/master/MNIST_Early_Stopping_example.ipynb
# visualize the loss as the network trained
fig = plt.figure(figsize=(10,8))
plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss')
plt.plot(range(1,len(valid_loss)+1),valid_loss, label='Validation Loss')
# find position of lowest validation loss
minposs = valid_loss.index(min(valid_loss))+1
plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.ylim(0, 3.5) # consistent scale
plt.xlim(0, len(train_loss)+1) # consistent scale
plt.grid(True)
plt.legend()
plt.tight_layout()
#plt.show()
fig.savefig('loss_plot.png', bbox_inches='tight')
if __name__ == '__main__':
main()

# ===== verification script : speaker verification by cosine scoring =====
import torch
import torch.nn.functional as F
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
if use_cuda:
model.cuda()
    print('=> loading checkpoint')
    checkpoint = torch.load(os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth'))
    model.load_state_dict(checkpoint['state_dict'])
model.eval()
return model
def split_enroll_and_test(dataroot_dir):
DB_all = read_feats_structure(dataroot_dir)
    enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p', regex=False)]
    test_DB = DB_all[DB_all['filename'].str.contains('test.p', regex=False)]
# Reset the index
enroll_DB = enroll_DB.reset_index(drop=True)
test_DB = test_DB.reset_index(drop=True)
return enroll_DB, test_DB
def load_enroll_embeddings(embedding_dir):
embeddings = {}
for f in os.listdir(embedding_dir):
spk = f.replace('.pth','')
# Select the speakers who are in the 'enroll_spk_list'
embedding_path = os.path.join(embedding_dir, f)
tmp_embeddings = torch.load(embedding_path)
embeddings[spk] = tmp_embeddings
return embeddings
def get_embeddings(use_cuda, filename, model, test_frames):
input, label = read_MFB(filename) # input size:(n_frames, n_dims)
tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames'
activation = 0
with torch.no_grad():
for i in range(tot_segments):
temp_input = input[i*test_frames:i*test_frames+test_frames]
TT = ToTensorTestInput()
temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames)
if use_cuda:
temp_input = temp_input.cuda()
temp_activation,_ = model(temp_input)
activation += torch.sum(temp_activation, dim=0, keepdim=True)
activation = l2_norm(activation, 1)
return activation
def l2_norm(input, alpha):
input_size = input.size() # size:(n_frames, dim)
buffer = torch.pow(input, 2) # 2 denotes a squared operation. size:(n_frames, dim)
normp = torch.sum(buffer, 1).add_(1e-10) # size:(n_frames)
norm = torch.sqrt(normp) # size:(n_frames)
_output = torch.div(input, norm.view(-1, 1).expand_as(input))
output = _output.view(input_size)
    # Scale by alpha; https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10,
    # though callers in this repo pass alpha = 1
    output = output * alpha
return output
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
enroll_embedding = embeddings[enroll_speaker]
test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
score = F.cosine_similarity(test_embedding, enroll_embedding)
score = score.data.cpu().numpy()
if score > thres:
result = 'Accept'
else:
result = 'Reject'
    test_spk = test_filename.split('/')[-2].split('_')[0]  # speaker the test file actually came from
    print("\n=== Speaker verification ===")
    print("Claimed speaker : %s\nTrue speaker : %s\n\nResult : %s\n" % (enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" % (float(score), thres))
def main():
log_dir = 'model_saved' # Where the checkpoints are saved
embedding_dir = 'enroll_embeddings' # Where embeddings are saved
test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
# Settings
use_cuda = True # Use cuda or not
embedding_size = 128 # Dimension of speaker embeddings
cp_num = 24 # Which checkpoint to use?
n_classes = 240 # How many speakers in training data?
test_frames = 100 # Split the test utterance
# Load model from checkpoint
model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
# Get the dataframe for test DB
enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
# Load enroll embeddings
embeddings = load_enroll_embeddings(embedding_dir)
""" Test speaker list
'103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
'229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
"""
    # Claimed identity (one of the enrolled speakers)
    enroll_speaker = '230M4087'
    # Speaker whose utterance is actually tested
    test_speaker = '230M4087'
# Threshold
thres = 0.95
test_path = os.path.join(test_dir, test_speaker, 'test.p')
# Perform the test
perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)
if __name__ == '__main__':
main()