Resnet34 + Layer ver, Resnet50 ver commit

김건
Commit cdaaea5f1b07e06a81f5880da426020862f75578 cdaaea5f 1 parent 80274f58
Showing 17 changed files with 2686 additions and 0 deletions
Speaker_Recognition/Speaker_Recognition
Speaker_Recognition/identification3.py
Speaker_Recognition/identification4.py
Speaker_Recognition/identification5.py
Speaker_Recognition/model/model3.py
Speaker_Recognition/model/model4.py
Speaker_Recognition/model/model5.py
Speaker_Recognition/train3.py
Speaker_Recognition/train4.py
Speaker_Recognition/train4_merge.py
Speaker_Recognition/train4_zeroth.py
Speaker_Recognition/train5.py
Speaker_Recognition/verification3.py
Speaker_Recognition/verification4.py
Speaker_Recognition/verification4_merge.py
Speaker_Recognition/verification4_zeroth.py
Speaker_Recognition/verification5.py
--- a/Speaker_Recognition @ df38711f
+++ b/Speaker_Recognition @ df38711f
+Subproject commit df38711f36cfb15ee578d14a70d0141d1d0a8134
--- a/Speaker_Recognition/identification3.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/identification3.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+import pandas as pd
+import math
+import os
+import configure as c
+
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, ToTensorTestInput
+from model.model3 import background_resnet
+
+def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
+    """Build a ``background_resnet`` and restore its weights from a checkpoint.
+
+    Args:
+        use_cuda: Move the model to the GPU when true. When false, the
+            checkpoint tensors are remapped to the CPU while loading.
+        log_dir: Directory holding the ``checkpoint_<cp_num>.pth`` files.
+        cp_num: Number of the checkpoint to load.
+        embedding_size: Passed to the model as ``embedding_size``.
+        n_classes: Passed to the model as ``num_classes``.
+
+    Returns:
+        The model with restored weights, switched to evaluation mode.
+    """
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+    if use_cuda:
+        model.cuda()
+    print('=> loading checkpoint')
+    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
+    # Without map_location, a checkpoint holding GPU tensors cannot be
+    # deserialized on a CPU-only run (use_cuda=False).
+    map_location = None if use_cuda else 'cpu'
+    checkpoint = torch.load(checkpoint_path, map_location=map_location)
+    # The state dict is loaded as-is: no `module.` prefix is stripped, so the
+    # keys must already match this model's parameter names.
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()
+    return model
+
+def split_enroll_and_test(dataroot_dir):
+    """Split the feature DB read from ``dataroot_dir`` into enroll and test rows.
+
+    Rows are selected on the ``filename`` column: names containing the
+    literal text ``enroll.p`` form the enroll set, names containing
+    ``test.p`` form the test set.
+
+    Returns:
+        ``(enroll_DB, test_DB)``, each re-indexed from zero.
+    """
+    DB_all = read_feats_structure(dataroot_dir)
+    filenames = DB_all['filename']
+
+    # regex=False: the '.' has to match a literal dot, not any character.
+    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
+    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]
+
+    # Reset the index
+    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
+
+def load_enroll_embeddings(embedding_dir):
+    """Load every ``<speaker>.pth`` file found in ``embedding_dir``.
+
+    Returns:
+        Dict mapping the speaker id (file name without its ``.pth``
+        extension) to the object stored in that file.
+    """
+    embeddings = {}
+    for f in os.listdir(embedding_dir):
+        # Strip only the trailing extension; str.replace would also remove a
+        # '.pth' that occurs in the middle of the name.
+        spk, ext = os.path.splitext(f)
+        if ext != '.pth':
+            # Not an embedding file; do not hand it to torch.load.
+            continue
+        embeddings[spk] = torch.load(os.path.join(embedding_dir, f))
+
+    return embeddings
+
+def get_embeddings(use_cuda, filename, model, test_frames):
+    """Compute one L2-normalized embedding for a whole utterance.
+
+    The features are cut into consecutive segments of ``test_frames`` frames
+    (the last one may be shorter). Each segment is passed through ``model``
+    and the first model output is summed over all segments before the sum is
+    normalized.
+
+    Args:
+        use_cuda: Move each segment to the GPU before the forward pass.
+        filename: Feature file handed to ``read_MFB``.
+        model: Callable returning a pair whose first item is the embedding.
+        test_frames: Number of frames per segment.
+
+    Returns:
+        The normalized sum of the segment embeddings (first dimension 1).
+
+    Raises:
+        ValueError: If the feature file contains no frames.
+    """
+    feats, _ = read_MFB(filename)  # feats size:(n_frames, n_dims)
+    if len(feats) == 0:
+        # Otherwise the loop never runs and l2_norm() receives the int 0.
+        raise ValueError('no feature frames in %s' % filename)
+
+    tot_segments = math.ceil(len(feats) / test_frames)  # number of segments
+    activation = 0
+    with torch.no_grad():
+        for i in range(tot_segments):
+            first = i * test_frames
+            segment = feats[first:first + test_frames]
+
+            TT = ToTensorTestInput()
+            segment = TT(segment)  # size:(1, 1, n_dims, n_frames)
+
+            if use_cuda:
+                segment = segment.cuda()
+            temp_activation, _ = model(segment)
+            activation += torch.sum(temp_activation, dim=0, keepdim=True)
+
+    return l2_norm(activation, 1)
+
+def l2_norm(input, alpha):
+    """Scale every row of ``input`` to an L2 norm of ``alpha``.
+
+    Args:
+        input: 2-D tensor; the norm is taken along dimension 1.
+        alpha: Factor applied after normalizing (the scaling idea follows
+            https://arxiv.org/pdf/1703.09507.pdf).
+
+    Returns:
+        Tensor with the same size as ``input``.
+    """
+    original_size = input.size()
+    # The small epsilon keeps the division finite for all-zero rows.
+    squared_sum = torch.pow(input, 2).sum(1) + 1e-10
+    row_norm = squared_sum.sqrt().view(-1, 1)
+    normalized = torch.div(input, row_norm.expand_as(input)).view(original_size)
+    return normalized * alpha
+
+def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
+    """Pick the enrolled speaker whose embedding is closest to a test utterance.
+
+    Args:
+        use_cuda: Forwarded to ``get_embeddings``.
+        model: Speaker model used to embed the test utterance.
+        embeddings: Dict mapping speaker id to its enrolled embedding.
+        test_filename: Feature file of the test utterance. The name of its
+            parent directory (up to the first ``_``) is taken as the true
+            speaker id.
+        test_frames: Segment length forwarded to ``get_embeddings``.
+        spk_list: Speaker ids to score against.
+
+    Returns:
+        The id from ``spk_list`` with the highest cosine similarity, or
+        ``None`` when ``spk_list`` is empty.
+
+    Raises:
+        KeyError: If a speaker in ``spk_list`` has no entry in ``embeddings``.
+    """
+    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
+    max_score = float('-inf')
+    best_spk = None
+    for spk in spk_list:
+        # .item() gives a plain Python float, so the comparison below is an
+        # ordinary scalar comparison instead of a numpy-array one.
+        score = F.cosine_similarity(test_embedding, embeddings[spk]).item()
+        if score > max_score:
+            max_score = score
+            best_spk = spk
+    # os.path copes with the platform's separator; split('/') picks the wrong
+    # path component when the path was joined with backslashes.
+    speaker_dir = os.path.basename(os.path.dirname(test_filename))
+    true_spk = speaker_dir.split('_')[0]
+    print("\n=== Speaker identification ===")
+    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
+    return best_spk
+
+def main():
+    """Identify one hard-coded test speaker against the enrolled embeddings."""
+    # Locations
+    log_dir = 'new_model3'  # checkpoints
+    embedding_dir = 'enroll_embeddings3'  # enrolled speaker embeddings
+    test_dir = 'feat_logfbank_nfilt40/test/'  # test features
+
+    # Settings
+    use_cuda = True  # run on the GPU
+    embedding_size = 128  # dimension of speaker embeddings
+    cp_num = 11  # checkpoint number to restore
+    n_classes = 241  # number of speakers in the training data
+    test_frames = 100  # frames per segment of the test utterance
+
+    # Restore the model from the chosen checkpoint
+    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
+
+    # Builds the enroll/test dataframes; the result is not used further below
+    split_enroll_and_test(c.TEST_FEAT_DIR)
+
+    # Enrolled embeddings, keyed by speaker id
+    embeddings = load_enroll_embeddings(embedding_dir)
+
+    # Candidate speakers: the ten test speakers followed by two extra ids
+    spk_list = [
+        '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
+        '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
+        '777M7777', '778M8777',
+    ]
+
+    # Speaker whose test utterance is identified
+    test_speaker = '233F4013'
+    test_path = os.path.join(test_dir, test_speaker, 'test.p')
+
+    # Run the identification; the result is printed by the callee
+    perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
+
+
+if __name__ == '__main__':
+    main()
--- a/Speaker_Recognition/identification4.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/identification4.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+import pandas as pd
+import math
+import os
+import configure as c
+
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, ToTensorTestInput
+from model.model4 import background_resnet
+
+def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
+    """Build a ``background_resnet`` and restore its weights from a checkpoint.
+
+    Args:
+        use_cuda: Move the model to the GPU when true. When false, the
+            checkpoint tensors are remapped to the CPU while loading.
+        log_dir: Directory holding the ``checkpoint_<cp_num>.pth`` files.
+        cp_num: Number of the checkpoint to load.
+        embedding_size: Passed to the model as ``embedding_size``.
+        n_classes: Passed to the model as ``num_classes``.
+
+    Returns:
+        The model with restored weights, switched to evaluation mode.
+    """
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+    if use_cuda:
+        model.cuda()
+    print('=> loading checkpoint')
+    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
+    # Without map_location, a checkpoint holding GPU tensors cannot be
+    # deserialized on a CPU-only run (use_cuda=False).
+    map_location = None if use_cuda else 'cpu'
+    checkpoint = torch.load(checkpoint_path, map_location=map_location)
+    # The state dict is loaded as-is: no `module.` prefix is stripped, so the
+    # keys must already match this model's parameter names.
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()
+    return model
+
+def split_enroll_and_test(dataroot_dir):
+    """Split the feature DB read from ``dataroot_dir`` into enroll and test rows.
+
+    Rows are selected on the ``filename`` column: names containing the
+    literal text ``enroll.p`` form the enroll set, names containing
+    ``test.p`` form the test set.
+
+    Returns:
+        ``(enroll_DB, test_DB)``, each re-indexed from zero.
+    """
+    DB_all = read_feats_structure(dataroot_dir)
+    filenames = DB_all['filename']
+
+    # regex=False: the '.' has to match a literal dot, not any character.
+    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
+    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]
+
+    # Reset the index
+    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
+
+def load_enroll_embeddings(embedding_dir):
+    """Load every ``<speaker>.pth`` file found in ``embedding_dir``.
+
+    Returns:
+        Dict mapping the speaker id (file name without its ``.pth``
+        extension) to the object stored in that file.
+    """
+    embeddings = {}
+    for f in os.listdir(embedding_dir):
+        # Strip only the trailing extension; str.replace would also remove a
+        # '.pth' that occurs in the middle of the name.
+        spk, ext = os.path.splitext(f)
+        if ext != '.pth':
+            # Not an embedding file; do not hand it to torch.load.
+            continue
+        embeddings[spk] = torch.load(os.path.join(embedding_dir, f))
+
+    return embeddings
+
+def get_embeddings(use_cuda, filename, model, test_frames):
+    """Compute one L2-normalized embedding for a whole utterance.
+
+    The features are cut into consecutive segments of ``test_frames`` frames
+    (the last one may be shorter). Each segment is passed through ``model``
+    and the first model output is summed over all segments before the sum is
+    normalized.
+
+    Args:
+        use_cuda: Move each segment to the GPU before the forward pass.
+        filename: Feature file handed to ``read_MFB``.
+        model: Callable returning a pair whose first item is the embedding.
+        test_frames: Number of frames per segment.
+
+    Returns:
+        The normalized sum of the segment embeddings (first dimension 1).
+
+    Raises:
+        ValueError: If the feature file contains no frames.
+    """
+    feats, _ = read_MFB(filename)  # feats size:(n_frames, n_dims)
+    if len(feats) == 0:
+        # Otherwise the loop never runs and l2_norm() receives the int 0.
+        raise ValueError('no feature frames in %s' % filename)
+
+    tot_segments = math.ceil(len(feats) / test_frames)  # number of segments
+    activation = 0
+    with torch.no_grad():
+        for i in range(tot_segments):
+            first = i * test_frames
+            segment = feats[first:first + test_frames]
+
+            TT = ToTensorTestInput()
+            segment = TT(segment)  # size:(1, 1, n_dims, n_frames)
+
+            if use_cuda:
+                segment = segment.cuda()
+            temp_activation, _ = model(segment)
+            activation += torch.sum(temp_activation, dim=0, keepdim=True)
+
+    return l2_norm(activation, 1)
+
+def l2_norm(input, alpha):
+    """Scale every row of ``input`` to an L2 norm of ``alpha``.
+
+    Args:
+        input: 2-D tensor; the norm is taken along dimension 1.
+        alpha: Factor applied after normalizing (the scaling idea follows
+            https://arxiv.org/pdf/1703.09507.pdf).
+
+    Returns:
+        Tensor with the same size as ``input``.
+    """
+    original_size = input.size()
+    # The small epsilon keeps the division finite for all-zero rows.
+    squared_sum = torch.pow(input, 2).sum(1) + 1e-10
+    row_norm = squared_sum.sqrt().view(-1, 1)
+    normalized = torch.div(input, row_norm.expand_as(input)).view(original_size)
+    return normalized * alpha
+
+def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
+    """Pick the enrolled speaker whose embedding is closest to a test utterance.
+
+    Args:
+        use_cuda: Forwarded to ``get_embeddings``.
+        model: Speaker model used to embed the test utterance.
+        embeddings: Dict mapping speaker id to its enrolled embedding.
+        test_filename: Feature file of the test utterance. The name of its
+            parent directory (up to the first ``_``) is taken as the true
+            speaker id.
+        test_frames: Segment length forwarded to ``get_embeddings``.
+        spk_list: Speaker ids to score against.
+
+    Returns:
+        The id from ``spk_list`` with the highest cosine similarity, or
+        ``None`` when ``spk_list`` is empty.
+
+    Raises:
+        KeyError: If a speaker in ``spk_list`` has no entry in ``embeddings``.
+    """
+    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
+    max_score = float('-inf')
+    best_spk = None
+    for spk in spk_list:
+        # .item() gives a plain Python float, so the comparison below is an
+        # ordinary scalar comparison instead of a numpy-array one.
+        score = F.cosine_similarity(test_embedding, embeddings[spk]).item()
+        if score > max_score:
+            max_score = score
+            best_spk = spk
+    # os.path copes with the platform's separator; split('/') picks the wrong
+    # path component when the path was joined with backslashes.
+    speaker_dir = os.path.basename(os.path.dirname(test_filename))
+    true_spk = speaker_dir.split('_')[0]
+    print("\n=== Speaker identification ===")
+    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
+    return best_spk
+
+def main():
+    """Identify one hard-coded test speaker against the enrolled embeddings."""
+    # Locations
+    log_dir = 'new_model4'  # checkpoints
+    embedding_dir = 'enroll_embeddings4'  # enrolled speaker embeddings
+    test_dir = 'feat_logfbank_nfilt40/test/'  # test features
+
+    # Settings
+    use_cuda = True  # run on the GPU
+    embedding_size = 128  # dimension of speaker embeddings
+    cp_num = 25  # checkpoint number to restore
+    n_classes = 241  # number of speakers in the training data
+    test_frames = 100  # frames per segment of the test utterance
+
+    # Restore the model from the chosen checkpoint
+    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
+
+    # Builds the enroll/test dataframes; the result is not used further below
+    split_enroll_and_test(c.TEST_FEAT_DIR)
+
+    # Enrolled embeddings, keyed by speaker id
+    embeddings = load_enroll_embeddings(embedding_dir)
+
+    # Candidate speakers: the ten test speakers followed by two extra ids
+    spk_list = [
+        '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
+        '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
+        '777M7777', '778M8777',
+    ]
+
+    # Speaker whose test utterance is identified
+    test_speaker = '207F2088'
+    test_path = os.path.join(test_dir, test_speaker, 'test.p')
+
+    # Run the identification; the result is printed by the callee
+    perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
+
+
+if __name__ == '__main__':
+    main()
--- a/Speaker_Recognition/identification5.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/identification5.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+import pandas as pd
+import math
+import os
+import configure as c
+
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, ToTensorTestInput
+from model.model5 import background_resnet
+
+def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
+    """Build a ``background_resnet`` and restore its weights from a checkpoint.
+
+    Args:
+        use_cuda: Move the model to the GPU when true. When false, the
+            checkpoint tensors are remapped to the CPU while loading.
+        log_dir: Directory holding the ``checkpoint_<cp_num>.pth`` files.
+        cp_num: Number of the checkpoint to load.
+        embedding_size: Passed to the model as ``embedding_size``.
+        n_classes: Passed to the model as ``num_classes``.
+
+    Returns:
+        The model with restored weights, switched to evaluation mode.
+    """
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+    if use_cuda:
+        model.cuda()
+    print('=> loading checkpoint')
+    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
+    # Without map_location, a checkpoint holding GPU tensors cannot be
+    # deserialized on a CPU-only run (use_cuda=False).
+    map_location = None if use_cuda else 'cpu'
+    checkpoint = torch.load(checkpoint_path, map_location=map_location)
+    # The state dict is loaded as-is: no `module.` prefix is stripped, so the
+    # keys must already match this model's parameter names.
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()
+    return model
+
+def split_enroll_and_test(dataroot_dir):
+    """Split the feature DB read from ``dataroot_dir`` into enroll and test rows.
+
+    Rows are selected on the ``filename`` column: names containing the
+    literal text ``enroll.p`` form the enroll set, names containing
+    ``test.p`` form the test set.
+
+    Returns:
+        ``(enroll_DB, test_DB)``, each re-indexed from zero.
+    """
+    DB_all = read_feats_structure(dataroot_dir)
+    filenames = DB_all['filename']
+
+    # regex=False: the '.' has to match a literal dot, not any character.
+    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
+    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]
+
+    # Reset the index
+    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
+
+def load_enroll_embeddings(embedding_dir):
+    """Load every ``<speaker>.pth`` file found in ``embedding_dir``.
+
+    Returns:
+        Dict mapping the speaker id (file name without its ``.pth``
+        extension) to the object stored in that file.
+    """
+    embeddings = {}
+    for f in os.listdir(embedding_dir):
+        # Strip only the trailing extension; str.replace would also remove a
+        # '.pth' that occurs in the middle of the name.
+        spk, ext = os.path.splitext(f)
+        if ext != '.pth':
+            # Not an embedding file; do not hand it to torch.load.
+            continue
+        embeddings[spk] = torch.load(os.path.join(embedding_dir, f))
+
+    return embeddings
+
+def get_embeddings(use_cuda, filename, model, test_frames):
+    """Compute one L2-normalized embedding for a whole utterance.
+
+    The features are cut into consecutive segments of ``test_frames`` frames
+    (the last one may be shorter). Each segment is passed through ``model``
+    and the first model output is summed over all segments before the sum is
+    normalized.
+
+    Args:
+        use_cuda: Move each segment to the GPU before the forward pass.
+        filename: Feature file handed to ``read_MFB``.
+        model: Callable returning a pair whose first item is the embedding.
+        test_frames: Number of frames per segment.
+
+    Returns:
+        The normalized sum of the segment embeddings (first dimension 1).
+
+    Raises:
+        ValueError: If the feature file contains no frames.
+    """
+    feats, _ = read_MFB(filename)  # feats size:(n_frames, n_dims)
+    if len(feats) == 0:
+        # Otherwise the loop never runs and l2_norm() receives the int 0.
+        raise ValueError('no feature frames in %s' % filename)
+
+    tot_segments = math.ceil(len(feats) / test_frames)  # number of segments
+    activation = 0
+    with torch.no_grad():
+        for i in range(tot_segments):
+            first = i * test_frames
+            segment = feats[first:first + test_frames]
+
+            TT = ToTensorTestInput()
+            segment = TT(segment)  # size:(1, 1, n_dims, n_frames)
+
+            if use_cuda:
+                segment = segment.cuda()
+            temp_activation, _ = model(segment)
+            activation += torch.sum(temp_activation, dim=0, keepdim=True)
+
+    return l2_norm(activation, 1)
+
+def l2_norm(input, alpha):
+    """Scale every row of ``input`` to an L2 norm of ``alpha``.
+
+    Args:
+        input: 2-D tensor; the norm is taken along dimension 1.
+        alpha: Factor applied after normalizing (the scaling idea follows
+            https://arxiv.org/pdf/1703.09507.pdf).
+
+    Returns:
+        Tensor with the same size as ``input``.
+    """
+    original_size = input.size()
+    # The small epsilon keeps the division finite for all-zero rows.
+    squared_sum = torch.pow(input, 2).sum(1) + 1e-10
+    row_norm = squared_sum.sqrt().view(-1, 1)
+    normalized = torch.div(input, row_norm.expand_as(input)).view(original_size)
+    return normalized * alpha
+
+def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
+    """Pick the enrolled speaker whose embedding is closest to a test utterance.
+
+    Args:
+        use_cuda: Forwarded to ``get_embeddings``.
+        model: Speaker model used to embed the test utterance.
+        embeddings: Dict mapping speaker id to its enrolled embedding.
+        test_filename: Feature file of the test utterance. The name of its
+            parent directory (up to the first ``_``) is taken as the true
+            speaker id.
+        test_frames: Segment length forwarded to ``get_embeddings``.
+        spk_list: Speaker ids to score against.
+
+    Returns:
+        The id from ``spk_list`` with the highest cosine similarity, or
+        ``None`` when ``spk_list`` is empty.
+
+    Raises:
+        KeyError: If a speaker in ``spk_list`` has no entry in ``embeddings``.
+    """
+    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
+    max_score = float('-inf')
+    best_spk = None
+    for spk in spk_list:
+        # .item() gives a plain Python float, so the comparison below is an
+        # ordinary scalar comparison instead of a numpy-array one.
+        score = F.cosine_similarity(test_embedding, embeddings[spk]).item()
+        if score > max_score:
+            max_score = score
+            best_spk = spk
+    # os.path copes with the platform's separator; split('/') picks the wrong
+    # path component when the path was joined with backslashes.
+    speaker_dir = os.path.basename(os.path.dirname(test_filename))
+    true_spk = speaker_dir.split('_')[0]
+    print("\n=== Speaker identification ===")
+    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
+    return best_spk
+
+def main():
+    """Identify one hard-coded test speaker against the enrolled embeddings."""
+    # Locations
+    log_dir = 'new_model5'  # checkpoints
+    embedding_dir = 'enroll_embeddings5'  # enrolled speaker embeddings
+    test_dir = 'feat_logfbank_nfilt40/test/'  # test features
+
+    # Settings
+    use_cuda = True  # run on the GPU
+    embedding_size = 128  # dimension of speaker embeddings
+    cp_num = 30  # checkpoint number to restore
+    n_classes = 241  # number of speakers in the training data
+    test_frames = 100  # frames per segment of the test utterance
+
+    # Restore the model from the chosen checkpoint
+    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
+
+    # Builds the enroll/test dataframes; the result is not used further below
+    split_enroll_and_test(c.TEST_FEAT_DIR)
+
+    # Enrolled embeddings, keyed by speaker id
+    embeddings = load_enroll_embeddings(embedding_dir)
+
+    # Candidate speakers: the ten test speakers followed by two extra ids
+    spk_list = [
+        '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
+        '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
+        '777M7777', '778M8777',
+    ]
+
+    # Speaker whose test utterance is identified
+    test_speaker = '207F2088'
+    test_path = os.path.join(test_dir, test_speaker, 'test.p')
+
+    # Run the identification; the result is printed by the callee
+    perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
+
+
+if __name__ == '__main__':
+    main()
--- a/Speaker_Recognition/model/model3.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/model/model3.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Function
+import model.resnet1 as resnet
+
+
+class background_resnet(nn.Module):
+    """ResNet trunk followed by a speaker-embedding layer and a classifier.
+
+    ``forward`` returns the embedding (output of ``fc0``) together with the
+    class scores that ``last`` computes from the batch-normalized,
+    ReLU-activated embedding.
+    """
+
+    def __init__(self, embedding_size, num_classes, backbone='resnet18'):
+        super(background_resnet, self).__init__()
+        self.backbone = backbone
+        supported = ('resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152')
+        if backbone not in supported:
+            raise RuntimeError('unknown backbone: {}'.format(backbone))
+        # The constructor with the same name lives in the project's resnet module.
+        self.pretrained = getattr(resnet, backbone)(pretrained=False)
+
+        self.fc0 = nn.Linear(256, embedding_size)
+        self.bn0 = nn.BatchNorm1d(embedding_size)
+        self.relu = nn.ReLU()
+        self.last = nn.Linear(embedding_size, num_classes)
+
+    def forward(self, x):
+        # input x: minibatch x 1 x 40 x 40 (per the original author's note)
+        trunk = self.pretrained
+        stages = (trunk.conv1, trunk.bn1, trunk.relu,
+                  trunk.layer1, trunk.layer2, trunk.layer3,
+                  trunk.layer4, trunk.layer5)
+        for stage in stages:
+            x = stage(x)
+
+        # Average every channel down to one value, then flatten to
+        # (n_batch, n_channels) so the fully connected layer can follow.
+        pooled = F.adaptive_avg_pool2d(x, 1).view(x.size(0), -1)
+        spk_embedding = self.fc0(pooled)
+        out = self.last(F.relu(self.bn0(spk_embedding)))
+
+        return spk_embedding, out
\ No newline at end of file
--- a/Speaker_Recognition/model/model4.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/model/model4.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Function
+import model.resnet1 as resnet
+
+
+class background_resnet(nn.Module):
+    """ResNet trunk followed by a speaker-embedding layer and a classifier.
+
+    ``forward`` returns the embedding (output of ``fc0``) together with the
+    class scores that ``last`` computes from the batch-normalized,
+    ReLU-activated embedding.
+    """
+
+    def __init__(self, embedding_size, num_classes, backbone='resnet34'):
+        super(background_resnet, self).__init__()
+        self.backbone = backbone
+        supported = ('resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152')
+        if backbone not in supported:
+            raise RuntimeError('unknown backbone: {}'.format(backbone))
+        # The constructor with the same name lives in the project's resnet module.
+        self.pretrained = getattr(resnet, backbone)(pretrained=False)
+
+        # Author's note: changing the input size to e.g. 512 allows using resnet.
+        self.fc0 = nn.Linear(256, embedding_size)
+        self.bn0 = nn.BatchNorm1d(embedding_size)
+        self.relu = nn.ReLU()
+        self.last = nn.Linear(embedding_size, num_classes)
+
+    def forward(self, x):
+        # input x: minibatch x 1 x 40 x 40 (per the original author's note)
+        trunk = self.pretrained
+        stages = (trunk.conv1, trunk.bn1, trunk.relu,
+                  trunk.layer1, trunk.layer2, trunk.layer3,
+                  trunk.layer4, trunk.layer5)
+        for stage in stages:
+            x = stage(x)
+
+        # Average every channel down to one value, then flatten to
+        # (n_batch, n_channels) so the fully connected layer can follow.
+        pooled = F.adaptive_avg_pool2d(x, 1).view(x.size(0), -1)
+        spk_embedding = self.fc0(pooled)
+        out = self.last(F.relu(self.bn0(spk_embedding)))
+
+        return spk_embedding, out
\ No newline at end of file
--- a/Speaker_Recognition/model/model5.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/model/model5.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Function
+import model.resnet1 as resnet
+
+
+class background_resnet(nn.Module):
+    """ResNet trunk followed by a speaker-embedding layer and a classifier.
+
+    ``forward`` returns the embedding (output of ``fc0``) together with the
+    class scores that ``last`` computes from the batch-normalized,
+    ReLU-activated embedding.
+    """
+
+    def __init__(self, embedding_size, num_classes, backbone='resnet50'):
+        super(background_resnet, self).__init__()
+        self.backbone = backbone
+        supported = ('resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152')
+        if backbone not in supported:
+            raise RuntimeError('unknown backbone: {}'.format(backbone))
+        # The constructor with the same name lives in the project's resnet module.
+        self.pretrained = getattr(resnet, backbone)(pretrained=False)
+
+        # Author's note: changing the input size to e.g. 512 allows using resnet.
+        self.fc0 = nn.Linear(512, embedding_size)
+        self.bn0 = nn.BatchNorm1d(embedding_size)
+        self.relu = nn.ReLU()
+        self.last = nn.Linear(embedding_size, num_classes)
+
+    def forward(self, x):
+        # input x: minibatch x 1 x 40 x 40 (per the original author's note)
+        trunk = self.pretrained
+        stages = (trunk.conv1, trunk.bn1, trunk.relu,
+                  trunk.layer1, trunk.layer2, trunk.layer3,
+                  trunk.layer4)
+        for stage in stages:
+            x = stage(x)
+
+        # Average every channel down to one value, then flatten to
+        # (n_batch, n_channels) so the fully connected layer can follow.
+        pooled = F.adaptive_avg_pool2d(x, 1).view(x.size(0), -1)
+        spk_embedding = self.fc0(pooled)
+        out = self.last(F.relu(self.bn0(spk_embedding)))
+
+        return spk_embedding, out
\ No newline at end of file
--- a/Speaker_Recognition/train3.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/train3.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torchvision.transforms as transforms
+
+import time
+import os
+import numpy as np
+import configure as c
+import pandas as pd
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
+from model.model3 import background_resnet
+import matplotlib as mpl
+mpl.use('Agg')
+import matplotlib.pyplot as plt
+
+import pandas as pd
+def load_dataset(val_ratio):
+    """Build the training and validation datasets.
+
+    Args:
+        val_ratio: Percentage of the feature DB under ``c.TRAIN_FEAT_DIR``
+            that goes to the validation split.
+
+    Returns:
+        ``(train_dataset, valid_dataset, n_classes)`` where ``n_classes`` is
+        the number of distinct speaker ids in the training split.
+    """
+    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)
+
+    # Training samples are truncated and then converted to tensors;
+    # validation samples only go through the dev tensor conversion.
+    train_transform = transforms.Compose([
+        TruncatedInputfromMFB(),  # numpy array:(1, n_frames, n_dims)
+        ToTensorInput(),  # torch tensor:(1, n_dims, n_frames)
+    ])
+    valid_transform = ToTensorDevInput()
+
+    # Class indices follow the sorted speaker ids of the training split.
+    speaker_list = sorted(set(train_DB['speaker_id']))
+    spk_to_idx = dict(zip(speaker_list, range(len(speaker_list))))
+
+    train_dataset = DvectorDataset(DB=train_DB, loader=read_MFB,
+                                   transform=train_transform, spk_to_idx=spk_to_idx)
+    valid_dataset = DvectorDataset(DB=valid_DB, loader=read_MFB,
+                                   transform=valid_transform, spk_to_idx=spk_to_idx)
+
+    return train_dataset, valid_dataset, len(speaker_list)
+
+def split_train_dev(train_feat_dir, valid_ratio):
+    """Randomly split the feature DB into a training and a validation part.
+
+    Args:
+        train_feat_dir: Directory handed to ``read_feats_structure``.
+        valid_ratio: Percentage (0-100) of the rows used for validation.
+
+    Returns:
+        ``(train_DB, valid_DB)``, each re-indexed from zero.
+
+    Raises:
+        ValueError: If no feature rows are found.
+    """
+    train_valid_DB = read_feats_structure(train_feat_dir)
+    total_len = len(train_valid_DB)
+    if total_len == 0:
+        # The percentage report below would otherwise divide by zero.
+        raise ValueError('no feature files found under %s' % train_feat_dir)
+    valid_len = int(total_len * valid_ratio/100.)
+    train_len = total_len - valid_len
+    # sample(frac=1) returns all rows in random order.
+    shuffled_train_valid_DB = train_valid_DB.sample(frac=1).reset_index(drop=True)
+    # Split the DB into train and valid set, each with a fresh index
+    train_DB = shuffled_train_valid_DB.iloc[:train_len].reset_index(drop=True)
+    valid_DB = shuffled_train_valid_DB.iloc[train_len:].reset_index(drop=True)
+    print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
+    print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
+    print('Total %d utts' %(total_len))
+
+    return train_DB, valid_DB
+
+def main():
+    """Train the speaker model, save a checkpoint per epoch and plot the losses."""
+    # Set hyperparameters
+    use_cuda = True # use gpu or cpu
+    val_ratio = 10 # Percentage of validation set
+    embedding_size = 128
+    start = 1 # Start epoch
+    n_epochs = 30 # How many epochs?
+    end = start + n_epochs # Last epoch
+
+    lr = 1e-1 # Initial learning rate
+    wd = 1e-4 # Weight decay (L2 penalty)
+    optimizer_type = 'sgd' # ex) sgd, adam, adagrad
+
+    batch_size = 64 # Batch size for training
+    valid_batch_size = 16 # Batch size for validation
+    use_shuffle = True # Shuffle for training or not
+
+    # Load dataset
+    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)
+
+    # print the experiment configuration
+    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))
+
+    log_dir = 'new_model3' # where to save checkpoints
+
+    # exist_ok avoids the check-then-create race of a separate exists() test.
+    os.makedirs(log_dir, exist_ok=True)
+
+    # instantiate model and initialize weights
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+
+    if use_cuda:
+        model.cuda()
+
+    # define loss function (criterion), optimizer and scheduler
+    criterion = nn.CrossEntropyLoss()
+    optimizer = create_optimizer(optimizer_type, model, lr, wd)
+    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)
+
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                                       batch_size=batch_size,
+                                                       shuffle=use_shuffle)
+    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
+                                                       batch_size=valid_batch_size,
+                                                       shuffle=False,
+                                                       collate_fn = collate_fn_feat_padded)
+
+    # to track the average training loss per epoch as the model trains
+    avg_train_losses = []
+    # to track the average validation loss per epoch as the model trains
+    avg_valid_losses = []
+
+
+    for epoch in range(start, end):
+
+        # train for one epoch
+        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)
+
+        # evaluate on validation set
+        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)
+
+        # The scheduler only needs the monitored metric; passing the epoch
+        # as a second argument is deprecated.
+        scheduler.step(valid_loss)
+
+        # calculate average loss over an epoch
+        avg_train_losses.append(train_loss)
+        avg_valid_losses.append(valid_loss)
+        # do checkpointing
+        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
+                    'optimizer': optimizer.state_dict()},
+                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))
+
+    # find position of lowest validation loss
+    minposs = avg_valid_losses.index(min(avg_valid_losses))+1
+    print('Lowest validation loss at epoch %d' %minposs)
+
+    # visualize the loss and learning rate as the network trained
+    visualize_the_losses(avg_train_losses, avg_valid_losses)
+
+
+def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
+    """Run one training epoch.
+
+    Args:
+        train_loader: Iterable yielding ``(inputs, targets)`` batches.
+        model: Network returning a pair whose second item holds class scores.
+        criterion: Loss applied to ``(class scores, targets)``.
+        optimizer: Optimizer stepped once per batch.
+        use_cuda: Move each batch to the GPU before the forward pass.
+        epoch: Epoch number, used for the progress output only.
+        n_classes: Unused; kept so existing callers keep working.
+
+    Returns:
+        The batch losses averaged with the batch sizes as weights.
+    """
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    train_acc = AverageMeter()
+
+    log_interval = 84
+    # switch to train mode
+    model.train()
+
+    end = time.time()
+    for batch_idx, (data) in enumerate(train_loader):
+        inputs, targets = data  # target size:(batch size,1), input size:(batch size, 1, dim, win)
+        targets = targets.view(-1) # target size:(batch size)
+        current_sample = inputs.size(0)  # batch size
+
+        if use_cuda:
+            inputs = inputs.cuda()
+            targets = targets.cuda()
+        _, output = model(inputs) # out size:(batch size, #classes), for softmax
+
+        # Feed the meter the accuracy of THIS batch, weighted by its size, so
+        # that its average is the accuracy over all samples seen so far.
+        # Feeding it an already-cumulative accuracy averages twice.
+        predictions = torch.max(output, 1)[1].long().view(targets.size())
+        batch_correct = (predictions == targets).sum().item()
+        train_acc.update(100. * batch_correct / current_sample, current_sample)
+
+        loss = criterion(output, targets)
+        losses.update(loss.item(), current_sample)
+
+        # compute gradient and do SGD step
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        if batch_idx % log_interval == 0:
+            print(
+                    'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
+                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
+                    'Loss {loss.avg:.4f}\t'
+                    'Acc {train_acc.avg:.4f}'.format(
+                     epoch, batch_idx * len(inputs), len(train_loader.dataset),
+                     100. * batch_idx / len(train_loader),
+                     batch_time=batch_time, loss=losses, train_acc=train_acc))
+    return losses.avg
+
+def validate(val_loader, model, criterion, use_cuda, epoch):
+    """Evaluate the model on the validation set without gradient tracking.
+
+    Args:
+        val_loader: Iterable yielding ``(inputs, targets)`` batches.
+        model: Network returning a pair whose second item holds class scores.
+        criterion: Loss applied to ``(class scores, targets)``.
+        use_cuda: Move each batch to the GPU before the forward pass.
+        epoch: Unused; kept so existing callers keep working.
+
+    Returns:
+        The batch losses averaged with the batch sizes as weights.
+    """
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    val_acc = AverageMeter()
+
+    # switch to evaluate mode
+    model.eval()
+
+    with torch.no_grad():
+        end = time.time()
+        for i, (data) in enumerate(val_loader):
+            inputs, targets = data
+            current_sample = inputs.size(0)  # batch size
+
+            if use_cuda:
+                inputs = inputs.cuda()
+                targets = targets.cuda()
+
+            # compute output
+            _, output = model(inputs)
+
+            # Feed the meter the accuracy of THIS batch, weighted by its size,
+            # so that its average is the accuracy over the whole set. Feeding
+            # it an already-cumulative accuracy averages twice.
+            predictions = torch.max(output, 1)[1].long().view(targets.size())
+            batch_correct = (predictions == targets).sum().item()
+            val_acc.update(100. * batch_correct / current_sample, current_sample)
+
+            loss = criterion(output, targets)
+            losses.update(loss.item(), current_sample)
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+        print('  * Validation: '
+                  'Loss {loss.avg:.4f}\t'
+                  'Acc {val_acc.avg:.4f}'.format(
+                  loss=losses, val_acc=val_acc))
+
+    return losses.avg
+
+class AverageMeter(object):
+    """Track the latest value of a metric and its running weighted mean."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        """Set every statistic back to zero."""
+        self.val = self.avg = self.sum = self.count = 0
+
+    def update(self, val, n=1):
+        """Record `val` as observed `n` times and refresh the mean."""
+        self.val = val
+        self.count += n
+        self.sum += val * n
+        self.avg = self.sum / self.count
+
+def create_optimizer(optimizer, model, new_lr, wd):
+    """Build the torch optimizer selected by name for `model`'s parameters.
+
+    Args:
+        optimizer: one of 'sgd', 'adam' or 'adagrad'.
+        model: module whose parameters() are optimized.
+        new_lr: learning rate.
+        wd: weight decay.
+
+    Returns:
+        The constructed torch.optim optimizer.
+
+    Raises:
+        ValueError: if `optimizer` is not a supported name.
+    """
+    params = model.parameters()
+    if optimizer == 'sgd':
+        return optim.SGD(params, lr=new_lr,
+                         momentum=0.9, dampening=0,
+                         weight_decay=wd)
+    if optimizer == 'adam':
+        return optim.Adam(params, lr=new_lr, weight_decay=wd)
+    if optimizer == 'adagrad':
+        return optim.Adagrad(params, lr=new_lr, weight_decay=wd)
+    # An unknown name used to fall through and hand the name string back,
+    # which only failed later when the caller tried to use it as an optimizer.
+    raise ValueError('Unsupported optimizer type: {!r}'.format(optimizer))
+
+def visualize_the_losses(train_loss, valid_loss):
+    """Plot per-epoch training and validation loss and save to 'train3.png'.
+
+    Args:
+        train_loss: list of training losses, one per epoch.
+        valid_loss: list of validation losses, one per epoch.
+
+    A dashed vertical line marks the epoch with the lowest validation loss;
+    it is omitted when `valid_loss` is empty.
+    """
+    fig = plt.figure(figsize=(10,8))
+    plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss')
+    plt.plot(range(1,len(valid_loss)+1),valid_loss, label='Validation Loss')
+
+    # find position of lowest validation loss (min() would raise on an empty list)
+    if valid_loss:
+        minposs = valid_loss.index(min(valid_loss))+1
+        plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint')
+
+    plt.xlabel('epochs')
+    plt.ylabel('loss')
+    plt.ylim(0, 3.5) # consistent scale
+    plt.xlim(0, len(train_loss)+1) # consistent scale
+    plt.grid(True)
+    plt.legend()
+    plt.tight_layout()
+    #plt.show()
+    fig.savefig('train3.png', bbox_inches='tight')
+    # pyplot keeps every figure alive until it is closed explicitly.
+    plt.close(fig)
+
+# Call main() only when this file is run as a script, not when it is imported.
+if __name__ == '__main__':
+    main()
--- a/Speaker_Recognition/train4.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/train4.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torchvision.transforms as transforms
+
+import time
+import os
+import numpy as np
+import configure as c
+import pandas as pd
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
+from model.model4 import background_resnet
+import matplotlib as mpl
+mpl.use('Agg')
+import matplotlib.pyplot as plt
+import pandas as pd
+def load_dataset(val_ratio):
+    """Build the training and validation datasets.
+
+    Args:
+        val_ratio: percentage of the data passed on to split_train_dev
+            for the validation split.
+
+    Returns:
+        (train_dataset, valid_dataset, n_classes), where n_classes is the
+        number of distinct speaker ids in the training split.
+    """
+    # Split training set into training set and validation set according to "val_ratio"
+    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)
+
+    # The label map is built from the training split only.
+    # NOTE(review): a speaker that lands only in valid_DB has no index here;
+    # confirm how DvectorDataset handles that.
+    speakers = sorted(set(train_DB['speaker_id'])) # len(speakers) == n_speakers
+    spk_to_idx = dict(zip(speakers, range(len(speakers))))
+
+    train_transform = transforms.Compose([
+        TruncatedInputfromMFB(), # numpy array:(1, n_frames, n_dims)
+        ToTensorInput() # torch tensor:(1, n_dims, n_frames)
+    ])
+    valid_transform = ToTensorDevInput()
+
+    # read_MFB loads one feature file -> numpy array:(n_frames, n_dims)
+    train_dataset = DvectorDataset(DB=train_DB, loader=read_MFB, transform=train_transform, spk_to_idx=spk_to_idx)
+    valid_dataset = DvectorDataset(DB=valid_DB, loader=read_MFB, transform=valid_transform, spk_to_idx=spk_to_idx)
+
+    return train_dataset, valid_dataset, len(speakers)
+
+def split_train_dev(train_feat_dir, valid_ratio):
+    """Randomly split the feature DB into a training and a validation part.
+
+    Args:
+        train_feat_dir: directory handed to read_feats_structure.
+        valid_ratio: percentage (0-100) of rows assigned to validation.
+
+    Returns:
+        (train_DB, valid_DB), each re-indexed from zero.
+    """
+    full_DB = read_feats_structure(train_feat_dir)
+    total_len = len(full_DB)
+    valid_len = int(total_len * valid_ratio/100.)
+    train_len = total_len - valid_len
+
+    # Shuffle all rows, then cut: head is training, tail is validation.
+    shuffled = full_DB.sample(frac=1).reset_index(drop=True)
+    train_DB = shuffled.iloc[:train_len].reset_index(drop=True)
+    valid_DB = shuffled.iloc[train_len:].reset_index(drop=True)
+
+    print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
+    print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
+    print('Total %d utts' %(total_len))
+
+    return train_DB, valid_DB
+
+def main():
+    """Train the speaker-classification network and checkpoint every epoch.
+
+    Hyperparameters are fixed below. After each epoch the validation loss
+    drives the ReduceLROnPlateau scheduler and a checkpoint is written to
+    `log_dir`; at the end the loss curves are plotted.
+    """
+    # Set hyperparameters
+    use_cuda = True # use gpu or cpu
+    val_ratio = 10 # Percentage of validation set
+    embedding_size = 128
+    start = 1 # Start epoch
+    n_epochs = 30 # How many epochs?
+    end = start + n_epochs # Last epoch
+
+    lr = 1e-1 # Initial learning rate
+    wd = 1e-4 # Weight decay (L2 penalty)
+    optimizer_type = 'sgd' # ex) sgd, adam, adagrad
+
+    batch_size = 64 # Batch size for training
+    valid_batch_size = 16 # Batch size for validation
+    use_shuffle = True # Shuffle for training or not
+
+    # Load dataset
+    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)
+
+    # print the experiment configuration
+    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))
+
+    log_dir = 'new_model4' # where to save checkpoints
+
+    # exist_ok avoids the check-then-create race of exists() + makedirs()
+    os.makedirs(log_dir, exist_ok=True)
+
+    # instantiate model and initialize weights
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+
+    if use_cuda:
+        model.cuda()
+
+    # define loss function (criterion), optimizer and scheduler
+    criterion = nn.CrossEntropyLoss()
+    optimizer = create_optimizer(optimizer_type, model, lr, wd)
+    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)
+
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                                       batch_size=batch_size,
+                                                       shuffle=use_shuffle)
+    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
+                                                       batch_size=valid_batch_size,
+                                                       shuffle=False,
+                                                       collate_fn = collate_fn_feat_padded)
+
+    # to track the average training loss per epoch as the model trains
+    avg_train_losses = []
+    # to track the average validation loss per epoch as the model trains
+    avg_valid_losses = []
+
+
+    for epoch in range(start, end):
+
+        # train for one epoch
+        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)
+
+        # evaluate on validation set
+        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)
+
+        # Passing `epoch` to step() is deprecated; the metric alone is enough.
+        scheduler.step(valid_loss)
+
+        # calculate average loss over an epoch
+        avg_train_losses.append(train_loss)
+        avg_valid_losses.append(valid_loss)
+        # do checkpointing
+        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
+                    'optimizer': optimizer.state_dict()},
+                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))
+
+    # find the epoch of lowest validation loss; offset by `start` so the
+    # number matches the checkpoint file name even when start != 1
+    minposs = start + avg_valid_losses.index(min(avg_valid_losses))
+    print('Lowest validation loss at epoch %d' %minposs)
+
+    # visualize the loss and learning rate as the network trained
+    visualize_the_losses(avg_train_losses, avg_valid_losses)
+
+
+def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
+    """Run one training epoch and return the average training loss.
+
+    Args:
+        train_loader: iterable of (inputs, targets) batches; it must also
+            support len() and expose a `dataset` attribute for logging.
+        model: network whose forward pass returns a pair; the second
+            element is used as the class scores.
+        criterion: loss called as criterion(output, targets).
+        optimizer: optimizer stepped once per batch.
+        use_cuda: if True, each batch is moved to the GPU first.
+        epoch: epoch number, used only in the progress log.
+        n_classes: unused; kept so existing callers keep working.
+
+    Returns:
+        Sample-weighted mean loss over all batches of the epoch.
+    """
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    train_acc = AverageMeter()
+
+    log_interval = 84
+    # switch to train mode
+    model.train()
+
+    end = time.time()
+    for batch_idx, (inputs, targets) in enumerate(train_loader):
+        # target size:(batch size,1), input size:(batch size, 1, dim, win)
+        targets = targets.view(-1) # target size:(batch size)
+        current_sample = inputs.size(0)  # batch size
+
+        if use_cuda:
+            inputs = inputs.cuda()
+            targets = targets.cuda()
+        _, output = model(inputs) # out size:(batch size, #classes), for softmax
+
+        # Feed the meter the accuracy of THIS batch only. Feeding it the
+        # cumulative accuracy (as before) averaged running averages, so the
+        # logged value was not the true epoch accuracy.
+        predictions = torch.max(output, 1)[1].long().view(targets.size())
+        batch_correct = (predictions == targets).sum().item()
+        train_acc.update(100. * batch_correct / current_sample, current_sample)
+
+        loss = criterion(output, targets)
+        losses.update(loss.item(), current_sample)
+
+        # compute gradient and do SGD step
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        if batch_idx % log_interval == 0:
+            print(
+                    'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
+                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
+                    'Loss {loss.avg:.4f}\t'
+                    'Acc {train_acc.avg:.4f}'.format(
+                     epoch, batch_idx * len(inputs), len(train_loader.dataset),
+                     100. * batch_idx / len(train_loader),
+                     batch_time=batch_time, loss=losses, train_acc=train_acc))
+    return losses.avg
+
+def validate(val_loader, model, criterion, use_cuda, epoch):
+    """Evaluate `model` on `val_loader` and return the average loss.
+
+    Args:
+        val_loader: iterable yielding (inputs, targets) batches.
+        model: network whose forward pass returns a pair; the second
+            element is used as the class scores.
+        criterion: loss called as criterion(output, targets).
+        use_cuda: if True, each batch is moved to the GPU first.
+        epoch: unused; kept so existing callers keep working.
+
+    Returns:
+        Sample-weighted mean loss over all validation batches.
+    """
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    val_acc = AverageMeter()
+
+    # switch to evaluate mode
+    model.eval()
+
+    with torch.no_grad():
+        end = time.time()
+        for inputs, targets in val_loader:
+            current_sample = inputs.size(0)  # batch size
+
+            if use_cuda:
+                inputs = inputs.cuda()
+                targets = targets.cuda()
+
+            # compute output
+            _, output = model(inputs)
+
+            # Feed the meter the accuracy of THIS batch only. Feeding it the
+            # cumulative accuracy (as before) averaged running averages, so
+            # the reported value was not the true accuracy.
+            predictions = torch.max(output, 1)[1].long().view(targets.size())
+            batch_correct = (predictions == targets).sum().item()
+            val_acc.update(100. * batch_correct / current_sample, current_sample)
+
+            loss = criterion(output, targets)
+            losses.update(loss.item(), current_sample)
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+        print('  * Validation: '
+                  'Loss {loss.avg:.4f}\t'
+                  'Acc {val_acc.avg:.4f}'.format(
+                  loss=losses, val_acc=val_acc))
+
+    return losses.avg
+
+class AverageMeter(object):
+    """Track the latest value of a metric and its running weighted mean."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        """Set every statistic back to zero."""
+        self.val = self.avg = self.sum = self.count = 0
+
+    def update(self, val, n=1):
+        """Record `val` as observed `n` times and refresh the mean."""
+        self.val = val
+        self.count += n
+        self.sum += val * n
+        self.avg = self.sum / self.count
+
+def create_optimizer(optimizer, model, new_lr, wd):
+    """Build the torch optimizer selected by name for `model`'s parameters.
+
+    Args:
+        optimizer: one of 'sgd', 'adam' or 'adagrad'.
+        model: module whose parameters() are optimized.
+        new_lr: learning rate.
+        wd: weight decay.
+
+    Returns:
+        The constructed torch.optim optimizer.
+
+    Raises:
+        ValueError: if `optimizer` is not a supported name.
+    """
+    params = model.parameters()
+    if optimizer == 'sgd':
+        return optim.SGD(params, lr=new_lr,
+                         momentum=0.9, dampening=0,
+                         weight_decay=wd)
+    if optimizer == 'adam':
+        return optim.Adam(params, lr=new_lr, weight_decay=wd)
+    if optimizer == 'adagrad':
+        return optim.Adagrad(params, lr=new_lr, weight_decay=wd)
+    # An unknown name used to fall through and hand the name string back,
+    # which only failed later when the caller tried to use it as an optimizer.
+    raise ValueError('Unsupported optimizer type: {!r}'.format(optimizer))
+
+def visualize_the_losses(train_loss, valid_loss):
+    """Plot per-epoch training and validation loss and save to 'train4.png'.
+
+    Args:
+        train_loss: list of training losses, one per epoch.
+        valid_loss: list of validation losses, one per epoch.
+
+    A dashed vertical line marks the epoch with the lowest validation loss;
+    it is omitted when `valid_loss` is empty.
+    """
+    fig = plt.figure(figsize=(10,8))
+    plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss')
+    plt.plot(range(1,len(valid_loss)+1),valid_loss, label='Validation Loss')
+
+    # find position of lowest validation loss (min() would raise on an empty list)
+    if valid_loss:
+        minposs = valid_loss.index(min(valid_loss))+1
+        plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint')
+
+    plt.xlabel('epochs')
+    plt.ylabel('loss')
+    plt.ylim(0, 3.5) # consistent scale
+    plt.xlim(0, len(train_loss)+1) # consistent scale
+    plt.grid(True)
+    plt.legend()
+    plt.tight_layout()
+    #plt.show()
+    fig.savefig('train4.png', bbox_inches='tight')
+    # pyplot keeps every figure alive until it is closed explicitly.
+    plt.close(fig)
+
+# Call main() only when this file is run as a script, not when it is imported.
+if __name__ == '__main__':
+    main()
--- a/Speaker_Recognition/train4_merge.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/train4_merge.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torchvision.transforms as transforms
+
+import time
+import os
+import numpy as np
+import configure1_merge as c
+import pandas as pd
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
+from model.model4 import background_resnet
+import matplotlib as mpl
+mpl.use('Agg')
+import matplotlib.pyplot as plt
+
+import pandas as pd
+def load_dataset(val_ratio):
+    """Build the training and validation datasets.
+
+    Args:
+        val_ratio: percentage of the data passed on to split_train_dev
+            for the validation split.
+
+    Returns:
+        (train_dataset, valid_dataset, n_classes), where n_classes is the
+        number of distinct speaker ids in the training split.
+    """
+    # Split training set into training set and validation set according to "val_ratio"
+    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)
+
+    # The label map is built from the training split only.
+    # NOTE(review): a speaker that lands only in valid_DB has no index here;
+    # confirm how DvectorDataset handles that.
+    speakers = sorted(set(train_DB['speaker_id'])) # len(speakers) == n_speakers
+    spk_to_idx = dict(zip(speakers, range(len(speakers))))
+
+    train_transform = transforms.Compose([
+        TruncatedInputfromMFB(), # numpy array:(1, n_frames, n_dims)
+        ToTensorInput() # torch tensor:(1, n_dims, n_frames)
+    ])
+    valid_transform = ToTensorDevInput()
+
+    # read_MFB loads one feature file -> numpy array:(n_frames, n_dims)
+    train_dataset = DvectorDataset(DB=train_DB, loader=read_MFB, transform=train_transform, spk_to_idx=spk_to_idx)
+    valid_dataset = DvectorDataset(DB=valid_DB, loader=read_MFB, transform=valid_transform, spk_to_idx=spk_to_idx)
+
+    return train_dataset, valid_dataset, len(speakers)
+
+def split_train_dev(train_feat_dir, valid_ratio):
+    """Randomly split the feature DB into a training and a validation part.
+
+    Args:
+        train_feat_dir: directory handed to read_feats_structure.
+        valid_ratio: percentage (0-100) of rows assigned to validation.
+
+    Returns:
+        (train_DB, valid_DB), each re-indexed from zero.
+    """
+    full_DB = read_feats_structure(train_feat_dir)
+    total_len = len(full_DB)
+    valid_len = int(total_len * valid_ratio/100.)
+    train_len = total_len - valid_len
+
+    # Shuffle all rows, then cut: head is training, tail is validation.
+    shuffled = full_DB.sample(frac=1).reset_index(drop=True)
+    train_DB = shuffled.iloc[:train_len].reset_index(drop=True)
+    valid_DB = shuffled.iloc[train_len:].reset_index(drop=True)
+
+    print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
+    print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
+    print('Total %d utts' %(total_len))
+
+    return train_DB, valid_DB
+
+def main():
+    """Train the speaker-classification network and checkpoint every epoch.
+
+    Hyperparameters are fixed below. After each epoch the validation loss
+    drives the ReduceLROnPlateau scheduler and a checkpoint is written to
+    `log_dir`; at the end the loss curves are plotted.
+    """
+    # Set hyperparameters
+    use_cuda = True # use gpu or cpu
+    val_ratio = 10 # Percentage of validation set
+    embedding_size = 128
+    start = 1 # Start epoch
+    n_epochs = 60 # How many epochs?
+    end = start + n_epochs # Last epoch
+
+    lr = 1e-1 # Initial learning rate
+    wd = 1e-4 # Weight decay (L2 penalty)
+    optimizer_type = 'sgd' # ex) sgd, adam, adagrad
+
+    batch_size = 64 # Batch size for training
+    valid_batch_size = 16 # Batch size for validation
+    use_shuffle = True # Shuffle for training or not
+
+    # Load dataset
+    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)
+
+    # print the experiment configuration
+    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))
+
+    log_dir = 'new_model4_merge' # where to save checkpoints
+
+    # exist_ok avoids the check-then-create race of exists() + makedirs()
+    os.makedirs(log_dir, exist_ok=True)
+
+    # instantiate model and initialize weights
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+
+    if use_cuda:
+        model.cuda()
+
+    # define loss function (criterion), optimizer and scheduler
+    criterion = nn.CrossEntropyLoss()
+    optimizer = create_optimizer(optimizer_type, model, lr, wd)
+    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)
+
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                                       batch_size=batch_size,
+                                                       shuffle=use_shuffle)
+    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
+                                                       batch_size=valid_batch_size,
+                                                       shuffle=False,
+                                                       collate_fn = collate_fn_feat_padded)
+
+    # to track the average training loss per epoch as the model trains
+    avg_train_losses = []
+    # to track the average validation loss per epoch as the model trains
+    avg_valid_losses = []
+
+
+    for epoch in range(start, end):
+
+        # train for one epoch
+        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)
+
+        # evaluate on validation set
+        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)
+
+        # Passing `epoch` to step() is deprecated; the metric alone is enough.
+        scheduler.step(valid_loss)
+
+        # calculate average loss over an epoch
+        avg_train_losses.append(train_loss)
+        avg_valid_losses.append(valid_loss)
+        # do checkpointing
+        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
+                    'optimizer': optimizer.state_dict()},
+                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))
+
+    # find the epoch of lowest validation loss; offset by `start` so the
+    # number matches the checkpoint file name even when start != 1
+    minposs = start + avg_valid_losses.index(min(avg_valid_losses))
+    print('Lowest validation loss at epoch %d' %minposs)
+
+    # visualize the loss and learning rate as the network trained
+    visualize_the_losses(avg_train_losses, avg_valid_losses)
+
+
+def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
+    """Run one training epoch and return the average training loss.
+
+    Args:
+        train_loader: iterable of (inputs, targets) batches; it must also
+            support len() and expose a `dataset` attribute for logging.
+        model: network whose forward pass returns a pair; the second
+            element is used as the class scores.
+        criterion: loss called as criterion(output, targets).
+        optimizer: optimizer stepped once per batch.
+        use_cuda: if True, each batch is moved to the GPU first.
+        epoch: epoch number, used only in the progress log.
+        n_classes: unused; kept so existing callers keep working.
+
+    Returns:
+        Sample-weighted mean loss over all batches of the epoch.
+    """
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    train_acc = AverageMeter()
+
+    log_interval = 84
+    # switch to train mode
+    model.train()
+
+    end = time.time()
+    for batch_idx, (inputs, targets) in enumerate(train_loader):
+        # target size:(batch size,1), input size:(batch size, 1, dim, win)
+        targets = targets.view(-1) # target size:(batch size)
+        current_sample = inputs.size(0)  # batch size
+
+        if use_cuda:
+            inputs = inputs.cuda()
+            targets = targets.cuda()
+        _, output = model(inputs) # out size:(batch size, #classes), for softmax
+
+        # Feed the meter the accuracy of THIS batch only. Feeding it the
+        # cumulative accuracy (as before) averaged running averages, so the
+        # logged value was not the true epoch accuracy.
+        predictions = torch.max(output, 1)[1].long().view(targets.size())
+        batch_correct = (predictions == targets).sum().item()
+        train_acc.update(100. * batch_correct / current_sample, current_sample)
+
+        loss = criterion(output, targets)
+        losses.update(loss.item(), current_sample)
+
+        # compute gradient and do SGD step
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        if batch_idx % log_interval == 0:
+            print(
+                    'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
+                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
+                    'Loss {loss.avg:.4f}\t'
+                    'Acc {train_acc.avg:.4f}'.format(
+                     epoch, batch_idx * len(inputs), len(train_loader.dataset),
+                     100. * batch_idx / len(train_loader),
+                     batch_time=batch_time, loss=losses, train_acc=train_acc))
+    return losses.avg
+
+def validate(val_loader, model, criterion, use_cuda, epoch):
+    """Evaluate `model` on `val_loader` and return the average loss.
+
+    Args:
+        val_loader: iterable yielding (inputs, targets) batches.
+        model: network whose forward pass returns a pair; the second
+            element is used as the class scores.
+        criterion: loss called as criterion(output, targets).
+        use_cuda: if True, each batch is moved to the GPU first.
+        epoch: unused; kept so existing callers keep working.
+
+    Returns:
+        Sample-weighted mean loss over all validation batches.
+    """
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    val_acc = AverageMeter()
+
+    # switch to evaluate mode
+    model.eval()
+
+    with torch.no_grad():
+        end = time.time()
+        for inputs, targets in val_loader:
+            current_sample = inputs.size(0)  # batch size
+
+            if use_cuda:
+                inputs = inputs.cuda()
+                targets = targets.cuda()
+
+            # compute output
+            _, output = model(inputs)
+
+            # Feed the meter the accuracy of THIS batch only. Feeding it the
+            # cumulative accuracy (as before) averaged running averages, so
+            # the reported value was not the true accuracy.
+            predictions = torch.max(output, 1)[1].long().view(targets.size())
+            batch_correct = (predictions == targets).sum().item()
+            val_acc.update(100. * batch_correct / current_sample, current_sample)
+
+            loss = criterion(output, targets)
+            losses.update(loss.item(), current_sample)
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+        print('  * Validation: '
+                  'Loss {loss.avg:.4f}\t'
+                  'Acc {val_acc.avg:.4f}'.format(
+                  loss=losses, val_acc=val_acc))
+
+    return losses.avg
+
+class AverageMeter(object):
+    """Track the latest value of a metric and its running weighted mean."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        """Set every statistic back to zero."""
+        self.val = self.avg = self.sum = self.count = 0
+
+    def update(self, val, n=1):
+        """Record `val` as observed `n` times and refresh the mean."""
+        self.val = val
+        self.count += n
+        self.sum += val * n
+        self.avg = self.sum / self.count
+
+def create_optimizer(optimizer, model, new_lr, wd):
+    """Build the torch optimizer selected by name for `model`'s parameters.
+
+    Args:
+        optimizer: one of 'sgd', 'adam' or 'adagrad'.
+        model: module whose parameters() are optimized.
+        new_lr: learning rate.
+        wd: weight decay.
+
+    Returns:
+        The constructed torch.optim optimizer.
+
+    Raises:
+        ValueError: if `optimizer` is not a supported name.
+    """
+    params = model.parameters()
+    if optimizer == 'sgd':
+        return optim.SGD(params, lr=new_lr,
+                         momentum=0.9, dampening=0,
+                         weight_decay=wd)
+    if optimizer == 'adam':
+        return optim.Adam(params, lr=new_lr, weight_decay=wd)
+    if optimizer == 'adagrad':
+        return optim.Adagrad(params, lr=new_lr, weight_decay=wd)
+    # An unknown name used to fall through and hand the name string back,
+    # which only failed later when the caller tried to use it as an optimizer.
+    raise ValueError('Unsupported optimizer type: {!r}'.format(optimizer))
+
+def visualize_the_losses(train_loss, valid_loss):
+    """Plot per-epoch training and validation loss and save to 'train4_merge.png'.
+
+    Args:
+        train_loss: list of training losses, one per epoch.
+        valid_loss: list of validation losses, one per epoch.
+
+    A dashed vertical line marks the epoch with the lowest validation loss;
+    it is omitted when `valid_loss` is empty.
+    """
+    fig = plt.figure(figsize=(10,8))
+    plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss')
+    plt.plot(range(1,len(valid_loss)+1),valid_loss, label='Validation Loss')
+
+    # find position of lowest validation loss (min() would raise on an empty list)
+    if valid_loss:
+        minposs = valid_loss.index(min(valid_loss))+1
+        plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint')
+
+    plt.xlabel('epochs')
+    plt.ylabel('loss')
+    plt.ylim(0, 3.5) # consistent scale
+    plt.xlim(0, len(train_loss)+1) # consistent scale
+    plt.grid(True)
+    plt.legend()
+    plt.tight_layout()
+    #plt.show()
+    fig.savefig('train4_merge.png', bbox_inches='tight')
+    # pyplot keeps every figure alive until it is closed explicitly.
+    plt.close(fig)
+
+# Call main() only when this file is run as a script, not when it is imported.
+if __name__ == '__main__':
+    main()
--- a/Speaker_Recognition/train4_zeroth.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/train4_zeroth.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torchvision.transforms as transforms
+
+import time
+import os
+import numpy as np
+import configure1_zeroth as c
+import pandas as pd
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
+from model.model4 import background_resnet
+
+import matplotlib as mpl
+mpl.use('Agg')
+import matplotlib.pyplot as plt
+
+import pandas as pd
+def load_dataset(val_ratio):
+    """Build the training and validation datasets.
+
+    Args:
+        val_ratio: percentage of the data passed on to split_train_dev
+            for the validation split.
+
+    Returns:
+        (train_dataset, valid_dataset, n_classes), where n_classes is the
+        number of distinct speaker ids in the training split.
+    """
+    # Split training set into training set and validation set according to "val_ratio"
+    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)
+
+    # The label map is built from the training split only.
+    # NOTE(review): a speaker that lands only in valid_DB has no index here;
+    # confirm how DvectorDataset handles that.
+    speakers = sorted(set(train_DB['speaker_id'])) # len(speakers) == n_speakers
+    spk_to_idx = dict(zip(speakers, range(len(speakers))))
+
+    train_transform = transforms.Compose([
+        TruncatedInputfromMFB(), # numpy array:(1, n_frames, n_dims)
+        ToTensorInput() # torch tensor:(1, n_dims, n_frames)
+    ])
+    valid_transform = ToTensorDevInput()
+
+    # read_MFB loads one feature file -> numpy array:(n_frames, n_dims)
+    train_dataset = DvectorDataset(DB=train_DB, loader=read_MFB, transform=train_transform, spk_to_idx=spk_to_idx)
+    valid_dataset = DvectorDataset(DB=valid_DB, loader=read_MFB, transform=valid_transform, spk_to_idx=spk_to_idx)
+
+    return train_dataset, valid_dataset, len(speakers)
+
+def split_train_dev(train_feat_dir, valid_ratio):
+    """Randomly split the feature DB into a training and a validation part.
+
+    Args:
+        train_feat_dir: directory handed to read_feats_structure.
+        valid_ratio: percentage (0-100) of rows assigned to validation.
+
+    Returns:
+        (train_DB, valid_DB), each re-indexed from zero.
+    """
+    full_DB = read_feats_structure(train_feat_dir)
+    total_len = len(full_DB)
+    valid_len = int(total_len * valid_ratio/100.)
+    train_len = total_len - valid_len
+
+    # Shuffle all rows, then cut: head is training, tail is validation.
+    shuffled = full_DB.sample(frac=1).reset_index(drop=True)
+    train_DB = shuffled.iloc[:train_len].reset_index(drop=True)
+    valid_DB = shuffled.iloc[train_len:].reset_index(drop=True)
+
+    print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
+    print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
+    print('Total %d utts' %(total_len))
+
+    return train_DB, valid_DB
+
+def main():
+    """Train the speaker-classification network and checkpoint every epoch.
+
+    Hyperparameters are fixed below. After each epoch the validation loss
+    drives the ReduceLROnPlateau scheduler and a checkpoint is written to
+    `log_dir`; at the end the loss curves are plotted.
+    """
+    # Set hyperparameters
+    use_cuda = True # use gpu or cpu
+    val_ratio = 10 # Percentage of validation set
+    embedding_size = 128
+    start = 1 # Start epoch
+    n_epochs = 30 # How many epochs?
+    end = start + n_epochs # Last epoch
+
+    lr = 1e-1 # Initial learning rate
+    wd = 1e-4 # Weight decay (L2 penalty)
+    optimizer_type = 'sgd' # ex) sgd, adam, adagrad
+
+    batch_size = 64 # Batch size for training
+    valid_batch_size = 16 # Batch size for validation
+    use_shuffle = True # Shuffle for training or not
+
+    # Load dataset
+    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)
+
+    # print the experiment configuration
+    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))
+
+    log_dir = 'new_model4_zeroth' # where to save checkpoints
+
+    # exist_ok avoids the check-then-create race of exists() + makedirs()
+    os.makedirs(log_dir, exist_ok=True)
+
+    # instantiate model and initialize weights
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+
+    if use_cuda:
+        model.cuda()
+
+    # define loss function (criterion), optimizer and scheduler
+    criterion = nn.CrossEntropyLoss()
+    optimizer = create_optimizer(optimizer_type, model, lr, wd)
+    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)
+
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                                       batch_size=batch_size,
+                                                       shuffle=use_shuffle)
+    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
+                                                       batch_size=valid_batch_size,
+                                                       shuffle=False,
+                                                       collate_fn = collate_fn_feat_padded)
+
+    # to track the average training loss per epoch as the model trains
+    avg_train_losses = []
+    # to track the average validation loss per epoch as the model trains
+    avg_valid_losses = []
+
+
+    for epoch in range(start, end):
+
+        # train for one epoch
+        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)
+
+        # evaluate on validation set
+        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)
+
+        # Passing `epoch` to step() is deprecated; the metric alone is enough.
+        scheduler.step(valid_loss)
+
+        # calculate average loss over an epoch
+        avg_train_losses.append(train_loss)
+        avg_valid_losses.append(valid_loss)
+        # do checkpointing
+        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
+                    'optimizer': optimizer.state_dict()},
+                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))
+
+    # find the epoch of lowest validation loss; offset by `start` so the
+    # number matches the checkpoint file name even when start != 1
+    minposs = start + avg_valid_losses.index(min(avg_valid_losses))
+    print('Lowest validation loss at epoch %d' %minposs)
+
+    # visualize the loss and learning rate as the network trained
+    visualize_the_losses(avg_train_losses, avg_valid_losses)
+
+
+def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
+    """Run one training epoch and return the average training loss.
+
+    Args:
+        train_loader: iterable of (inputs, targets) batches; it must also
+            support len() and expose a `dataset` attribute for logging.
+        model: network whose forward pass returns a pair; the second
+            element is used as the class scores.
+        criterion: loss called as criterion(output, targets).
+        optimizer: optimizer stepped once per batch.
+        use_cuda: if True, each batch is moved to the GPU first.
+        epoch: epoch number, used only in the progress log.
+        n_classes: unused; kept so existing callers keep working.
+
+    Returns:
+        Sample-weighted mean loss over all batches of the epoch.
+    """
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    train_acc = AverageMeter()
+
+    log_interval = 84
+    # switch to train mode
+    model.train()
+
+    end = time.time()
+    for batch_idx, (inputs, targets) in enumerate(train_loader):
+        # target size:(batch size,1), input size:(batch size, 1, dim, win)
+        targets = targets.view(-1) # target size:(batch size)
+        current_sample = inputs.size(0)  # batch size
+
+        if use_cuda:
+            inputs = inputs.cuda()
+            targets = targets.cuda()
+        _, output = model(inputs) # out size:(batch size, #classes), for softmax
+
+        # Feed the meter the accuracy of THIS batch only. Feeding it the
+        # cumulative accuracy (as before) averaged running averages, so the
+        # logged value was not the true epoch accuracy.
+        predictions = torch.max(output, 1)[1].long().view(targets.size())
+        batch_correct = (predictions == targets).sum().item()
+        train_acc.update(100. * batch_correct / current_sample, current_sample)
+
+        loss = criterion(output, targets)
+        losses.update(loss.item(), current_sample)
+
+        # compute gradient and do SGD step
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        if batch_idx % log_interval == 0:
+            print(
+                    'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
+                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
+                    'Loss {loss.avg:.4f}\t'
+                    'Acc {train_acc.avg:.4f}'.format(
+                     epoch, batch_idx * len(inputs), len(train_loader.dataset),
+                     100. * batch_idx / len(train_loader),
+                     batch_time=batch_time, loss=losses, train_acc=train_acc))
+    return losses.avg
+
+def validate(val_loader, model, criterion, use_cuda, epoch):
+    """Evaluate `model` on `val_loader` and return the average loss.
+
+    Args:
+        val_loader: iterable yielding (inputs, targets) batches.
+        model: network whose forward pass returns a pair; the second
+            element is used as the class scores.
+        criterion: loss called as criterion(output, targets).
+        use_cuda: if True, each batch is moved to the GPU first.
+        epoch: unused; kept so existing callers keep working.
+
+    Returns:
+        Sample-weighted mean loss over all validation batches.
+    """
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    val_acc = AverageMeter()
+
+    # switch to evaluate mode
+    model.eval()
+
+    with torch.no_grad():
+        end = time.time()
+        for inputs, targets in val_loader:
+            current_sample = inputs.size(0)  # batch size
+
+            if use_cuda:
+                inputs = inputs.cuda()
+                targets = targets.cuda()
+
+            # compute output
+            _, output = model(inputs)
+
+            # Feed the meter the accuracy of THIS batch only. Feeding it the
+            # cumulative accuracy (as before) averaged running averages, so
+            # the reported value was not the true accuracy.
+            predictions = torch.max(output, 1)[1].long().view(targets.size())
+            batch_correct = (predictions == targets).sum().item()
+            val_acc.update(100. * batch_correct / current_sample, current_sample)
+
+            loss = criterion(output, targets)
+            losses.update(loss.item(), current_sample)
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+        print('  * Validation: '
+                  'Loss {loss.avg:.4f}\t'
+                  'Acc {val_acc.avg:.4f}'.format(
+                  loss=losses, val_acc=val_acc))
+
+    return losses.avg
+
+class AverageMeter(object):
+    """Keep the most recent value of a metric and its weighted running mean."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        """Clear every statistic back to zero."""
+        self.val = self.avg = self.sum = self.count = 0
+
+    def update(self, val, n=1):
+        """Record `val`, weighted as `n` observations, and refresh the mean."""
+        self.val = val
+        self.count += n
+        self.sum += val * n
+        self.avg = self.sum / self.count
+
+def create_optimizer(optimizer, model, new_lr, wd):
+    """Build the optimizer named by `optimizer` over `model`'s parameters.
+
+    Args:
+        optimizer: one of 'sgd', 'adam' or 'adagrad'.
+        model: module whose `parameters()` are optimized.
+        new_lr: learning rate.
+        wd: weight decay (L2 penalty).
+
+    Returns:
+        The constructed `torch.optim` optimizer.
+
+    Raises:
+        ValueError: if `optimizer` is not a supported name. Previously the
+            name string itself was returned and only failed later on.
+    """
+    params = model.parameters()
+    if optimizer == 'sgd':
+        return optim.SGD(params, lr=new_lr,
+                         momentum=0.9, dampening=0,
+                         weight_decay=wd)
+    if optimizer == 'adam':
+        return optim.Adam(params, lr=new_lr, weight_decay=wd)
+    if optimizer == 'adagrad':
+        return optim.Adagrad(params, lr=new_lr, weight_decay=wd)
+    raise ValueError("Unsupported optimizer: %r (expected 'sgd', 'adam' or 'adagrad')" % (optimizer,))
+
+def visualize_the_losses(train_loss, valid_loss):
+    """Plot per-epoch training and validation loss and save 'train4_zeroth.png'.
+
+    Args:
+        train_loss: list of training losses, one per epoch.
+        valid_loss: list of validation losses, one per epoch.
+
+    A dashed vertical line marks the epoch with the lowest validation loss.
+    """
+    fig = plt.figure(figsize=(10,8))
+    plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss')
+    plt.plot(range(1,len(valid_loss)+1),valid_loss, label='Validation Loss')
+
+    # find position of lowest validation loss; skipped when nothing was
+    # recorded, because min() of an empty list raises ValueError
+    if valid_loss:
+        minposs = valid_loss.index(min(valid_loss))+1
+        plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint')
+
+    plt.xlabel('epochs')
+    plt.ylabel('loss')
+    plt.ylim(0, 3.5) # consistent scale
+    plt.xlim(0, len(train_loss)+1) # consistent scale
+    plt.grid(True)
+    plt.legend()
+    plt.tight_layout()
+    fig.savefig('train4_zeroth.png', bbox_inches='tight')
+    # pyplot keeps a reference to every figure until it is closed explicitly
+    plt.close(fig)
+
+# Start training only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/Speaker_Recognition/train5.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/train5.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torchvision.transforms as transforms
+
+import time
+import os
+import numpy as np
+import configure as c
+import pandas as pd
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
+from model.model5 import background_resnet
+import matplotlib as mpl
+mpl.use('Agg')
+import matplotlib.pyplot as plt
+
+import pandas as pd
+def load_dataset(val_ratio):
+    """Build the training and validation datasets.
+
+    Args:
+        val_ratio: percentage of utterances held out for validation; passed
+            on to `split_train_dev`.
+
+    Returns:
+        `(train_dataset, valid_dataset, n_classes)`, where `n_classes` is the
+        number of distinct 'speaker_id' values in the training split.
+    """
+    # Carve the feature DB into training and validation parts
+    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)
+
+    # Training samples are truncated and converted to tensors;
+    # validation samples only go through the dev-input conversion.
+    train_transform = transforms.Compose([
+        TruncatedInputfromMFB(), # numpy array:(1, n_frames, n_dims)
+        ToTensorInput() # torch tensor:(1, n_dims, n_frames)
+    ])
+    valid_transform = ToTensorDevInput()
+
+    # Class indices follow the sorted speaker ids of the training split
+    speakers = sorted(set(train_DB['speaker_id']))
+    spk_to_idx = dict(zip(speakers, range(len(speakers))))
+
+    # read_MFB is the per-file loader; numpy array:(n_frames, n_dims)
+    train_dataset = DvectorDataset(DB=train_DB, loader=read_MFB,
+                                   transform=train_transform, spk_to_idx=spk_to_idx)
+    valid_dataset = DvectorDataset(DB=valid_DB, loader=read_MFB,
+                                   transform=valid_transform, spk_to_idx=spk_to_idx)
+
+    return train_dataset, valid_dataset, len(speakers)
+
+def split_train_dev(train_feat_dir, valid_ratio):
+    """Shuffle the feature DB and split it into training and validation parts.
+
+    Args:
+        train_feat_dir: directory handed to `read_feats_structure`.
+        valid_ratio: percentage of rows assigned to the validation part.
+
+    Returns:
+        `(train_DB, valid_DB)` DataFrames, each with a fresh 0-based index.
+    """
+    full_DB = read_feats_structure(train_feat_dir)
+    n_total = len(full_DB)
+    n_valid = int(n_total * valid_ratio/100.)
+    n_train = n_total - n_valid
+
+    # Shuffle every row, then cut: the head trains, the tail validates
+    shuffled = full_DB.sample(frac=1).reset_index(drop=True)
+    train_DB = shuffled.iloc[:n_train].reset_index(drop=True)
+    valid_DB = shuffled.iloc[n_train:].reset_index(drop=True)
+
+    train_pct = (n_train/n_total)*100
+    valid_pct = (n_valid/n_total)*100
+    print('\nTraining set %d utts (%0.1f%%)' %(n_train, train_pct))
+    print('Validation set %d utts (%0.1f%%)' %(n_valid, valid_pct))
+    print('Total %d utts' %(n_total))
+
+    return train_DB, valid_DB
+
+def main():
+    """Train `background_resnet` and write one checkpoint per epoch.
+
+    All hyperparameters are set inline. After every epoch the validation loss
+    drives a ReduceLROnPlateau scheduler and a checkpoint is saved under
+    `log_dir`; once training ends the loss curves are plotted.
+    """
+    # Set hyperparameters
+    use_cuda = True # use gpu or cpu
+    val_ratio = 10 # Percentage of validation set
+    embedding_size = 128
+    start = 1 # Start epoch
+    n_epochs = 30 # How many epochs?
+    end = start + n_epochs # Last epoch
+
+    lr = 1e-1 # Initial learning rate
+    wd = 1e-4 # Weight decay (L2 penalty)
+    optimizer_type = 'sgd' # ex) sgd, adam, adagrad
+
+    batch_size = 64 # Batch size for training
+    valid_batch_size = 16 # Batch size for validation
+    use_shuffle = True # Shuffle for training or not
+
+    # Load dataset
+    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)
+
+    # print the experiment configuration
+    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))
+
+    log_dir = 'new_model5' # where to save checkpoints
+
+    if not os.path.exists(log_dir):
+        os.makedirs(log_dir)
+
+    # instantiate model and initialize weights
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+
+    if use_cuda:
+        model.cuda()
+
+    # define loss function (criterion), optimizer and scheduler
+    criterion = nn.CrossEntropyLoss()
+    optimizer = create_optimizer(optimizer_type, model, lr, wd)
+    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)
+
+    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
+                                                       batch_size=batch_size,
+                                                       shuffle=use_shuffle)
+    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
+                                                       batch_size=valid_batch_size,
+                                                       shuffle=False,
+                                                       collate_fn = collate_fn_feat_padded)
+
+    # to track the average training loss per epoch as the model trains
+    avg_train_losses = []
+    # to track the average validation loss per epoch as the model trains
+    avg_valid_losses = []
+
+
+    for epoch in range(start, end):
+
+        # train for one epoch
+        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)
+
+        # evaluate on validation set
+        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)
+
+        # The epoch argument of step() is deprecated in current PyTorch;
+        # the scheduler counts its own steps.
+        scheduler.step(valid_loss)
+
+        # calculate average loss over an epoch
+        avg_train_losses.append(train_loss)
+        avg_valid_losses.append(valid_loss)
+        # do checkpointing
+        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
+                    'optimizer': optimizer.state_dict()},
+                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))
+
+    # find the epoch with the lowest validation loss; offsetting by `start`
+    # keeps the report right when training does not begin at epoch 1
+    best_epoch = start + avg_valid_losses.index(min(avg_valid_losses))
+    print('Lowest validation loss at epoch %d' %best_epoch)
+
+    # visualize the loss and learning rate as the network trained
+    visualize_the_losses(avg_train_losses, avg_valid_losses)
+
+
+def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
+    """Run one training epoch and return the sample-weighted mean loss.
+
+    Args:
+        train_loader: DataLoader yielding `(inputs, targets)` batches.
+        model: network whose forward pass returns a pair; the second item is
+            used as the class scores.
+        criterion: loss applied to `(scores, targets)`.
+        optimizer: optimizer stepped once per batch.
+        use_cuda: when true, every batch is moved to the GPU first.
+        epoch: epoch number, used only in the progress log.
+        n_classes: not used by this function.
+
+    Returns:
+        The average training loss over the epoch.
+    """
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+
+    n_correct, n_total = 0, 0
+    log_interval = 84
+    # switch to train mode
+    model.train()
+
+    end = time.time()
+    for batch_idx, (inputs, targets) in enumerate(train_loader):
+        # target size:(batch size,1), input size:(batch size, 1, dim, win)
+        targets = targets.view(-1) # target size:(batch size)
+        current_sample = inputs.size(0)  # batch size
+
+        if use_cuda:
+            inputs = inputs.cuda()
+            targets = targets.cuda()
+        _, output = model(inputs) # out size:(batch size, #classes), for softmax
+
+        # Running accuracy over every sample seen so far in this epoch.
+        # The previous code averaged these cumulative values again, which
+        # over-weights early batches.
+        predictions = torch.max(output, 1)[1].long().view(targets.size())
+        n_correct += (predictions == targets).sum().item()
+        n_total += current_sample
+        train_acc = 100. * n_correct / n_total
+
+        loss = criterion(output, targets)
+        losses.update(loss.item(), current_sample)
+
+        # compute gradient and do SGD step
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        if batch_idx % log_interval == 0:
+            print(
+                    'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
+                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
+                    'Loss {loss.avg:.4f}\t'
+                    'Acc {train_acc:.4f}'.format(
+                     epoch, batch_idx * len(inputs), len(train_loader.dataset),
+                     100. * batch_idx / len(train_loader),
+                     batch_time=batch_time, loss=losses, train_acc=train_acc))
+    return losses.avg
+
+def validate(val_loader, model, criterion, use_cuda, epoch):
+    """Evaluate `model` on `val_loader` and return the mean validation loss.
+
+    Args:
+        val_loader: iterable yielding `(inputs, targets)` batches.
+        model: network whose forward pass returns a pair; the second item is
+            used as the class scores.
+        criterion: loss applied to `(scores, targets)`.
+        use_cuda: when true, every batch is moved to the GPU first.
+        epoch: current epoch number; not used by this function.
+
+    Returns:
+        The sample-weighted average loss over all batches.
+    """
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+
+    n_correct, n_total = 0, 0
+
+    # switch to evaluate mode
+    model.eval()
+
+    with torch.no_grad():
+        end = time.time()
+        for inputs, targets in val_loader:
+            current_sample = inputs.size(0)  # batch size
+
+            if use_cuda:
+                inputs = inputs.cuda()
+                targets = targets.cuda()
+
+            # compute output
+            _, output = model(inputs)
+
+            # count correct top-1 predictions
+            predictions = torch.max(output, 1)[1].long().view(targets.size())
+            n_correct += (predictions == targets).sum().item()
+            n_total += current_sample
+
+            loss = criterion(output, targets)
+            losses.update(loss.item(), current_sample)
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+    # Accuracy over the whole validation set. Averaging the per-batch
+    # cumulative accuracies (the earlier behaviour) biased the figure
+    # towards the first batches.
+    val_acc = 100. * n_correct / n_total if n_total else 0.
+
+    print('  * Validation: '
+          'Loss {loss.avg:.4f}\t'
+          'Acc {val_acc:.4f}'.format(
+              loss=losses, val_acc=val_acc))
+
+    return losses.avg
+
+class AverageMeter(object):
+    """Hold the last value seen for a metric together with its running mean."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        """Set the last value, mean, weighted sum and count back to zero."""
+        self.val, self.avg = 0, 0
+        self.sum, self.count = 0, 0
+
+    def update(self, val, n=1):
+        """Add `val` with weight `n` and recompute the mean."""
+        self.val = val
+        self.sum, self.count = self.sum + val * n, self.count + n
+        self.avg = self.sum / self.count
+
+def create_optimizer(optimizer, model, new_lr, wd):
+    """Return the `torch.optim` optimizer selected by name.
+
+    Args:
+        optimizer: 'sgd', 'adam' or 'adagrad'.
+        model: module whose `parameters()` are optimized.
+        new_lr: learning rate.
+        wd: weight decay (L2 penalty).
+
+    Returns:
+        An `optim.SGD` (momentum 0.9, dampening 0), `optim.Adam` or
+        `optim.Adagrad` instance.
+
+    Raises:
+        ValueError: for any other name. The earlier version handed the
+            name string back unchanged, hiding the mistake from the caller.
+    """
+    if optimizer == 'sgd':
+        return optim.SGD(model.parameters(), lr=new_lr,
+                         momentum=0.9, dampening=0,
+                         weight_decay=wd)
+    elif optimizer == 'adam':
+        return optim.Adam(model.parameters(), lr=new_lr,
+                          weight_decay=wd)
+    elif optimizer == 'adagrad':
+        return optim.Adagrad(model.parameters(),
+                             lr=new_lr,
+                             weight_decay=wd)
+    raise ValueError("Unsupported optimizer: %r (expected 'sgd', 'adam' or 'adagrad')" % (optimizer,))
+
+def visualize_the_losses(train_loss, valid_loss):
+    """Plot per-epoch training and validation loss and save 'train5.png'.
+
+    Args:
+        train_loss: list of training losses, one per epoch.
+        valid_loss: list of validation losses, one per epoch.
+
+    A dashed vertical line marks the epoch with the lowest validation loss.
+    """
+    fig = plt.figure(figsize=(10,8))
+    plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss')
+    plt.plot(range(1,len(valid_loss)+1),valid_loss, label='Validation Loss')
+
+    # find position of lowest validation loss; skipped when nothing was
+    # recorded, because min() of an empty list raises ValueError
+    if valid_loss:
+        minposs = valid_loss.index(min(valid_loss))+1
+        plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint')
+
+    plt.xlabel('epochs')
+    plt.ylabel('loss')
+    plt.ylim(0, 3.5) # consistent scale
+    plt.xlim(0, len(train_loss)+1) # consistent scale
+    plt.grid(True)
+    plt.legend()
+    plt.tight_layout()
+    fig.savefig('train5.png', bbox_inches='tight')
+    # pyplot keeps a reference to every figure until it is closed explicitly
+    plt.close(fig)
+
+# Start training only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/Speaker_Recognition/verification3.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/verification3.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+import pandas as pd
+import math
+import os
+import configure as c
+
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, ToTensorTestInput
+from model.model3 import background_resnet
+
+def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
+    """Build a `background_resnet` and restore its weights from a checkpoint.
+
+    Args:
+        use_cuda: move the model to the GPU when true.
+        log_dir: directory holding the `checkpoint_<cp_num>.pth` files.
+        cp_num: number of the checkpoint to load.
+        embedding_size: forwarded to `background_resnet` as `embedding_size`.
+        n_classes: forwarded to `background_resnet` as `num_classes`.
+
+    Returns:
+        The model in eval mode with the checkpoint's 'state_dict' loaded.
+    """
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+    if use_cuda:
+        model.cuda()
+    print('=> loading checkpoint')
+    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
+    # Without map_location, tensors saved from a GPU are restored onto that
+    # GPU, which fails on a CPU-only run; remap them when CUDA is not used.
+    checkpoint = torch.load(checkpoint_path, map_location=None if use_cuda else 'cpu')
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()
+    return model
+
+def split_enroll_and_test(dataroot_dir):
+    """Split the feature DB under `dataroot_dir` into enroll and test rows.
+
+    Args:
+        dataroot_dir: directory handed to `read_feats_structure`.
+
+    Returns:
+        `(enroll_DB, test_DB)`: the rows whose 'filename' contains
+        'enroll.p' and 'test.p' respectively, each with a fresh index.
+    """
+    DB_all = read_feats_structure(dataroot_dir)
+    filenames = DB_all['filename']
+
+    # regex=False: the markers are literal substrings. Read as a regex, the
+    # dot would match any character (e.g. 'enrollXp').
+    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
+    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]
+
+    # Reset the index
+    enroll_DB = enroll_DB.reset_index(drop=True)
+    test_DB = test_DB.reset_index(drop=True)
+    return enroll_DB, test_DB
+
+def load_enroll_embeddings(embedding_dir):
+    """Load every '<speaker>.pth' file found in `embedding_dir`.
+
+    Args:
+        embedding_dir: directory containing one saved embedding per speaker.
+
+    Returns:
+        Dict mapping the file name without its '.pth' extension to the
+        object returned by `torch.load` for that file.
+    """
+    embeddings = {}
+    for f in os.listdir(embedding_dir):
+        # Strip only the extension (str.replace removed every '.pth'
+        # occurrence) and skip unrelated files instead of trying to load them.
+        spk, ext = os.path.splitext(f)
+        if ext != '.pth':
+            continue
+        embeddings[spk] = torch.load(os.path.join(embedding_dir, f))
+
+    return embeddings
+
+def get_embeddings(use_cuda, filename, model, test_frames):
+    """Return one L2-normalized embedding for the features stored in `filename`.
+
+    The feature matrix read by `read_MFB` is cut into consecutive chunks of
+    at most `test_frames` rows. Every chunk is run through `model`, the first
+    model output is summed over its leading axis, the per-chunk sums are
+    accumulated, and the total is passed through `l2_norm` with alpha=1.
+
+    Args:
+        use_cuda: move each chunk to the GPU before the forward pass.
+        filename: feature file handed to `read_MFB`.
+        model: network whose first output is the activation to accumulate.
+        test_frames: maximum number of frames per chunk.
+
+    Raises:
+        ValueError: if the file holds no frames (the accumulator would
+            otherwise stay the integer 0 and fail inside `l2_norm`).
+    """
+    features, _ = read_MFB(filename) # features size:(n_frames, n_dims)
+    if len(features) == 0:
+        raise ValueError('No feature frames found in %s' % filename)
+
+    tot_segments = math.ceil(len(features)/test_frames) # total number of segments with 'test_frames'
+    # Built once instead of once per segment.
+    # NOTE(review): assumes ToTensorTestInput keeps no per-call state - confirm.
+    to_tensor = ToTensorTestInput()
+    activation = 0
+    with torch.no_grad():
+        for i in range(tot_segments):
+            segment = features[i*test_frames:(i+1)*test_frames]
+            segment = to_tensor(segment) # size:(1, 1, n_dims, n_frames)
+
+            if use_cuda:
+                segment = segment.cuda()
+            segment_activation, _ = model(segment)
+            activation += torch.sum(segment_activation, dim=0, keepdim=True)
+
+    return l2_norm(activation, 1)
+
+def l2_norm(input, alpha):
+    """Scale each row of a 2-D tensor to unit L2 norm, then multiply by `alpha`.
+
+    A small constant (1e-10) is added to the squared norm before the square
+    root, so an all-zero row does not divide by zero.
+    """
+    squared_norm = input.pow(2).sum(1) + 1e-10  # one value per row
+    row_norm = squared_norm.sqrt().view(-1, 1)
+    normalized = input / row_norm.expand_as(input)
+    # Scaling by alpha follows https://arxiv.org/pdf/1703.09507.pdf
+    return normalized.view(input.size()) * alpha
+
+def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
+    """Score one test utterance against an enrolled speaker and print the result.
+
+    Args:
+        use_cuda: forwarded to `get_embeddings`.
+        model: network used to embed the test utterance.
+        embeddings: dict of enrolled embeddings keyed by speaker id.
+        enroll_speaker: key of the enrolled embedding to compare against.
+        test_filename: feature file of the test utterance; its parent
+            directory name (up to the first '_') is printed as the speaker.
+        test_frames: chunk length forwarded to `get_embeddings`.
+        thres: cosine-similarity threshold; 'Accept' is printed when the
+            score is strictly greater.
+
+    Raises:
+        KeyError: if `enroll_speaker` is not in `embeddings`.
+    """
+    enroll_embedding = embeddings[enroll_speaker]
+    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
+    # The stored embedding may sit on a different device than the fresh one.
+    enroll_embedding = enroll_embedding.to(test_embedding.device)
+
+    # .item() yields a plain float; using a one-element ndarray as a scalar
+    # (comparison, %f formatting) is deprecated in recent NumPy releases.
+    score = F.cosine_similarity(test_embedding, enroll_embedding).item()
+
+    result = 'Accept' if score > thres else 'Reject'
+
+    test_spk = test_filename.split('/')[-2].split('_')[0]
+    print("\n=== Speaker verification ===")
+    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
+    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
+
+def main():
+    """Run a single verification trial with the settings hard-coded below."""
+    # Locations
+    log_dir = 'new_model3' # Where the checkpoints are saved
+    embedding_dir = 'enroll_embeddings3' # Where embeddings are saved
+    test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
+
+    # Model / feature settings
+    use_cuda = True # Use cuda or not
+    embedding_size = 128 # Dimension of speaker embeddings
+    cp_num = 11  # Which checkpoint to use?
+    n_classes = 241 # How many speakers in training data?
+    test_frames = 100 # Split the test utterance
+
+    # Trial definition. Test speaker list:
+    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
+    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
+    enroll_speaker = '103F3021' # the true speaker
+    test_speaker = '207F2088' # the claimed speaker
+    thres = 0.95 # decision threshold
+    test_path = os.path.join(test_dir, test_speaker, 'test.p')
+
+    # Load model from checkpoint
+    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
+
+    # The enroll/test split is computed here but not used further below
+    split_enroll_and_test(c.TEST_FEAT_DIR)
+
+    # Load enroll embeddings
+    embeddings = load_enroll_embeddings(embedding_dir)
+
+    # Perform the test
+    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)
+
+# Run the verification trial only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/Speaker_Recognition/verification4.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/verification4.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+import pandas as pd
+import math
+import os
+import configure as c
+
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, ToTensorTestInput
+from model.model4 import background_resnet
+
+def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
+    """Build a `background_resnet` and restore its weights from a checkpoint.
+
+    Args:
+        use_cuda: move the model to the GPU when true.
+        log_dir: directory holding the `checkpoint_<cp_num>.pth` files.
+        cp_num: number of the checkpoint to load.
+        embedding_size: forwarded to `background_resnet` as `embedding_size`.
+        n_classes: forwarded to `background_resnet` as `num_classes`.
+
+    Returns:
+        The model in eval mode with the checkpoint's 'state_dict' loaded.
+    """
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+    if use_cuda:
+        model.cuda()
+    print('=> loading checkpoint')
+    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
+    # Without map_location, tensors saved from a GPU are restored onto that
+    # GPU, which fails on a CPU-only run; remap them when CUDA is not used.
+    checkpoint = torch.load(checkpoint_path, map_location=None if use_cuda else 'cpu')
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()
+    return model
+
+def split_enroll_and_test(dataroot_dir):
+    """Split the feature DB under `dataroot_dir` into enroll and test rows.
+
+    Args:
+        dataroot_dir: directory handed to `read_feats_structure`.
+
+    Returns:
+        `(enroll_DB, test_DB)`: the rows whose 'filename' contains
+        'enroll.p' and 'test.p' respectively, each with a fresh index.
+    """
+    DB_all = read_feats_structure(dataroot_dir)
+    filenames = DB_all['filename']
+
+    # regex=False: the markers are literal substrings. Read as a regex, the
+    # dot would match any character (e.g. 'enrollXp').
+    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
+    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]
+
+    # Reset the index
+    enroll_DB = enroll_DB.reset_index(drop=True)
+    test_DB = test_DB.reset_index(drop=True)
+    return enroll_DB, test_DB
+
+def load_enroll_embeddings(embedding_dir):
+    """Load every '<speaker>.pth' file found in `embedding_dir`.
+
+    Args:
+        embedding_dir: directory containing one saved embedding per speaker.
+
+    Returns:
+        Dict mapping the file name without its '.pth' extension to the
+        object returned by `torch.load` for that file.
+    """
+    embeddings = {}
+    for f in os.listdir(embedding_dir):
+        # Strip only the extension (str.replace removed every '.pth'
+        # occurrence) and skip unrelated files instead of trying to load them.
+        spk, ext = os.path.splitext(f)
+        if ext != '.pth':
+            continue
+        embeddings[spk] = torch.load(os.path.join(embedding_dir, f))
+
+    return embeddings
+
+def get_embeddings(use_cuda, filename, model, test_frames):
+    """Return one L2-normalized embedding for the features stored in `filename`.
+
+    The feature matrix read by `read_MFB` is cut into consecutive chunks of
+    at most `test_frames` rows. Every chunk is run through `model`, the first
+    model output is summed over its leading axis, the per-chunk sums are
+    accumulated, and the total is passed through `l2_norm` with alpha=1.
+
+    Args:
+        use_cuda: move each chunk to the GPU before the forward pass.
+        filename: feature file handed to `read_MFB`.
+        model: network whose first output is the activation to accumulate.
+        test_frames: maximum number of frames per chunk.
+
+    Raises:
+        ValueError: if the file holds no frames (the accumulator would
+            otherwise stay the integer 0 and fail inside `l2_norm`).
+    """
+    features, _ = read_MFB(filename) # features size:(n_frames, n_dims)
+    if len(features) == 0:
+        raise ValueError('No feature frames found in %s' % filename)
+
+    tot_segments = math.ceil(len(features)/test_frames) # total number of segments with 'test_frames'
+    # Built once instead of once per segment.
+    # NOTE(review): assumes ToTensorTestInput keeps no per-call state - confirm.
+    to_tensor = ToTensorTestInput()
+    activation = 0
+    with torch.no_grad():
+        for i in range(tot_segments):
+            segment = features[i*test_frames:(i+1)*test_frames]
+            segment = to_tensor(segment) # size:(1, 1, n_dims, n_frames)
+
+            if use_cuda:
+                segment = segment.cuda()
+            segment_activation, _ = model(segment)
+            activation += torch.sum(segment_activation, dim=0, keepdim=True)
+
+    return l2_norm(activation, 1)
+
+def l2_norm(input, alpha):
+    """Normalize every row of a 2-D tensor to unit L2 length and scale by `alpha`.
+
+    1e-10 is added to each squared row norm before taking the square root,
+    which keeps all-zero rows from dividing by zero.
+    """
+    original_shape = input.size()
+    squared_norm = input.pow(2).sum(1) + 1e-10  # one value per row
+    divisor = squared_norm.sqrt().view(-1, 1).expand_as(input)
+    unit_rows = (input / divisor).view(original_shape)
+    # Scaling by alpha follows https://arxiv.org/pdf/1703.09507.pdf
+    return unit_rows * alpha
+
+def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
+    """Score one test utterance against an enrolled speaker and print the result.
+
+    Args:
+        use_cuda: forwarded to `get_embeddings`.
+        model: network used to embed the test utterance.
+        embeddings: dict of enrolled embeddings keyed by speaker id.
+        enroll_speaker: key of the enrolled embedding to compare against.
+        test_filename: feature file of the test utterance; its parent
+            directory name (up to the first '_') is printed as the speaker.
+        test_frames: chunk length forwarded to `get_embeddings`.
+        thres: cosine-similarity threshold; 'Accept' is printed when the
+            score is strictly greater.
+
+    Raises:
+        KeyError: if `enroll_speaker` is not in `embeddings`.
+    """
+    enroll_embedding = embeddings[enroll_speaker]
+    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
+    # The stored embedding may sit on a different device than the fresh one.
+    enroll_embedding = enroll_embedding.to(test_embedding.device)
+
+    # .item() yields a plain float; using a one-element ndarray as a scalar
+    # (comparison, %f formatting) is deprecated in recent NumPy releases.
+    score = F.cosine_similarity(test_embedding, enroll_embedding).item()
+
+    result = 'Accept' if score > thres else 'Reject'
+
+    test_spk = test_filename.split('/')[-2].split('_')[0]
+    print("\n=== Speaker verification ===")
+    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
+    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
+
+def main():
+    """Run a single verification trial with the settings hard-coded below."""
+    # Locations
+    log_dir = 'new_model4' # Where the checkpoints are saved
+    embedding_dir = 'enroll_embeddings4' # Where embeddings are saved
+    test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
+
+    # Model / feature settings
+    use_cuda = True # Use cuda or not
+    embedding_size = 128 # Dimension of speaker embeddings
+    cp_num = 25   # Which checkpoint to use?
+    n_classes = 241 # How many speakers in training data?
+    test_frames = 100 # Split the test utterance
+
+    # Trial definition. Test speaker list:
+    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
+    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
+    enroll_speaker = '229M2031' # the true speaker
+    test_speaker = 'sunghwan1' # the claimed speaker
+    thres = 0.95 # decision threshold
+    test_path = os.path.join(test_dir, test_speaker, 'test.p')
+
+    # Load model from checkpoint
+    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
+
+    # The enroll/test split is computed here but not used further below
+    split_enroll_and_test(c.TEST_FEAT_DIR)
+
+    # Load enroll embeddings
+    embeddings = load_enroll_embeddings(embedding_dir)
+
+    # Perform the test
+    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)
+
+# Run the verification trial only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/Speaker_Recognition/verification4_merge.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/verification4_merge.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+import pandas as pd
+import math
+import os
+import configure as c
+
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, ToTensorTestInput
+from model.model4 import background_resnet
+
+def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
+    """Build a `background_resnet` and restore its weights from a checkpoint.
+
+    Args:
+        use_cuda: move the model to the GPU when true.
+        log_dir: directory holding the `checkpoint_<cp_num>.pth` files.
+        cp_num: number of the checkpoint to load.
+        embedding_size: forwarded to `background_resnet` as `embedding_size`.
+        n_classes: forwarded to `background_resnet` as `num_classes`.
+
+    Returns:
+        The model in eval mode with the checkpoint's 'state_dict' loaded.
+    """
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+    if use_cuda:
+        model.cuda()
+    print('=> loading checkpoint')
+    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
+    # Without map_location, tensors saved from a GPU are restored onto that
+    # GPU, which fails on a CPU-only run; remap them when CUDA is not used.
+    checkpoint = torch.load(checkpoint_path, map_location=None if use_cuda else 'cpu')
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()
+    return model
+
+def split_enroll_and_test(dataroot_dir):
+    """Split the feature DB under `dataroot_dir` into enroll and test rows.
+
+    Args:
+        dataroot_dir: directory handed to `read_feats_structure`.
+
+    Returns:
+        `(enroll_DB, test_DB)`: the rows whose 'filename' contains
+        'enroll.p' and 'test.p' respectively, each with a fresh index.
+    """
+    DB_all = read_feats_structure(dataroot_dir)
+    filenames = DB_all['filename']
+
+    # regex=False: the markers are literal substrings. Read as a regex, the
+    # dot would match any character (e.g. 'enrollXp').
+    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
+    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]
+
+    # Reset the index
+    enroll_DB = enroll_DB.reset_index(drop=True)
+    test_DB = test_DB.reset_index(drop=True)
+    return enroll_DB, test_DB
+
+def load_enroll_embeddings(embedding_dir):
+    """Load every '<speaker>.pth' file found in `embedding_dir`.
+
+    Args:
+        embedding_dir: directory containing one saved embedding per speaker.
+
+    Returns:
+        Dict mapping the file name without its '.pth' extension to the
+        object returned by `torch.load` for that file.
+    """
+    embeddings = {}
+    for f in os.listdir(embedding_dir):
+        # Strip only the extension (str.replace removed every '.pth'
+        # occurrence) and skip unrelated files instead of trying to load them.
+        spk, ext = os.path.splitext(f)
+        if ext != '.pth':
+            continue
+        embeddings[spk] = torch.load(os.path.join(embedding_dir, f))
+
+    return embeddings
+
+def get_embeddings(use_cuda, filename, model, test_frames):
+    """Return one L2-normalized embedding for the features stored in `filename`.
+
+    The feature matrix read by `read_MFB` is cut into consecutive chunks of
+    at most `test_frames` rows. Every chunk is run through `model`, the first
+    model output is summed over its leading axis, the per-chunk sums are
+    accumulated, and the total is passed through `l2_norm` with alpha=1.
+
+    Args:
+        use_cuda: move each chunk to the GPU before the forward pass.
+        filename: feature file handed to `read_MFB`.
+        model: network whose first output is the activation to accumulate.
+        test_frames: maximum number of frames per chunk.
+
+    Raises:
+        ValueError: if the file holds no frames (the accumulator would
+            otherwise stay the integer 0 and fail inside `l2_norm`).
+    """
+    features, _ = read_MFB(filename) # features size:(n_frames, n_dims)
+    if len(features) == 0:
+        raise ValueError('No feature frames found in %s' % filename)
+
+    tot_segments = math.ceil(len(features)/test_frames) # total number of segments with 'test_frames'
+    # Built once instead of once per segment.
+    # NOTE(review): assumes ToTensorTestInput keeps no per-call state - confirm.
+    to_tensor = ToTensorTestInput()
+    activation = 0
+    with torch.no_grad():
+        for i in range(tot_segments):
+            segment = features[i*test_frames:(i+1)*test_frames]
+            segment = to_tensor(segment) # size:(1, 1, n_dims, n_frames)
+
+            if use_cuda:
+                segment = segment.cuda()
+            segment_activation, _ = model(segment)
+            activation += torch.sum(segment_activation, dim=0, keepdim=True)
+
+    return l2_norm(activation, 1)
+
+def l2_norm(input, alpha):
+    """Scale each row of a 2-D tensor to unit L2 norm, then multiply by `alpha`.
+
+    A small constant (1e-10) is added to the squared norm before the square
+    root, so an all-zero row does not divide by zero.
+    """
+    squared_norm = input.pow(2).sum(1) + 1e-10  # one value per row
+    row_norm = squared_norm.sqrt().view(-1, 1)
+    normalized = input / row_norm.expand_as(input)
+    # Scaling by alpha follows https://arxiv.org/pdf/1703.09507.pdf
+    return normalized.view(input.size()) * alpha
+
+def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
+    """Score one test utterance against an enrolled speaker and print the result.
+
+    Args:
+        use_cuda: forwarded to `get_embeddings`.
+        model: network used to embed the test utterance.
+        embeddings: dict of enrolled embeddings keyed by speaker id.
+        enroll_speaker: key of the enrolled embedding to compare against.
+        test_filename: feature file of the test utterance; its parent
+            directory name (up to the first '_') is printed as the speaker.
+        test_frames: chunk length forwarded to `get_embeddings`.
+        thres: cosine-similarity threshold; 'Accept' is printed when the
+            score is strictly greater.
+
+    Raises:
+        KeyError: if `enroll_speaker` is not in `embeddings`.
+    """
+    enroll_embedding = embeddings[enroll_speaker]
+    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
+    # The stored embedding may sit on a different device than the fresh one.
+    enroll_embedding = enroll_embedding.to(test_embedding.device)
+
+    # .item() yields a plain float; using a one-element ndarray as a scalar
+    # (comparison, %f formatting) is deprecated in recent NumPy releases.
+    score = F.cosine_similarity(test_embedding, enroll_embedding).item()
+
+    result = 'Accept' if score > thres else 'Reject'
+
+    test_spk = test_filename.split('/')[-2].split('_')[0]
+    print("\n=== Speaker verification ===")
+    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
+    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
+
+def main():
+    """Run a single verification trial with the settings hard-coded below."""
+    # Locations
+    log_dir = 'new_model4_merge' # Where the checkpoints are saved
+    embedding_dir = 'enroll_embeddings4_merge' # Where embeddings are saved
+    test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
+
+    # Model / feature settings
+    use_cuda = True # Use cuda or not
+    embedding_size = 128 # Dimension of speaker embeddings
+    cp_num = 50   # Which checkpoint to use?
+    n_classes = 348 # How many speakers in training data?
+    test_frames = 100 # Split the test utterance
+
+    # Trial definition. Test speaker list:
+    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
+    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
+    enroll_speaker = '213F5100' # the true speaker
+    test_speaker = '207F2088' # the claimed speaker
+    thres = 0.95 # decision threshold
+    test_path = os.path.join(test_dir, test_speaker, 'test.p')
+
+    # Load model from checkpoint
+    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
+
+    # The enroll/test split is computed here but not used further below
+    split_enroll_and_test(c.TEST_FEAT_DIR)
+
+    # Load enroll embeddings
+    embeddings = load_enroll_embeddings(embedding_dir)
+
+    # Perform the test
+    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)
+
+# Run the verification trial only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/Speaker_Recognition/verification4_zeroth.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/verification4_zeroth.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+import pandas as pd
+import math
+import os
+import configure as c
+
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, ToTensorTestInput
+from model.model4 import background_resnet
+
+def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
+    """Build a `background_resnet` and restore its weights from a checkpoint.
+
+    Args:
+        use_cuda: move the model to the GPU when true.
+        log_dir: directory holding the `checkpoint_<cp_num>.pth` files.
+        cp_num: number of the checkpoint to load.
+        embedding_size: forwarded to `background_resnet` as `embedding_size`.
+        n_classes: forwarded to `background_resnet` as `num_classes`.
+
+    Returns:
+        The model in eval mode with the checkpoint's 'state_dict' loaded.
+    """
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+    if use_cuda:
+        model.cuda()
+    print('=> loading checkpoint')
+    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
+    # Without map_location, tensors saved from a GPU are restored onto that
+    # GPU, which fails on a CPU-only run; remap them when CUDA is not used.
+    checkpoint = torch.load(checkpoint_path, map_location=None if use_cuda else 'cpu')
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()
+    return model
+
+def split_enroll_and_test(dataroot_dir):
+    """Split the feature DB under `dataroot_dir` into enroll and test rows.
+
+    Args:
+        dataroot_dir: directory handed to `read_feats_structure`.
+
+    Returns:
+        `(enroll_DB, test_DB)`: the rows whose 'filename' contains
+        'enroll.p' and 'test.p' respectively, each with a fresh index.
+    """
+    DB_all = read_feats_structure(dataroot_dir)
+    filenames = DB_all['filename']
+
+    # regex=False: the markers are literal substrings. Read as a regex, the
+    # dot would match any character (e.g. 'enrollXp').
+    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
+    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]
+
+    # Reset the index
+    enroll_DB = enroll_DB.reset_index(drop=True)
+    test_DB = test_DB.reset_index(drop=True)
+    return enroll_DB, test_DB
+
+def load_enroll_embeddings(embedding_dir):
+    """Load every '<speaker>.pth' file found in `embedding_dir`.
+
+    Args:
+        embedding_dir: directory containing one saved embedding per speaker.
+
+    Returns:
+        Dict mapping the file name without its '.pth' extension to the
+        object returned by `torch.load` for that file.
+    """
+    embeddings = {}
+    for f in os.listdir(embedding_dir):
+        # Strip only the extension (str.replace removed every '.pth'
+        # occurrence) and skip unrelated files instead of trying to load them.
+        spk, ext = os.path.splitext(f)
+        if ext != '.pth':
+            continue
+        embeddings[spk] = torch.load(os.path.join(embedding_dir, f))
+
+    return embeddings
+
+def get_embeddings(use_cuda, filename, model, test_frames):
+    """Return one L2-normalized embedding for the features stored in `filename`.
+
+    The feature matrix read by `read_MFB` is cut into consecutive chunks of
+    at most `test_frames` rows. Every chunk is run through `model`, the first
+    model output is summed over its leading axis, the per-chunk sums are
+    accumulated, and the total is passed through `l2_norm` with alpha=1.
+
+    Args:
+        use_cuda: move each chunk to the GPU before the forward pass.
+        filename: feature file handed to `read_MFB`.
+        model: network whose first output is the activation to accumulate.
+        test_frames: maximum number of frames per chunk.
+
+    Raises:
+        ValueError: if the file holds no frames (the accumulator would
+            otherwise stay the integer 0 and fail inside `l2_norm`).
+    """
+    features, _ = read_MFB(filename) # features size:(n_frames, n_dims)
+    if len(features) == 0:
+        raise ValueError('No feature frames found in %s' % filename)
+
+    tot_segments = math.ceil(len(features)/test_frames) # total number of segments with 'test_frames'
+    # Built once instead of once per segment.
+    # NOTE(review): assumes ToTensorTestInput keeps no per-call state - confirm.
+    to_tensor = ToTensorTestInput()
+    activation = 0
+    with torch.no_grad():
+        for i in range(tot_segments):
+            segment = features[i*test_frames:(i+1)*test_frames]
+            segment = to_tensor(segment) # size:(1, 1, n_dims, n_frames)
+
+            if use_cuda:
+                segment = segment.cuda()
+            segment_activation, _ = model(segment)
+            activation += torch.sum(segment_activation, dim=0, keepdim=True)
+
+    return l2_norm(activation, 1)
+
+def l2_norm(input, alpha):
+    """Scale each row of `input` to unit L2 length, then multiply by `alpha`.
+
+    Args:
+        input: Tensor whose dimension 1 is reduced to obtain the norms.
+        alpha: Factor applied to the normalized rows.
+
+    Returns:
+        Tensor with the same size as `input`.
+    """
+    # 1e-10 keeps the square root away from zero for all-zero rows.
+    squared_sum = input.pow(2).sum(1) + 1e-10
+    row_norm = squared_sum.sqrt().view(-1, 1)
+    unit_rows = (input / row_norm.expand_as(input)).view(input.size())
+    return unit_rows * alpha
+
+def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
+    """Score one test utterance against an enrolled speaker and print the decision.
+
+    Args:
+        use_cuda: Forwarded to `get_embeddings`.
+        model: Embedding model forwarded to `get_embeddings`.
+        embeddings: Dict mapping speaker id to its enrolled embedding.
+        enroll_speaker: Key into `embeddings` to compare against.
+        test_filename: Feature file of the test utterance; the name of its
+            parent directory (up to the first '_') is printed as the speaker.
+        test_frames: Forwarded to `get_embeddings`.
+        thres: The trial is accepted when the cosine score is above this value.
+
+    Raises:
+        KeyError: If `enroll_speaker` has no entry in `embeddings`.
+    """
+    enroll_embedding = embeddings[enroll_speaker]
+    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
+
+    # The enrolled embedding comes from disk and may sit on a different device
+    # than the freshly computed one; cosine_similarity needs them together.
+    enroll_embedding = enroll_embedding.to(test_embedding.device)
+
+    # .item() yields a plain Python float. Comparing and %-formatting a
+    # one-element ndarray relies on an implicit array-to-scalar conversion
+    # that NumPy has deprecated.
+    score = F.cosine_similarity(test_embedding, enroll_embedding).item()
+
+    if score > thres:
+        result = 'Accept'
+    else:
+        result = 'Reject'
+
+    # Parent directory name via os.path, so the lookup does not depend on '/'
+    # being the path separator.
+    test_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
+    print("\n=== Speaker verification ===")
+    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
+    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
+
+def main():
+    """Run a single verification trial with the hard-coded settings below."""
+    # Available test speakers:
+    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
+    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
+
+    # --- Locations ---
+    checkpoint_dir = 'new_model4_zeroth'  # directory with the saved checkpoints
+    enroll_dir = 'enroll_embeddings4_zeroth'  # directory with the enrolled embeddings
+    test_feat_dir = 'feat_logfbank_nfilt40/test/'  # directory with the test features
+
+    # --- Model / evaluation settings ---
+    on_gpu = True  # run on CUDA
+    emb_dim = 128  # dimension of the speaker embeddings
+    checkpoint_no = 30  # checkpoint number to restore
+    num_train_speakers = 105  # number of speakers in the training data
+    frames_per_chunk = 100  # chunk length used to split the test utterance
+
+    # --- Trial definition ---
+    true_speaker = '777M7777'  # enrolled ("true") speaker
+    claimed_speaker = '103F3021'  # speaker whose test utterance is scored
+    threshold = 0.95  # accept when the score is above this value
+
+    net = load_model(on_gpu, checkpoint_dir, checkpoint_no, emb_dim, num_train_speakers)
+
+    # The two frames are not used further in this function.
+    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
+
+    enrolled = load_enroll_embeddings(enroll_dir)
+
+    utterance_path = os.path.join(test_feat_dir, claimed_speaker, 'test.p')
+    perform_verification(
+        on_gpu,
+        net,
+        enrolled,
+        true_speaker,
+        utterance_path,
+        frames_per_chunk,
+        threshold,
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/Speaker_Recognition/verification5.py 0 → 100644
View file @cdaaea5
+++ b/Speaker_Recognition/verification5.py 0 → 100644
View file @cdaaea5
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+
+import pandas as pd
+import math
+import os
+import configure as c
+
+from DB_wav_reader import read_feats_structure
+from SR_Dataset import read_MFB, ToTensorTestInput
+from model.model5 import background_resnet
+
+def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
+    """Build a `background_resnet` and restore its weights from a checkpoint.
+
+    Args:
+        use_cuda: If true, the model is moved to the GPU.
+        log_dir: Directory that contains 'checkpoint_<cp_num>.pth'.
+        cp_num: Number of the checkpoint to load.
+        embedding_size: Passed to `background_resnet` as `embedding_size`.
+        n_classes: Passed to `background_resnet` as `num_classes`.
+
+    Returns:
+        The model with restored weights, switched to evaluation mode.
+    """
+    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
+    if use_cuda:
+        model.cuda()
+    print('=> loading checkpoint')
+    checkpoint_path = log_dir + '/checkpoint_' + str(cp_num) + '.pth'
+    # For a CPU run, remap the stored tensors to the CPU so a checkpoint that
+    # was saved from a GPU can still be restored. With CUDA the tensors stay
+    # on the device they were saved from, as before.
+    map_location = None if use_cuda else 'cpu'
+    checkpoint = torch.load(checkpoint_path, map_location=map_location)
+    # The state dict is loaded with its keys as stored; no 'module.' prefix
+    # (as written by DataParallel) is stripped here.
+    model.load_state_dict(checkpoint['state_dict'])
+    model.eval()
+    return model
+
+def split_enroll_and_test(dataroot_dir):
+    """Split the feature index under `dataroot_dir` into enroll and test rows.
+
+    Args:
+        dataroot_dir: Directory handed to `read_feats_structure`.
+
+    Returns:
+        Tuple `(enroll_DB, test_DB)` of DataFrames whose indices are reset to
+        start at 0. A row goes to `enroll_DB` when its 'filename' contains the
+        literal text 'enroll.p' and to `test_DB` when it contains 'test.p'.
+    """
+    DB_all = read_feats_structure(dataroot_dir)
+    filenames = DB_all['filename']
+
+    # regex=False matches the names literally. With the pandas default
+    # (regex=True) the '.' is a wildcard, so names such as 'enroll_p' or
+    # 'test-p' would be selected as well.
+    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
+    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]
+
+    # Drop the original row labels so both frames are indexed from 0.
+    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
+
+def load_enroll_embeddings(embedding_dir):
+    """Load every saved speaker embedding found in `embedding_dir`.
+
+    Args:
+        embedding_dir: Directory holding one '<speaker>.pth' file per speaker.
+
+    Returns:
+        Dict mapping the speaker id (file name without its '.pth' extension)
+        to the object `torch.load` returns for that file.
+    """
+    embeddings = {}
+    for f in os.listdir(embedding_dir):
+        # splitext removes only the trailing extension; str.replace would also
+        # delete a '.pth' occurring in the middle of a name.
+        spk, ext = os.path.splitext(f)
+        if ext != '.pth':
+            # Stray entries (notes, hidden files, sub-directories) are not
+            # embeddings and would make torch.load fail.
+            continue
+        embeddings[spk] = torch.load(os.path.join(embedding_dir, f))
+
+    return embeddings
+
+def get_embeddings(use_cuda, filename, model, test_frames):
+    """Compute one L2-normalized embedding for the utterance in `filename`.
+
+    The features are cut into consecutive chunks of at most `test_frames`
+    frames. Each chunk is passed through `model`, the chunk activations are
+    summed, and the sum is normalized with `l2_norm(..., 1)`.
+
+    Args:
+        use_cuda: If true, every chunk is moved to the GPU before the forward pass.
+        filename: Feature file read with `read_MFB`.
+        model: Callable returning a pair whose first item is the activation.
+        test_frames: Maximum number of frames per chunk.
+
+    Returns:
+        The summed activation after L2 normalization.
+
+    Raises:
+        ValueError: If the feature file contains no frames.
+    """
+    feats, _ = read_MFB(filename)  # original note: size (n_frames, n_dims)
+
+    tot_segments = math.ceil(len(feats) / test_frames)  # number of chunks
+    if tot_segments == 0:
+        # Without this guard the accumulator would stay the int 0 and l2_norm
+        # would fail with an unrelated AttributeError.
+        raise ValueError('No frames found in %s' % filename)
+
+    to_tensor = ToTensorTestInput()  # built once and reused for every chunk
+    activation = None
+    with torch.no_grad():
+        for i in range(tot_segments):
+            start = i * test_frames
+            # The final chunk may hold fewer than `test_frames` frames.
+            temp_input = to_tensor(feats[start:start + test_frames])  # original note: size (1, 1, n_dims, n_frames)
+
+            if use_cuda:
+                temp_input = temp_input.cuda()
+            temp_activation, _ = model(temp_input)
+            chunk_sum = torch.sum(temp_activation, dim=0, keepdim=True)
+            activation = chunk_sum if activation is None else activation + chunk_sum
+
+    return l2_norm(activation, 1)
+
+def l2_norm(input, alpha):
+    """Scale each row of `input` to unit L2 length, then multiply by `alpha`.
+
+    Args:
+        input: Tensor whose dimension 1 is reduced to obtain the norms.
+        alpha: Factor applied to the normalized rows.
+
+    Returns:
+        Tensor with the same size as `input`.
+    """
+    # 1e-10 keeps the square root away from zero for all-zero rows.
+    squared_sum = input.pow(2).sum(1) + 1e-10
+    row_norm = squared_sum.sqrt().view(-1, 1)
+    unit_rows = (input / row_norm.expand_as(input)).view(input.size())
+    return unit_rows * alpha
+
+def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
+    """Score one test utterance against an enrolled speaker and print the decision.
+
+    Args:
+        use_cuda: Forwarded to `get_embeddings`.
+        model: Embedding model forwarded to `get_embeddings`.
+        embeddings: Dict mapping speaker id to its enrolled embedding.
+        enroll_speaker: Key into `embeddings` to compare against.
+        test_filename: Feature file of the test utterance; the name of its
+            parent directory (up to the first '_') is printed as the speaker.
+        test_frames: Forwarded to `get_embeddings`.
+        thres: The trial is accepted when the cosine score is above this value.
+
+    Raises:
+        KeyError: If `enroll_speaker` has no entry in `embeddings`.
+    """
+    enroll_embedding = embeddings[enroll_speaker]
+    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
+
+    # The enrolled embedding comes from disk and may sit on a different device
+    # than the freshly computed one; cosine_similarity needs them together.
+    enroll_embedding = enroll_embedding.to(test_embedding.device)
+
+    # .item() yields a plain Python float. Comparing and %-formatting a
+    # one-element ndarray relies on an implicit array-to-scalar conversion
+    # that NumPy has deprecated.
+    score = F.cosine_similarity(test_embedding, enroll_embedding).item()
+
+    if score > thres:
+        result = 'Accept'
+    else:
+        result = 'Reject'
+
+    # Parent directory name via os.path, so the lookup does not depend on '/'
+    # being the path separator.
+    test_spk = os.path.basename(os.path.dirname(test_filename)).split('_')[0]
+    print("\n=== Speaker verification ===")
+    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
+    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
+
+def main():
+    """Run a single verification trial with the hard-coded settings below."""
+    # Available test speakers:
+    #   '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
+    #   '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
+
+    # --- Locations ---
+    checkpoint_dir = 'new_model5'  # directory with the saved checkpoints
+    enroll_dir = 'enroll_embeddings5'  # directory with the enrolled embeddings
+    test_feat_dir = 'feat_logfbank_nfilt40/test/'  # directory with the test features
+
+    # --- Model / evaluation settings ---
+    on_gpu = True  # run on CUDA
+    emb_dim = 128  # dimension of the speaker embeddings
+    checkpoint_no = 30  # checkpoint number to restore
+    num_train_speakers = 241  # number of speakers in the training data
+    frames_per_chunk = 100  # chunk length used to split the test utterance
+
+    # --- Trial definition ---
+    true_speaker = '777M7777'  # enrolled ("true") speaker
+    claimed_speaker = 'sunghwan1'  # speaker whose test utterance is scored
+    threshold = 0.95  # accept when the score is above this value
+
+    net = load_model(on_gpu, checkpoint_dir, checkpoint_no, emb_dim, num_train_speakers)
+
+    # The two frames are not used further in this function.
+    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
+
+    enrolled = load_enroll_embeddings(enroll_dir)
+
+    utterance_path = os.path.join(test_feat_dir, claimed_speaker, 'test.p')
+    perform_verification(
+        on_gpu,
+        net,
+        enrolled,
+        true_speaker,
+        utterance_path,
+        frames_per_chunk,
+        threshold,
+    )
+
+
+if __name__ == "__main__":
+    main()