최성환
1 +<?xml version="1.0" encoding="UTF-8"?>
2 +<module type="JAVA_MODULE" version="4">
3 + <component name="NewModuleRootManager" inherit-compiler-output="true">
4 + <exclude-output />
5 + <content url="file://$MODULE_DIR$" />
6 + <orderEntry type="inheritedJdk" />
7 + <orderEntry type="sourceFolder" forTests="false" />
8 + </component>
9 +</module>
\ No newline at end of file
1 +<?xml version="1.0" encoding="UTF-8"?>
2 +<project version="4">
3 + <component name="ProjectRootManager" version="2" languageLevel="JDK_14" project-jdk-name="14" project-jdk-type="JavaSDK">
4 + <output url="file://$PROJECT_DIR$/out" />
5 + </component>
6 +</project>
\ No newline at end of file
1 +<?xml version="1.0" encoding="UTF-8"?>
2 +<project version="4">
3 + <component name="ProjectModuleManager">
4 + <modules>
5 + <module fileurl="file://$PROJECT_DIR$/.idea/LYG_project.iml" filepath="$PROJECT_DIR$/.idea/LYG_project.iml" />
6 + </modules>
7 + </component>
8 +</project>
\ No newline at end of file
1 +<?xml version="1.0" encoding="UTF-8"?>
2 +<project version="4">
3 + <component name="VcsDirectoryMappings">
4 + <mapping directory="" vcs="Git" />
5 + </component>
6 +</project>
\ No newline at end of file
1 +<?xml version="1.0" encoding="UTF-8"?>
2 +<project version="4">
3 + <component name="ChangeListManager">
4 + <list default="true" id="2b896c0d-f1d9-4424-819a-bc96fe07a387" name="Default Changelist" comment="" />
5 + <option name="SHOW_DIALOG" value="false" />
6 + <option name="HIGHLIGHT_CONFLICTS" value="true" />
7 + <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
8 + <option name="LAST_RESOLUTION" value="IGNORE" />
9 + </component>
10 + <component name="Git.Settings">
11 + <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
12 + </component>
13 + <component name="MavenImportPreferences">
14 + <option name="generalSettings">
15 + <MavenGeneralSettings>
16 + <option name="mavenHome" value="C:\Program Files\JetBrains\IntelliJ IDEA Community Edition 2020.2\plugins\maven\lib\maven3" />
17 + </MavenGeneralSettings>
18 + </option>
19 + </component>
20 + <component name="ProjectId" id="1kPBM4RUtUJvLFOYYSRQMuBxazR" />
21 + <component name="ProjectViewState">
22 + <option name="hideEmptyMiddlePackages" value="true" />
23 + <option name="showLibraryContents" value="true" />
24 + </component>
25 + <component name="PropertiesComponent">
26 + <property name="RunOnceActivity.OpenProjectViewOnStart" value="true" />
27 + <property name="last_opened_file_path" value="$PROJECT_DIR$/../../../../spring/Myprj" />
28 + </component>
29 + <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
30 + <component name="TaskManager">
31 + <task active="true" id="Default" summary="Default task">
32 + <changelist id="2b896c0d-f1d9-4424-819a-bc96fe07a387" name="Default Changelist" comment="" />
33 + <created>1605592296286</created>
34 + <option name="number" value="Default" />
35 + <option name="presentableId" value="Default" />
36 + <updated>1605592296286</updated>
37 + </task>
38 + <servers />
39 + </component>
40 + <component name="WindowStateProjectService">
41 + <state x="740" y="275" key="FileChooserDialogImpl" timestamp="1605592333707">
42 + <screen x="0" y="0" width="1920" height="1040" />
43 + </state>
44 + <state x="740" y="275" key="FileChooserDialogImpl/0.0.1920.1040/-1920.65.1920.1040@0.0.1920.1040" timestamp="1605592333707" />
45 + </component>
46 +</project>
\ No newline at end of file
@@ -5,6 +5,8 @@ TEST_WAV_DIR = 'test_wavs'
5
6  # Feature path
7  TRAIN_FEAT_DIR = 'feat_logfbank_nfilt40/train'
8 +# TRAIN_FEAT_DIR = '/test/merge_dataset'
9 +# TRAIN_FEAT_DIR = '/test/trainFeature'
10 TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
11
12 # Context window size
...
1 +# Wave path
2 +TRAIN_WAV_DIR = '/home/admin/Desktop/read_25h_2/train'
3 +DEV_WAV_DIR = '/home/admin/Desktop/read_25h_2/dev'
4 +TEST_WAV_DIR = 'test_wavs'
5 +
6 +# Feature path
7 +# TRAIN_FEAT_DIR = '/test/zeroth/train_data_01/003'
8 +TRAIN_FEAT_DIR = '/test/merge_train_dataset'
9 +# TRAIN_FEAT_DIR = '/test/trainFeature'
10 +# TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
11 +TEST_FEAT_DIR = '/test/merge_test_dataset'
12 +# Context window size
13 +NUM_WIN_SIZE = 100  # context window size in frames (previous value: 10)
14 +
15 +# Settings for feature extraction
16 +USE_LOGSCALE = True
17 +USE_DELTA = False
18 +USE_SCALE = False
19 +SAMPLE_RATE = 16000
20 +FILTER_BANK = 40
\ No newline at end of file
1 +# Wave path
2 +TRAIN_WAV_DIR = '/home/admin/Desktop/read_25h_2/train'
3 +DEV_WAV_DIR = '/home/admin/Desktop/read_25h_2/dev'
4 +TEST_WAV_DIR = 'test_wavs'
5 +
6 +# Feature path
7 +# TRAIN_FEAT_DIR = '/test/zeroth/train_data_01/003'
8 +TRAIN_FEAT_DIR = '/test/zeroth_train_dataset'
9 +# TRAIN_FEAT_DIR = '/test/trainFeature'
10 +# TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
11 +TEST_FEAT_DIR = '/test/zeroth_test_dataset'
12 +# Context window size
13 +NUM_WIN_SIZE = 100  # context window size in frames (previous value: 10)
14 +
15 +# Settings for feature extraction
16 +USE_LOGSCALE = True
17 +USE_DELTA = False
18 +USE_SCALE = False
19 +SAMPLE_RATE = 16000
20 +FILTER_BANK = 40
\ No newline at end of file
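The two configure.py variants above differ only in which feature set they point at (the merged set vs. the zeroth set); the waveform paths and the 16 kHz / 40-filter log filterbank settings are shared. As a minimal sketch of how features matching these settings could be produced (assuming the python_speech_features package; the project's actual extraction script is not part of this diff):

import scipy.io.wavfile as wav
from python_speech_features import logfbank
import configure as c

def extract_logfbank(wav_path):
    # 40-dim log mel filterbank features, matching SAMPLE_RATE and
    # FILTER_BANK in configure.py
    rate, signal = wav.read(wav_path)
    assert rate == c.SAMPLE_RATE, 'expected 16 kHz audio'
    return logfbank(signal, samplerate=rate, nfilt=c.FILTER_BANK)  # (n_frames, 40)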
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model1 import background_resnet
13 +
14 +def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
15 + model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
16 + if use_cuda:
17 + model.cuda()
18 + print('=> loading checkpoint')
19 + # load the checkpoint dict saved by train.py
20 + checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth')
21 + # the checkpoint was saved without DataParallel, so the state dict loads directly
22 + model.load_state_dict(checkpoint['state_dict'])
23 + model.eval()
24 + return model
25 +
26 +def split_enroll_and_test(dataroot_dir):
27 + DB_all = read_feats_structure(dataroot_dir)
28 + enroll_DB = pd.DataFrame()
29 + test_DB = pd.DataFrame()
30 +
31 + enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')]
32 + test_DB = DB_all[DB_all['filename'].str.contains('test.p')]
33 +
34 + # Reset the index
35 + enroll_DB = enroll_DB.reset_index(drop=True)
36 + test_DB = test_DB.reset_index(drop=True)
37 + return enroll_DB, test_DB
38 +
39 +def get_embeddings(use_cuda, filename, model, test_frames):
40 + input, label = read_MFB(filename) # input size:(n_frames, n_dims)
41 +
42 + tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames'
43 + activation = 0
44 + with torch.no_grad():
45 + for i in range(tot_segments):
46 + temp_input = input[i*test_frames:i*test_frames+test_frames]
47 +
48 + TT = ToTensorTestInput()
49 + temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames)
50 +
51 + if use_cuda:
52 + temp_input = temp_input.cuda()
53 + temp_activation,_ = model(temp_input)
54 + activation += torch.sum(temp_activation, dim=0, keepdim=True)
55 +
56 + activation = l2_norm(activation, 1)
57 +
58 + return activation
59 +
60 +def l2_norm(input, alpha):
61 + input_size = input.size() # size:(n_frames, dim)
62 + buffer = torch.pow(input, 2) # 2 denotes a squared operation. size:(n_frames, dim)
63 + normp = torch.sum(buffer, 1).add_(1e-10) # size:(n_frames)
64 + norm = torch.sqrt(normp) # size:(n_frames)
65 + _output = torch.div(input, norm.view(-1, 1).expand_as(input))
66 + output = _output.view(input_size)
67 + # scale by alpha (https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10, but callers here pass alpha = 1)
68 + output = output * alpha
69 + return output
70 +
71 +def enroll_per_spk(use_cuda, test_frames, model, DB, embedding_dir):
72 + """
73 + Output the averaged d-vector for each speaker (enrollment)
74 + Return the dictionary (length of n_spk)
75 + """
76 + n_files = len(DB) # 10
77 + enroll_speaker_list = sorted(set(DB['speaker_id']))
78 +
79 + embeddings = {}
80 +
81 + # Aggregates all the activations
82 + print("Start to aggregate all the d-vectors per enroll speaker")
83 +
84 + for i in range(n_files):
85 + filename = DB['filename'][i]
86 + spk = DB['speaker_id'][i]
87 +
88 + activation = get_embeddings(use_cuda, filename, model, test_frames)
89 + if spk in embeddings:
90 + embeddings[spk] += activation
91 + else:
92 + embeddings[spk] = activation
93 +
94 + print("Aggregates the activation (spk : %s)" % (spk))
95 +
96 + if not os.path.exists(embedding_dir):
97 + os.makedirs(embedding_dir)
98 +
99 + # Save the embeddings
100 + for spk_index in enroll_speaker_list:
101 + embedding_path = os.path.join(embedding_dir, spk_index+'.pth')
102 + torch.save(embeddings[spk_index], embedding_path)
103 + print("Save the embeddings for %s" % (spk_index))
104 + return embeddings
105 +
106 +def main():
107 +
108 + # Settings
109 + use_cuda = True
110 + log_dir = 'new_model1'
111 + embedding_size = 128
112 + cp_num = 24 # Which checkpoint to use?
113 + n_classes = 241
114 + test_frames = 200
115 +
116 + # Load model from checkpoint
117 + model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
118 +
119 + # Get the dataframe for enroll DB
120 + enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
121 +
122 + # Where to save embeddings
123 + embedding_dir = 'enroll_embeddings1'
124 +
125 + # Perform the enrollment and save the results
126 + enroll_per_spk(use_cuda, test_frames, model, enroll_DB, embedding_dir)
127 +
128 + """ Test speaker list
129 + '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
130 + '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
131 + """
132 +
133 +if __name__ == '__main__':
134 + main()
\ No newline at end of file
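Note that enroll.py calls l2_norm(activation, 1), so despite the alpha = 10 remark in the comment, the stored d-vectors are plain unit-norm vectors. The manual normalization is equivalent to F.normalize up to the tiny epsilon; a self-contained check, assuming only PyTorch:

import torch
import torch.nn.functional as F

x = torch.randn(4, 128)  # four 128-dim embeddings
manual = x / torch.sqrt(torch.sum(x ** 2, dim=1, keepdim=True) + 1e-10)
builtin = F.normalize(x, p=2, dim=1)
print(torch.allclose(manual, builtin, atol=1e-6))  # True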
@@ -123,10 +123,10 @@ def main():
123     """
124
125     spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',\
126 -               '229M2031', '230M4087', '233F4013', '236M3043', '240M3063']
126 +               '229M2031', '230M4087', '233F4013', '236M3043', '240M3063','777M7777','778M8777']
127
128     # Set the test speaker
129 -   test_speaker = '230M4087'
129 +   test_speaker = '778M8777'
130
131     test_path = os.path.join(test_dir, test_speaker, 'test.p')
132
@@ -134,4 +134,4 @@ def main():
134     best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
135
136 if __name__ == '__main__':
137 -   main()
\ No newline at end of file
137 +   main()
...
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model1 import background_resnet
13 +
14 +def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
15 + model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
16 + if use_cuda:
17 + model.cuda()
18 + print('=> loading checkpoint')
19 + # load the checkpoint dict saved by train.py
20 + checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth')
21 + # the checkpoint was saved without DataParallel, so the state dict loads directly
22 + model.load_state_dict(checkpoint['state_dict'])
23 + model.eval()
24 + return model
25 +
26 +def split_enroll_and_test(dataroot_dir):
27 + DB_all = read_feats_structure(dataroot_dir)
28 + enroll_DB = pd.DataFrame()
29 + test_DB = pd.DataFrame()
30 +
31 + enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')]
32 + test_DB = DB_all[DB_all['filename'].str.contains('test.p')]
33 +
34 + # Reset the index
35 + enroll_DB = enroll_DB.reset_index(drop=True)
36 + test_DB = test_DB.reset_index(drop=True)
37 + return enroll_DB, test_DB
38 +
39 +def load_enroll_embeddings(embedding_dir):
40 + embeddings = {}
41 + for f in os.listdir(embedding_dir):
42 + spk = f.replace('.pth','')
43 + # Select the speakers who are in the 'enroll_spk_list'
44 + embedding_path = os.path.join(embedding_dir, f)
45 + tmp_embeddings = torch.load(embedding_path)
46 + embeddings[spk] = tmp_embeddings
47 +
48 + return embeddings
49 +
50 +def get_embeddings(use_cuda, filename, model, test_frames):
51 + input, label = read_MFB(filename) # input size:(n_frames, n_dims)
52 +
53 + tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames'
54 + activation = 0
55 + with torch.no_grad():
56 + for i in range(tot_segments):
57 + temp_input = input[i*test_frames:i*test_frames+test_frames]
58 +
59 + TT = ToTensorTestInput()
60 + temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames)
61 +
62 + if use_cuda:
63 + temp_input = temp_input.cuda()
64 + temp_activation,_ = model(temp_input)
65 + activation += torch.sum(temp_activation, dim=0, keepdim=True)
66 +
67 + activation = l2_norm(activation, 1)
68 +
69 + return activation
70 +
71 +def l2_norm(input, alpha):
72 + input_size = input.size() # size:(n_frames, dim)
73 + buffer = torch.pow(input, 2) # 2 denotes a squared operation. size:(n_frames, dim)
74 + normp = torch.sum(buffer, 1).add_(1e-10) # size:(n_frames)
75 + norm = torch.sqrt(normp) # size:(n_frames)
76 + _output = torch.div(input, norm.view(-1, 1).expand_as(input))
77 + output = _output.view(input_size)
78 + # scale by alpha (https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10, but callers here pass alpha = 1)
79 + output = output * alpha
80 + return output
81 +
82 +def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
83 + test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
84 + max_score = -10**8
85 + best_spk = None
86 + for spk in spk_list:
87 + score = F.cosine_similarity(test_embedding, embeddings[spk])
88 + score = score.data.cpu().numpy()
89 + if score > max_score:
90 + max_score = score
91 + best_spk = spk
92 + #print("Speaker identification result : %s" %best_spk)
93 + true_spk = test_filename.split('/')[-2].split('_')[0]
94 + print("\n=== Speaker identification ===")
95 + print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
96 + return best_spk
97 +
98 +def main():
99 +
100 + log_dir = 'new_model1' # Where the checkpoints are saved
101 + embedding_dir = 'enroll_embeddings1' # Where embeddings are saved
102 + test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
103 +
104 + # Settings
105 + use_cuda = True # Use cuda or not
106 + embedding_size = 128 # Dimension of speaker embeddings
107 + cp_num = 30 # Which checkpoint to use?
108 + n_classes = 241 # How many speakers in training data?
109 + test_frames = 100 # Split the test utterance
110 +
111 + # Load model from checkpoint
112 + model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
113 +
114 + # Get the dataframe for test DB
115 + enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
116 +
117 + # Load enroll embeddings
118 + embeddings = load_enroll_embeddings(embedding_dir)
119 +
120 + """ Test speaker list
121 + '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
122 + '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
123 + """
124 +
125 + spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',\
126 + '229M2031', '230M4087', '233F4013', '236M3043', '240M3063','777M7777','778M8777']
127 +
128 + # Set the test speaker
129 + test_speaker = '213F5100'
130 +
131 + test_path = os.path.join(test_dir, test_speaker, 'test.p')
132 +
133 + # Perform the test
134 + best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
135 +
136 +if __name__ == '__main__':
137 + main()
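perform_identification above is an argmax over per-speaker cosine scores. A compact equivalent, assuming the same embeddings dict of (1, D) tensors that load_enroll_embeddings returns:

import torch.nn.functional as F

def identify(test_embedding, embeddings):
    # pick the enrolled speaker whose d-vector is most similar to the test embedding
    scores = {spk: F.cosine_similarity(test_embedding, emb).item()
              for spk, emb in embeddings.items()}
    return max(scores, key=scores.get)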
1 +import torch
2 +import torch.nn as nn
3 +import torch.nn.functional as F
4 +from torch.autograd import Function
5 +import model.resnet as resnet
6 +
7 +
8 +class background_resnet(nn.Module):
9 + def __init__(self, embedding_size, num_classes, backbone='resnet18'):
10 + super(background_resnet, self).__init__()
11 + self.backbone = backbone
12 + # copying modules from pretrained models
13 + if backbone == 'resnet50':
14 + self.pretrained = resnet.resnet50(pretrained=False)
15 + elif backbone == 'resnet101':
16 + self.pretrained = resnet.resnet101(pretrained=False)
17 + elif backbone == 'resnet152':
18 + self.pretrained = resnet.resnet152(pretrained=False)
19 + elif backbone == 'resnet18':
20 + self.pretrained = resnet.resnet18(pretrained=False)
21 + elif backbone == 'resnet34':
22 + self.pretrained = resnet.resnet34(pretrained=False)
23 + else:
24 + raise RuntimeError('unknown backbone: {}'.format(backbone))
25 +
26 + self.fc0 = nn.Linear(128, embedding_size)
27 + self.bn0 = nn.BatchNorm1d(embedding_size)
28 + self.relu = nn.ReLU()
29 + self.last = nn.Linear(embedding_size, num_classes)
30 +
31 + def forward(self, x):
32 + # input x: minibatch x 1 x 40 x 40
33 + x = self.pretrained.conv1(x)
34 + x = self.pretrained.bn1(x)
35 + x = self.pretrained.relu(x)
36 +
37 + x = self.pretrained.layer1(x)
38 + x = self.pretrained.layer2(x)
39 + x = self.pretrained.layer3(x)
40 + x = self.pretrained.layer4(x)
41 +
42 + out = F.adaptive_avg_pool2d(x,1) # [batch, 128, 1, 1]
43 + out = torch.squeeze(out) # [batch, n_embed]
44 + # flatten the out so that the fully connected layer can be connected from here
45 + out = out.view(x.size(0), -1) # (n_batch, n_embed)
46 + spk_embedding = self.fc0(out)
47 + out = F.relu(self.bn0(spk_embedding)) # [batch, n_embed]
48 + out = self.last(out)
49 +
50 + return spk_embedding, out
\ No newline at end of file
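A quick shape check of background_resnet on a dummy log-fbank segment; the (1, 40, 100) layout matches ToTensorTestInput with FILTER_BANK = 40 and test_frames = 100 (the batch size and values here are made up, and the model package is assumed importable):

import torch
from model.model1 import background_resnet

model = background_resnet(embedding_size=128, num_classes=241)
model.eval()
with torch.no_grad():
    x = torch.randn(2, 1, 40, 100)  # (batch, channel, n_dims, n_frames)
    emb, logits = model(x)
print(emb.shape, logits.shape)  # torch.Size([2, 128]) torch.Size([2, 241])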
@@ -113,6 +113,7 @@ class ResNet(nn.Module):
113         self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
114         self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
115         self.layer4 = self._make_layer(block, 128, layers[3], stride=2)
116 +
117         self.avgpool = nn.AvgPool2d(1, stride=1)
118         self.fc = nn.Linear(128 * block.expansion, num_classes)
119
...
1 +"""Imported from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
2 +and added support for the 1x32x32 mel spectrogram for the speech recognition.
3 +Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun: Deep Residual Learning for Image Recognition
4 +https://arxiv.org/abs/1512.03385
5 +"""
6 +
7 +import torch.nn as nn
8 +import math
9 +import torch.utils.model_zoo as model_zoo
10 +
11 +
12 +__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
13 + 'resnet152']
14 +
15 +
16 +model_urls = {
17 + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
18 + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
19 + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
20 + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
21 + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
22 +}
23 +
24 +
25 +def conv3x3(in_planes, out_planes, stride=1):
26 + """3x3 convolution with padding"""
27 + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
28 + padding=1, bias=False)
29 +
30 +
31 +class BasicBlock(nn.Module):
32 + expansion = 1
33 +
34 + def __init__(self, inplanes, planes, stride=1, downsample=None):
35 + super(BasicBlock, self).__init__()
36 + self.conv1 = conv3x3(inplanes, planes, stride)
37 + self.bn1 = nn.BatchNorm2d(planes)
38 + self.relu = nn.ReLU(inplace=True)
39 + self.conv2 = conv3x3(planes, planes)
40 + self.bn2 = nn.BatchNorm2d(planes)
41 + self.downsample = downsample
42 + self.stride = stride
43 +
44 + def forward(self, x):
45 + residual = x
46 +
47 + out = self.conv1(x)
48 + out = self.bn1(out)
49 + out = self.relu(out)
50 +
51 + out = self.conv2(out)
52 + out = self.bn2(out)
53 +
54 + if self.downsample is not None:
55 + residual = self.downsample(x)
56 +
57 + out += residual
58 + out = self.relu(out)
59 +
60 + return out
61 +
62 +
63 +class Bottleneck(nn.Module):
64 + expansion = 4
65 +
66 + def __init__(self, inplanes, planes, stride=1, downsample=None):
67 + super(Bottleneck, self).__init__()
68 + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
69 + self.bn1 = nn.BatchNorm2d(planes)
70 + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
71 + padding=1, bias=False)
72 + self.bn2 = nn.BatchNorm2d(planes)
73 + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
74 + self.bn3 = nn.BatchNorm2d(planes * 4)
75 + self.relu = nn.ReLU(inplace=True)
76 + self.downsample = downsample
77 + self.stride = stride
78 +
79 + def forward(self, x):
80 + residual = x
81 +
82 + out = self.conv1(x)
83 + out = self.bn1(out)
84 + out = self.relu(out)
85 +
86 + out = self.conv2(out)
87 + out = self.bn2(out)
88 + out = self.relu(out)
89 +
90 + out = self.conv3(out)
91 + out = self.bn3(out)
92 +
93 + if self.downsample is not None:
94 + residual = self.downsample(x)
95 +
96 + out += residual
97 + out = self.relu(out)
98 +
99 + return out
100 +
101 +
102 +class ResNet(nn.Module):
103 +
104 + def __init__(self, block, layers, num_classes=1000, in_channels=1):
105 + self.inplanes = 16
106 + super(ResNet, self).__init__()
107 + self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=7, stride=1, padding=3,
108 + bias=False)  # torchvision's original stem uses stride=2; stride=1 keeps resolution for small spectrogram inputs
109 + self.bn1 = nn.BatchNorm2d(16)
110 + self.relu = nn.ReLU(inplace=True)
111 + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
112 + self.layer1 = self._make_layer(block, 16, layers[0])
113 + self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
114 + self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
115 + self.layer4 = self._make_layer(block, 128, layers[3], stride=2)
116 + self.layer5 = self._make_layer(block, 256, layers[3], stride=2)  # NOTE: reuses layers[3] and is never called in forward()
117 + self.avgpool = nn.AvgPool2d(1, stride=1)
118 + self.fc = nn.Linear(128 * block.expansion, num_classes)
119 +
120 + for m in self.modules():
121 + if isinstance(m, nn.Conv2d):
122 + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
123 + m.weight.data.normal_(0, math.sqrt(2. / n))
124 + elif isinstance(m, nn.BatchNorm2d):
125 + m.weight.data.fill_(1)
126 + m.bias.data.zero_()
127 +
128 + def _make_layer(self, block, planes, blocks, stride=1):
129 + downsample = None
130 + if stride != 1 or self.inplanes != planes * block.expansion:
131 + downsample = nn.Sequential(
132 + nn.Conv2d(self.inplanes, planes * block.expansion,
133 + kernel_size=1, stride=stride, bias=False),
134 + nn.BatchNorm2d(planes * block.expansion),
135 + )
136 +
137 + layers = []
138 + layers.append(block(self.inplanes, planes, stride, downsample))
139 + self.inplanes = planes * block.expansion
140 + for i in range(1, blocks):
141 + layers.append(block(self.inplanes, planes))
142 +
143 + return nn.Sequential(*layers)
144 +
145 + def forward(self, x):
146 + x = self.conv1(x)
147 + x = self.bn1(x)
148 + x = self.relu(x)
149 + x = self.maxpool(x)
150 +
151 + x = self.layer1(x)
152 + x = self.layer2(x)
153 + x = self.layer3(x)
154 + x = self.layer4(x)
155 +
156 + x = self.avgpool(x)
157 + x = x.view(x.size(0), -1)
158 + x = self.fc(x)
159 +
160 + return x
161 +
162 +
163 +def resnet18(pretrained=False, **kwargs):
164 + """Constructs a ResNet-18 model.
165 + Args:
166 + pretrained (bool): If True, returns a model pre-trained on ImageNet
167 + """
168 + model = ResNet(BasicBlock, [2, 2, 2, 2, 2], **kwargs)
169 + if pretrained:
170 + model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
171 + return model
172 +
173 +
174 +def resnet34(pretrained=False, **kwargs):
175 + """Constructs a ResNet-34 model.
176 + Args:
177 + pretrained (bool): If True, returns a model pre-trained on ImageNet
178 + """
179 + model = ResNet(BasicBlock, [3, 4, 6, 3, 3], **kwargs)
180 + if pretrained:
181 + model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
182 + return model
183 +
184 +
185 +def resnet50(pretrained=False, **kwargs):
186 + """Constructs a ResNet-50 model.
187 + Args:
188 + pretrained (bool): If True, returns a model pre-trained on ImageNet
189 + """
190 + model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
191 + if pretrained:
192 + model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
193 + return model
194 +
195 +
196 +def resnet101(pretrained=False, **kwargs):
197 + """Constructs a ResNet-101 model.
198 + Args:
199 + pretrained (bool): If True, returns a model pre-trained on ImageNet
200 + """
201 + model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
202 + if pretrained:
203 + model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
204 + return model
205 +
206 +
207 +def resnet152(pretrained=False, **kwargs):
208 + """Constructs a ResNet-152 model.
209 + Args:
210 + pretrained (bool): If True, returns a model pre-trained on ImageNet
211 + """
212 + model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
213 + if pretrained:
214 + model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
215 + return model
\ No newline at end of file
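As flagged inline, layer5 is registered in __init__ but never called in forward() (and background_resnet also stops at layer4), so its parameters are trained-but-dead weight; the fifth entry in the resnet18/resnet34 layer lists is likewise ignored because layer5 reuses layers[3]. A quick way to count the orphaned parameters, assuming the module is importable as model.resnet:

import model.resnet as resnet

net = resnet.resnet18(pretrained=False, num_classes=241)
dead = sum(p.numel() for name, p in net.named_parameters()
           if name.startswith('layer5'))
print('parameters never used by forward():', dead)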
1 +import torch
2 +import torch.nn as nn
3 +import torch.optim as optim
4 +import torchvision.transforms as transforms
5 +
6 +import time
7 +import os
8 +import numpy as np
9 +import configure as c
10 +import pandas as pd
11 +from DB_wav_reader import read_feats_structure
12 +from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
13 +from model.model1 import background_resnet
14 +import matplotlib.pyplot as plt
15 +import pandas as pd  # duplicate import; pandas is already imported above
16 +def load_dataset(val_ratio):
17 + # Load training set and validation set
18 +
19 +
20 + # Split training set into training set and validation set according to "val_ratio"
21 + train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)
22 +
23 + file_loader = read_MFB # numpy array:(n_frames, n_dims)
24 +
25 + transform = transforms.Compose([
26 + TruncatedInputfromMFB(), # numpy array:(1, n_frames, n_dims)
27 + ToTensorInput() # torch tensor:(1, n_dims, n_frames)
28 + ])
29 + transform_T = ToTensorDevInput()
30 +
31 +
32 + speaker_list = sorted(set(train_DB['speaker_id'])) # len(speaker_list) == n_speakers
33 + spk_to_idx = {spk: i for i, spk in enumerate(speaker_list)}
34 +
35 + train_dataset = DvectorDataset(DB=train_DB, loader=file_loader, transform=transform, spk_to_idx=spk_to_idx)
36 + valid_dataset = DvectorDataset(DB=valid_DB, loader=file_loader, transform=transform_T, spk_to_idx=spk_to_idx)
37 +
38 + n_classes = len(speaker_list) # How many speakers? 240
39 + return train_dataset, valid_dataset, n_classes
40 +
41 +def split_train_dev(train_feat_dir, valid_ratio):
42 + train_valid_DB = read_feats_structure(train_feat_dir)
43 + total_len = len(train_valid_DB) # 148642
44 + valid_len = int(total_len * valid_ratio/100.)
45 + train_len = total_len - valid_len
46 + shuffled_train_valid_DB = train_valid_DB.sample(frac=1).reset_index(drop=True)
47 + # Split the DB into train and valid set
48 + train_DB = shuffled_train_valid_DB.iloc[:train_len]
49 + valid_DB = shuffled_train_valid_DB.iloc[train_len:]
50 + # Reset the index
51 + train_DB = train_DB.reset_index(drop=True)
52 + valid_DB = valid_DB.reset_index(drop=True)
53 + print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
54 + print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
55 + print('Total %d utts' %(total_len))
56 +
57 + return train_DB, valid_DB
58 +
59 +def main():
60 + # Set hyperparameters
61 + use_cuda = True # use gpu or cpu
62 + val_ratio = 10 # Percentage of validation set
63 + embedding_size = 128
64 + start = 1 # Start epoch
65 + n_epochs = 30 # How many epochs?
66 + end = start + n_epochs # Last epoch
67 +
68 + lr = 1e-1 # Initial learning rate
69 + wd = 1e-4 # Weight decay (L2 penalty)
70 + optimizer_type = 'sgd' # ex) sgd, adam, adagrad
71 +
72 + batch_size = 64 # Batch size for training
73 + valid_batch_size = 16 # Batch size for validation
74 + use_shuffle = True # Shuffle for training or not
75 +
76 + # Load dataset
77 + train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)
78 +
79 + # print the experiment configuration
80 + print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))
81 +
82 + log_dir = 'new_model1' # where to save checkpoints
83 +
84 + if not os.path.exists(log_dir):
85 + os.makedirs(log_dir)
86 +
87 + # instantiate model and initialize weights
88 + model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
89 +
90 + if use_cuda:
91 + model.cuda()
92 +
93 + # define loss function (criterion), optimizer and scheduler
94 + criterion = nn.CrossEntropyLoss()
95 + optimizer = create_optimizer(optimizer_type, model, lr, wd)
96 + scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)
97 +
98 + train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
99 + batch_size=batch_size,
100 + shuffle=use_shuffle)
101 + valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
102 + batch_size=valid_batch_size,
103 + shuffle=False,
104 + collate_fn = collate_fn_feat_padded)
105 +
106 + # to track the average training loss per epoch as the model trains
107 + avg_train_losses = []
108 + # to track the average validation loss per epoch as the model trains
109 + avg_valid_losses = []
110 +
111 +
112 + for epoch in range(start, end):
113 +
114 + # train for one epoch
115 + train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)
116 +
117 + # evaluate on validation set
118 + valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)
119 +
120 + scheduler.step(valid_loss, epoch)
121 +
122 + # calculate average loss over an epoch
123 + avg_train_losses.append(train_loss)
124 + avg_valid_losses.append(valid_loss)
125 + # do checkpointing
126 + torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
127 + 'optimizer': optimizer.state_dict()},
128 + '{}/checkpoint_{}.pth'.format(log_dir, epoch))
129 +
130 + # find position of lowest validation loss
131 + minposs = avg_valid_losses.index(min(avg_valid_losses))+1
132 + print('Lowest validation loss at epoch %d' %minposs)
133 +
134 + # visualize the loss and learning rate as the network trained
135 + visualize_the_losses(avg_train_losses, avg_valid_losses)
136 +
137 +
138 +def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
139 + batch_time = AverageMeter()
140 + losses = AverageMeter()
141 + train_acc = AverageMeter()
142 +
143 + n_correct, n_total = 0, 0
144 + log_interval = 84
145 + # switch to train mode
146 + model.train()
147 +
148 + end = time.time()
149 + # pbar = tqdm(enumerate(train_loader))
150 + for batch_idx, (data) in enumerate(train_loader):
151 + inputs, targets = data # target size:(batch size,1), input size:(batch size, 1, dim, win)
152 + targets = targets.view(-1) # target size:(batch size)
153 + current_sample = inputs.size(0) # batch size
154 +
155 + if use_cuda:
156 + inputs = inputs.cuda()
157 + targets = targets.cuda()
158 + _, output = model(inputs) # out size:(batch size, #classes), for softmax
159 +
160 + # calculate accuracy of predictions in the current batch
161 + n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item()
162 + n_total += current_sample
163 + train_acc_temp = 100. * n_correct / n_total
164 + train_acc.update(train_acc_temp, inputs.size(0))
165 +
166 + loss = criterion(output, targets)
167 + losses.update(loss.item(), inputs.size(0))
168 +
169 + # compute gradient and do SGD step
170 + optimizer.zero_grad()
171 + loss.backward()
172 + optimizer.step()
173 +
174 + # measure elapsed time
175 + batch_time.update(time.time() - end)
176 + end = time.time()
177 +
178 + if batch_idx % log_interval == 0:
179 + print(
180 + 'Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
181 + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
182 + 'Loss {loss.avg:.4f}\t'
183 + 'Acc {train_acc.avg:.4f}'.format(
184 + epoch, batch_idx * len(inputs), len(train_loader.dataset),
185 + 100. * batch_idx / len(train_loader),
186 + batch_time=batch_time, loss=losses, train_acc=train_acc))
187 + return losses.avg
188 +
189 +def validate(val_loader, model, criterion, use_cuda, epoch):
190 + batch_time = AverageMeter()
191 + losses = AverageMeter()
192 + val_acc = AverageMeter()
193 +
194 + n_correct, n_total = 0, 0
195 +
196 + # switch to evaluate mode
197 + model.eval()
198 +
199 + with torch.no_grad():
200 + end = time.time()
201 + for i, (data) in enumerate(val_loader):
202 + inputs, targets = data
203 + current_sample = inputs.size(0) # batch size
204 +
205 + if use_cuda:
206 + inputs = inputs.cuda()
207 + targets = targets.cuda()
208 +
209 + # compute output
210 + _, output = model(inputs)
211 +
212 + # measure accuracy and record loss
213 + n_correct += (torch.max(output, 1)[1].long().view(targets.size()) == targets).sum().item()
214 + n_total += current_sample
215 + val_acc_temp = 100. * n_correct / n_total
216 + val_acc.update(val_acc_temp, inputs.size(0))
217 +
218 + loss = criterion(output, targets)
219 + losses.update(loss.item(), inputs.size(0))
220 + # measure elapsed time
221 + batch_time.update(time.time() - end)
222 + end = time.time()
223 +
224 + print(' * Validation: '
225 + 'Loss {loss.avg:.4f}\t'
226 + 'Acc {val_acc.avg:.4f}'.format(
227 + loss=losses, val_acc=val_acc))
228 +
229 + return losses.avg
230 +
231 +class AverageMeter(object):
232 + """Computes and stores the average and current value"""
233 + def __init__(self):
234 + self.reset()
235 + def reset(self):
236 + self.val = 0
237 + self.avg = 0
238 + self.sum = 0
239 + self.count = 0
240 + def update(self, val, n=1):
241 + self.val = val
242 + self.sum += val * n
243 + self.count += n
244 + self.avg = self.sum / self.count
245 +
246 +def create_optimizer(optimizer, model, new_lr, wd):
247 + # setup optimizer
248 + if optimizer == 'sgd':
249 + optimizer = optim.SGD(model.parameters(), lr=new_lr,
250 + momentum=0.9, dampening=0,
251 + weight_decay=wd)
252 + elif optimizer == 'adam':
253 + optimizer = optim.Adam(model.parameters(), lr=new_lr,
254 + weight_decay=wd)
255 + elif optimizer == 'adagrad':
256 + optimizer = optim.Adagrad(model.parameters(),
257 + lr=new_lr,
258 + weight_decay=wd)
259 + return optimizer
260 +
261 +def visualize_the_losses(train_loss, valid_loss):
262 + epoch = []
263 + for i in range(1, len(train_loss) + 1):
264 +     epoch.append(i)
265 + with open("file.txt", "w") as output:
266 + output.write(str(epoch))
267 + output.write('\n')
268 + output.write(str(train_loss))
269 + output.write('\n')
270 + output.write(str(valid_loss))
271 + # fig = plt.figure(figsize=(10,8))
272 + # plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss')
273 + # plt.plot(range(1,len(valid_loss)+1),valid_loss, label='Validation Loss')
274 +
275 + # find position of lowest validation loss
276 + # minposs = valid_loss.index(min(valid_loss))+1
277 + # plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint')
278 +
279 + # plt.xlabel('epochs')
280 + # plt.ylabel('loss')
281 + # plt.ylim(0, 3.5) # consistent scale
282 + # plt.xlim(0, len(train_loss)+1) # consistent scale
283 + # plt.grid(True)
284 + # plt.legend()
285 + # plt.tight_layout()
286 + #plt.show()
287 + # fig.savefig('loss_plot.png', bbox_inches='tight')
288 +
289 +if __name__ == '__main__':
290 + main()
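train.py saves one checkpoint per epoch as a plain dict with 'epoch', 'state_dict' and 'optimizer' keys, which is exactly what load_model in the evaluation scripts reads back. A minimal resume sketch (the path matches the defaults above; choosing checkpoint 30 is an assumption):

import torch
from model.model1 import background_resnet

model = background_resnet(embedding_size=128, num_classes=241)
ckpt = torch.load('new_model1/checkpoint_30.pth', map_location='cpu')
model.load_state_dict(ckpt['state_dict'])
start_epoch = ckpt['epoch']  # train.py stores epoch + 1 at save time
print('would resume from epoch', start_epoch)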
1 +import torch
2 +import torch.nn.functional as F
3 +from torch.autograd import Variable
4 +
5 +import pandas as pd
6 +import math
7 +import os
8 +import configure as c
9 +
10 +from DB_wav_reader import read_feats_structure
11 +from SR_Dataset import read_MFB, ToTensorTestInput
12 +from model.model1 import background_resnet
13 +
14 +def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
15 + model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
16 + if use_cuda:
17 + model.cuda()
18 + print('=> loading checkpoint')
19 + # load the checkpoint dict saved by train.py
20 + checkpoint = torch.load(log_dir + '/checkpoint_' + str(cp_num) + '.pth')
21 + # the checkpoint was saved without DataParallel, so the state dict loads directly
22 + model.load_state_dict(checkpoint['state_dict'])
23 + model.eval()
24 + return model
25 +
26 +def split_enroll_and_test(dataroot_dir):
27 + DB_all = read_feats_structure(dataroot_dir)
28 + enroll_DB = pd.DataFrame()
29 + test_DB = pd.DataFrame()
30 +
31 + enroll_DB = DB_all[DB_all['filename'].str.contains('enroll.p')]
32 + test_DB = DB_all[DB_all['filename'].str.contains('test.p')]
33 +
34 + # Reset the index
35 + enroll_DB = enroll_DB.reset_index(drop=True)
36 + test_DB = test_DB.reset_index(drop=True)
37 + return enroll_DB, test_DB
38 +
39 +def load_enroll_embeddings(embedding_dir):
40 + embeddings = {}
41 + for f in os.listdir(embedding_dir):
42 + spk = f.replace('.pth','')
43 + # Select the speakers who are in the 'enroll_spk_list'
44 + embedding_path = os.path.join(embedding_dir, f)
45 + tmp_embeddings = torch.load(embedding_path)
46 + embeddings[spk] = tmp_embeddings
47 +
48 + return embeddings
49 +
50 +def get_embeddings(use_cuda, filename, model, test_frames):
51 + input, label = read_MFB(filename) # input size:(n_frames, n_dims)
52 +
53 + tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames'
54 + activation = 0
55 + with torch.no_grad():
56 + for i in range(tot_segments):
57 + temp_input = input[i*test_frames:i*test_frames+test_frames]
58 +
59 + TT = ToTensorTestInput()
60 + temp_input = TT(temp_input) # size:(1, 1, n_dims, n_frames)
61 +
62 + if use_cuda:
63 + temp_input = temp_input.cuda()
64 + temp_activation,_ = model(temp_input)
65 + activation += torch.sum(temp_activation, dim=0, keepdim=True)
66 +
67 + activation = l2_norm(activation, 1)
68 +
69 + return activation
70 +
71 +def l2_norm(input, alpha):
72 + input_size = input.size() # size:(n_frames, dim)
73 + buffer = torch.pow(input, 2) # 2 denotes a squared operation. size:(n_frames, dim)
74 + normp = torch.sum(buffer, 1).add_(1e-10) # size:(n_frames)
75 + norm = torch.sqrt(normp) # size:(n_frames)
76 + _output = torch.div(input, norm.view(-1, 1).expand_as(input))
77 + output = _output.view(input_size)
78 + # scale by alpha (https://arxiv.org/pdf/1703.09507.pdf suggests alpha = 10, but callers here pass alpha = 1)
79 + output = output * alpha
80 + return output
81 +
82 +def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
83 + enroll_embedding = embeddings[enroll_speaker]
84 + test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)
85 +
86 + score = F.cosine_similarity(test_embedding, enroll_embedding)
87 + score = score.data.cpu().numpy()
88 +
89 + if score > thres:
90 + result = 'Accept'
91 + else:
92 + result = 'Reject'
93 +
94 + test_spk = test_filename.split('/')[-2].split('_')[0]
95 + print("\n=== Speaker verification ===")
96 + print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
97 + print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
98 +
99 +def main():
100 +
101 + log_dir = 'new_model1' # Where the checkpoints are saved
102 + embedding_dir = 'enroll_embeddings1' # Where embeddings are saved
103 + test_dir = 'feat_logfbank_nfilt40/test/' # Where test features are saved
104 +
105 + # Settings
106 + use_cuda = True # Use cuda or not
107 + embedding_size = 128 # Dimension of speaker embeddings
108 + cp_num = 29 # Which checkpoint to use?
109 + n_classes = 241 # How many speakers in training data?
110 + test_frames = 100 # Split the test utterance
111 +
112 + # Load model from checkpoint
113 + model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
114 +
115 + # Get the dataframe for test DB
116 + enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)
117 +
118 + # Load enroll embeddings
119 + embeddings = load_enroll_embeddings(embedding_dir)
120 +
121 + """ Test speaker list
122 + '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
123 + '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'
124 + """
125 +
126 + # Set the true speaker
127 + enroll_speaker = 'zerothfloac'
128 +
129 + # Set the claimed speaker
130 + test_speaker = 'zerothfloac'
131 +
132 + # Threshold
133 + thres = 0.95
134 +
135 + test_path = os.path.join(test_dir, test_speaker, 'test.p')
136 +
137 + # Perform the test
138 + perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)
139 +
140 +if __name__ == '__main__':
141 + main()
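Verification reduces to thresholding a single cosine score against thres = 0.95. A toy illustration of the accept/reject rule on synthetic unit-norm embeddings (all values made up):

import torch
import torch.nn.functional as F

enroll = F.normalize(torch.randn(1, 128), dim=1)
test = F.normalize(enroll + 0.01 * torch.randn(1, 128), dim=1)  # near-duplicate
score = F.cosine_similarity(test, enroll).item()
print('Accept' if score > 0.95 else 'Reject', round(score, 4))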