최성환
<?xml version="1.0" encoding="UTF-8"?>
<!-- IntelliJ IDEA module descriptor: declares a JAVA_MODULE whose content root is the module directory. -->
<module type="JAVA_MODULE" version="4">
<!-- Root manager: the compiler output location is inherited rather than set here. -->
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<!-- Content root: the directory that contains this module file. -->
<content url="file://$MODULE_DIR$" />
<!-- Dependency order: the inherited JDK first, then the module's own (non-test) sources. -->
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- Project-level IDE settings: language level JDK_14 and a project JDK named "14" of type JavaSDK. -->
<project version="4">
<component name="ProjectRootManager" version="2" languageLevel="JDK_14" project-jdk-name="14" project-jdk-type="JavaSDK">
<!-- Compiled output is written below the project directory, in "out". -->
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- Module registry of the project: lists every module file that belongs to it. -->
<project version="4">
<component name="ProjectModuleManager">
<modules>
<!-- Single module, stored inside the .idea directory (fileurl and filepath point at the same file). -->
<module fileurl="file://$PROJECT_DIR$/.idea/LYG_project.iml" filepath="$PROJECT_DIR$/.idea/LYG_project.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- Version-control mapping used by the IDE. -->
<project version="4">
<component name="VcsDirectoryMappings">
<!-- An empty directory attribute is mapped to Git (presumably meaning the project root; confirm against the IDE documentation). -->
<mapping directory="" vcs="Git" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- Per-user IDE workspace state (change lists, window positions, recently used paths). -->
<!-- NOTE(review): several values below are machine-specific (absolute Windows path, screen geometry); this file is usually kept out of version control. Confirm with the team. -->
<project version="4">
<!-- Local change-list bookkeeping; the list id is referenced again by the TaskManager component. -->
<component name="ChangeListManager">
<list default="true" id="2b896c0d-f1d9-4424-819a-bc96fe07a387" name="Default Changelist" comment="" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="Git.Settings">
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
</component>
<component name="MavenImportPreferences">
<option name="generalSettings">
<MavenGeneralSettings>
<!-- Absolute path into one local IntelliJ installation; it will not resolve on other machines. -->
<option name="mavenHome" value="C:\Program Files\JetBrains\IntelliJ IDEA Community Edition 2020.2\plugins\maven\lib\maven3" />
</MavenGeneralSettings>
</option>
</component>
<component name="ProjectId" id="1kPBM4RUtUJvLFOYYSRQMuBxazR" />
<!-- Project tool-window display options. -->
<component name="ProjectViewState">
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent">
<property name="RunOnceActivity.OpenProjectViewOnStart" value="true" />
<!-- Last path used in a file dialog; it points outside this project (four levels up, then spring/Myprj). -->
<property name="last_opened_file_path" value="$PROJECT_DIR$/../../../../spring/Myprj" />
</component>
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
<!-- The only task is the default one, bound to the default change list above. -->
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="2b896c0d-f1d9-4424-819a-bc96fe07a387" name="Default Changelist" comment="" />
<!-- created/updated look like epoch-millisecond timestamps (assumption; confirm). -->
<created>1605592296286</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1605592296286</updated>
</task>
<servers />
</component>
<!-- Remembered dialog position, stored once plainly and once under a key that embeds screen rectangles. -->
<component name="WindowStateProjectService">
<state x="740" y="275" key="FileChooserDialogImpl" timestamp="1605592333707">
<screen x="0" y="0" width="1920" height="1040" />
</state>
<state x="740" y="275" key="FileChooserDialogImpl/0.0.1920.1040/-1920.65.1920.1040@0.0.1920.1040" timestamp="1605592333707" />
</component>
</project>
\ No newline at end of file
......@@ -5,6 +5,8 @@ TEST_WAV_DIR = 'test_wavs'
# Feature path
TRAIN_FEAT_DIR = 'feat_logfbank_nfilt40/train'
# TRAIN_FEAT_DIR = '/test/merge_dataset'
# TRAIN_FEAT_DIR = '/test/trainFeature'
TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
# Context window size
......
# Settings module for the "merge" dataset (presumably the module the scripts import as `configure`; confirm).
# Wave path: locations of the raw audio. The train/dev entries are absolute, machine-specific paths.
TRAIN_WAV_DIR = '/home/admin/Desktop/read_25h_2/train'
DEV_WAV_DIR = '/home/admin/Desktop/read_25h_2/dev'
TEST_WAV_DIR = 'test_wavs'
# Feature path: directories of extracted features.
# The commented-out assignments are earlier alternatives kept for quick switching.
# TRAIN_FEAT_DIR = '/test/zeroth/train_data_01/003'
TRAIN_FEAT_DIR = '/test/merge_train_dataset'
# TRAIN_FEAT_DIR = '/test/trainFeature'
# TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
TEST_FEAT_DIR = '/test/merge_test_dataset'
# Context window size
NUM_WIN_SIZE = 100 # trailing note in the original was "10" (presumably the previous value; confirm)
# Settings for feature extraction
USE_LOGSCALE = True
USE_DELTA = False
USE_SCALE = False
SAMPLE_RATE = 16000 # presumably Hz; confirm against the feature extractor
FILTER_BANK = 40 # presumably the number of filterbank channels (the feature directory names say "nfilt40"); confirm
\ No newline at end of file
# Settings module for the "zeroth" dataset (presumably the module the scripts import as `configure`; confirm).
# Wave path: locations of the raw audio. The train/dev entries are absolute, machine-specific paths.
TRAIN_WAV_DIR = '/home/admin/Desktop/read_25h_2/train'
DEV_WAV_DIR = '/home/admin/Desktop/read_25h_2/dev'
TEST_WAV_DIR = 'test_wavs'
# Feature path: directories of extracted features.
# The commented-out assignments are earlier alternatives kept for quick switching.
# TRAIN_FEAT_DIR = '/test/zeroth/train_data_01/003'
TRAIN_FEAT_DIR = '/test/zeroth_train_dataset'
# TRAIN_FEAT_DIR = '/test/trainFeature'
# TEST_FEAT_DIR = 'feat_logfbank_nfilt40/test'
TEST_FEAT_DIR = '/test/zeroth_test_dataset'
# Context window size
NUM_WIN_SIZE = 100 # trailing note in the original was "10" (presumably the previous value; confirm)
# Settings for feature extraction
USE_LOGSCALE = True
USE_DELTA = False
USE_SCALE = False
SAMPLE_RATE = 16000 # presumably Hz; confirm against the feature extractor
FILTER_BANK = 40 # presumably the number of filterbank channels (the feature directory names say "nfilt40"); confirm
\ No newline at end of file
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model1 import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: When true the model is moved to the GPU before loading.
        log_dir: Directory holding the ``checkpoint_<cp_num>.pth`` files.
        cp_num: Number of the checkpoint to load.
        embedding_size: Forwarded to ``background_resnet``.
        n_classes: Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model, switched to evaluation mode, with ``checkpoint['state_dict']`` loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()

    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Without map_location a checkpoint written on a GPU cannot be read on a
    # CPU-only run; on the CUDA path the stored device is kept as before.
    map_location = None if use_cuda else 'cpu'
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing of ``dataroot_dir`` into enrollment and test rows.

    Args:
        dataroot_dir: Directory handed to ``read_feats_structure``.

    Returns:
        ``(enroll_DB, test_DB)``: the rows whose ``filename`` contains
        ``'enroll.p'`` and ``'test.p'`` respectively, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']
    # regex=False: the markers are literal file names. As a regular expression
    # the '.' would match any character (e.g. 'enroll_p').
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]
    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for a feature file.

    The features returned by ``read_MFB`` are cut into consecutive chunks of
    ``test_frames`` frames (the last chunk may be shorter). Every chunk is run
    through ``model``, the first model output is summed over all chunks, and
    the sum is normalised with ``l2_norm(..., 1)``.

    Args:
        use_cuda: Move every chunk to the GPU before the forward pass.
        filename: Feature file passed to ``read_MFB``.
        model: Callable returning ``(activation, other)``; only the first is used.
        test_frames: Number of frames per chunk.

    Returns:
        The normalised sum of activations.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    features, _ = read_MFB(filename)  # per the original notes: (n_frames, n_dims)
    tot_segments = math.ceil(len(features) / test_frames)
    if tot_segments == 0:
        # Previously the accumulator stayed the int 0 and l2_norm crashed on it.
        raise ValueError('no frames found in feature file: {}'.format(filename))

    activation = 0
    with torch.no_grad():
        for seg_idx in range(tot_segments):
            first = seg_idx * test_frames
            segment = ToTensorTestInput()(features[first:first + test_frames])
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)
    return l2_norm(activation, 1)
def l2_norm(input, alpha):
    """Divide every row of ``input`` by its L2 norm, then multiply by ``alpha``.

    The norm is taken over dimension 1, and 1e-10 is added to the squared sum
    before the square root so all-zero rows do not divide by zero. The original
    author's note refers to https://arxiv.org/pdf/1703.09507.pdf, which
    suggests alpha = 10.

    Args:
        input: 2-D tensor (the per-row norm is expanded back to its shape).
        alpha: Scale applied after normalisation.

    Returns:
        A new tensor with the same size as ``input``.
    """
    original_size = input.size()
    row_norm = input.pow(2).sum(1).add_(1e-10).sqrt()
    normalised = input.div(row_norm.view(-1, 1).expand_as(input))
    return normalised.view(original_size) * alpha
def enroll_per_spk(use_cuda, test_frames, model, DB, embedding_dir):
    """
    Accumulate the d-vectors of all enrollment files per speaker and save them.

    The embeddings of one speaker are summed, not averaged. Each sum is written
    to ``<embedding_dir>/<speaker_id>.pth``.

    Args:
        use_cuda, test_frames, model: Forwarded to ``get_embeddings``.
        DB: Table with ``filename`` and ``speaker_id`` columns.
        embedding_dir: Output directory, created when missing.

    Returns:
        Dictionary mapping speaker id to the summed embedding.
    """
    enroll_speaker_list = sorted(set(DB['speaker_id']))
    embeddings = {}

    # Aggregates all the activations
    print("Start to aggregate all the d-vectors per enroll speaker")
    # Iterate over the column values themselves; the former DB[col][i] lookup
    # was label based and failed whenever the index was not exactly 0..n-1.
    for filename, spk in zip(DB['filename'], DB['speaker_id']):
        activation = get_embeddings(use_cuda, filename, model, test_frames)
        if spk in embeddings:
            embeddings[spk] += activation
        else:
            embeddings[spk] = activation
        print("Aggregates the activation (spk : %s)" % (spk))

    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(embedding_dir, exist_ok=True)

    # Save the embeddings
    for spk_index in enroll_speaker_list:
        embedding_path = os.path.join(embedding_dir, spk_index + '.pth')
        torch.save(embeddings[spk_index], embedding_path)
        print("Save the embeddings for %s" % (spk_index))
    return embeddings
def main():
    """Enroll the speakers found under ``c.TEST_FEAT_DIR``.

    Loads checkpoint 24 from ``new_model1`` and writes one embedding file per
    speaker into ``enroll_embeddings1``. All settings are hard-coded here.
    """
    # Locations
    log_dir = 'new_model1'
    embedding_dir = 'enroll_embeddings1'  # Where to save embeddings

    # Settings
    use_cuda = True
    embedding_size = 128
    n_classes = 241
    cp_num = 24  # Which checkpoint to use?
    test_frames = 200

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Only the enrollment half of the split is needed here
    enroll_DB, _ = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Perform the enrollment and save the results
    enroll_per_spk(use_cuda, test_frames, model, enroll_DB, embedding_dir)

    # Test speaker list
    # '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    # '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'


if __name__ == '__main__':
    main()
\ No newline at end of file
......@@ -123,10 +123,10 @@ def main():
"""
spk_list = ['103F3021', '207F2088', '213F5100', '217F3038', '225M4062',\
'229M2031', '230M4087', '233F4013', '236M3043', '240M3063']
'229M2031', '230M4087', '233F4013', '236M3043', '240M3063','777M7777','778M8777']
# Set the test speaker
test_speaker = '230M4087'
test_speaker = '778M8777'
test_path = os.path.join(test_dir, test_speaker, 'test.p')
......@@ -134,4 +134,4 @@ def main():
best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)
if __name__ == '__main__':
main()
\ No newline at end of file
main()
......
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model1 import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: When true the model is moved to the GPU before loading.
        log_dir: Directory holding the ``checkpoint_<cp_num>.pth`` files.
        cp_num: Number of the checkpoint to load.
        embedding_size: Forwarded to ``background_resnet``.
        n_classes: Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model, switched to evaluation mode, with ``checkpoint['state_dict']`` loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()

    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Without map_location a checkpoint written on a GPU cannot be read on a
    # CPU-only run; on the CUDA path the stored device is kept as before.
    map_location = None if use_cuda else 'cpu'
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing of ``dataroot_dir`` into enrollment and test rows.

    Args:
        dataroot_dir: Directory handed to ``read_feats_structure``.

    Returns:
        ``(enroll_DB, test_DB)``: the rows whose ``filename`` contains
        ``'enroll.p'`` and ``'test.p'`` respectively, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']
    # regex=False: the markers are literal file names. As a regular expression
    # the '.' would match any character (e.g. 'enroll_p').
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]
    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
def load_enroll_embeddings(embedding_dir, map_location=None):
    """Load every ``<speaker>.pth`` file of a directory into a dictionary.

    Args:
        embedding_dir: Directory that contains the saved embeddings.
        map_location: Optional device remapping forwarded to ``torch.load``
            (default ``None`` keeps the device stored in the file).

    Returns:
        Dictionary mapping the file name without its ``.pth`` extension to
        the loaded object.
    """
    embeddings = {}
    for entry in sorted(os.listdir(embedding_dir)):
        # splitext removes only the final extension; str.replace('.pth', '')
        # also removed '.pth' from the middle of a speaker name.
        spk, extension = os.path.splitext(entry)
        if extension != '.pth':
            # Stray files (notes, OS metadata) are not embeddings.
            continue
        embedding_path = os.path.join(embedding_dir, entry)
        embeddings[spk] = torch.load(embedding_path, map_location=map_location)
    return embeddings
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for a feature file.

    The features returned by ``read_MFB`` are cut into consecutive chunks of
    ``test_frames`` frames (the last chunk may be shorter). Every chunk is run
    through ``model``, the first model output is summed over all chunks, and
    the sum is normalised with ``l2_norm(..., 1)``.

    Args:
        use_cuda: Move every chunk to the GPU before the forward pass.
        filename: Feature file passed to ``read_MFB``.
        model: Callable returning ``(activation, other)``; only the first is used.
        test_frames: Number of frames per chunk.

    Returns:
        The normalised sum of activations.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    features, _ = read_MFB(filename)  # per the original notes: (n_frames, n_dims)
    tot_segments = math.ceil(len(features) / test_frames)
    if tot_segments == 0:
        # Previously the accumulator stayed the int 0 and l2_norm crashed on it.
        raise ValueError('no frames found in feature file: {}'.format(filename))

    activation = 0
    with torch.no_grad():
        for seg_idx in range(tot_segments):
            first = seg_idx * test_frames
            segment = ToTensorTestInput()(features[first:first + test_frames])
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)
    return l2_norm(activation, 1)
def l2_norm(input, alpha):
    """Divide every row of ``input`` by its L2 norm, then multiply by ``alpha``.

    The norm is taken over dimension 1, and 1e-10 is added to the squared sum
    before the square root so all-zero rows do not divide by zero. The original
    author's note refers to https://arxiv.org/pdf/1703.09507.pdf, which
    suggests alpha = 10.

    Args:
        input: 2-D tensor (the per-row norm is expanded back to its shape).
        alpha: Scale applied after normalisation.

    Returns:
        A new tensor with the same size as ``input``.
    """
    original_size = input.size()
    row_norm = input.pow(2).sum(1).add_(1e-10).sqrt()
    normalised = input.div(row_norm.view(-1, 1).expand_as(input))
    return normalised.view(original_size) * alpha
def perform_identification(use_cuda, model, embeddings, test_filename, test_frames, spk_list):
    """Pick the enrolled speaker whose embedding is closest to a test utterance.

    Args:
        use_cuda, model, test_frames: Forwarded to ``get_embeddings``.
        embeddings: Dictionary mapping speaker id to enrolled embedding.
        test_filename: Feature file of the test utterance. The name of its
            parent directory, up to the first ``_``, is reported as the true speaker.
        spk_list: Candidate speaker ids; each must be a key of ``embeddings``.

    Returns:
        The id with the highest cosine similarity (``None`` for an empty list).
    """
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    max_score = -10**8
    best_spk = None
    for spk in spk_list:
        # .item() gives a plain float instead of comparing 1-element arrays.
        score = F.cosine_similarity(test_embedding, embeddings[spk]).item()
        if score > max_score:
            max_score = score
            best_spk = spk

    # Normalise separators first: os.path.join on Windows inserts '\\', and
    # splitting on '/' alone then picked the wrong path component.
    true_spk = test_filename.replace('\\', '/').split('/')[-2].split('_')[0]
    print("\n=== Speaker identification ===")
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" %(true_spk, best_spk, true_spk==best_spk))
    return best_spk
def main():
    """Identify the hard-coded test speaker among the enrolled speakers.

    Loads checkpoint 30 from ``new_model1``, reads the embeddings saved in
    ``enroll_embeddings1`` and runs identification on
    ``feat_logfbank_nfilt40/test/<test_speaker>/test.p``.
    """
    # Settings
    use_cuda = True          # Use cuda or not
    embedding_size = 128     # Dimension of speaker embeddings
    n_classes = 241          # How many speakers in training data?
    cp_num = 30              # Which checkpoint to use?
    test_frames = 100        # Split the test utterance

    # Locations
    log_dir = 'new_model1'                    # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings1'      # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB (the result is not used further below)
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Candidate speakers: the ten original test speakers plus two added ids
    spk_list = [
        '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
        '229M2031', '230M4087', '233F4013', '236M3043', '240M3063',
        '777M7777', '778M8777',
    ]

    # Set the test speaker
    test_speaker = '213F5100'
    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Perform the test
    best_spk = perform_identification(use_cuda, model, embeddings, test_path, test_frames, spk_list)


if __name__ == '__main__':
    main()
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
import model.resnet as resnet
class background_resnet(nn.Module):
    """Speaker-embedding network: a ResNet trunk followed by two linear layers.

    ``forward`` returns ``(spk_embedding, out)``: the output of ``fc0`` and the
    class scores produced by ``last`` after batch norm and ReLU.

    Args:
        embedding_size: Size of the embedding produced by ``fc0``.
        num_classes: Number of outputs of the final linear layer.
        backbone: One of ``resnet18``, ``resnet34``, ``resnet50``,
            ``resnet101``, ``resnet152``.

    Raises:
        RuntimeError: For any other ``backbone`` name.
    """

    _BACKBONES = ('resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152')

    def __init__(self, embedding_size, num_classes, backbone='resnet18'):
        super(background_resnet, self).__init__()
        self.backbone = backbone
        if backbone not in self._BACKBONES:
            raise RuntimeError('unknown backbone: {}'.format(backbone))
        # The factory functions of the resnet module carry the backbone names.
        self.pretrained = getattr(resnet, backbone)(pretrained=False)

        # Take the trunk width from the trunk's own classifier rather than a
        # literal 128: Bottleneck trunks end with 128 * 4 channels, so the old
        # constant only fitted BasicBlock trunks. For resnet18/34 this is
        # still 128, so existing checkpoints keep loading.
        self.fc0 = nn.Linear(self.pretrained.fc.in_features, embedding_size)
        self.bn0 = nn.BatchNorm1d(embedding_size)
        self.relu = nn.ReLU()
        self.last = nn.Linear(embedding_size, num_classes)

    def forward(self, x):
        # input x: minibatch x 1 x 40 x 40 (per the original author's note)
        x = self.pretrained.conv1(x)
        x = self.pretrained.bn1(x)
        x = self.pretrained.relu(x)

        x = self.pretrained.layer1(x)
        x = self.pretrained.layer2(x)
        x = self.pretrained.layer3(x)
        x = self.pretrained.layer4(x)

        out = F.adaptive_avg_pool2d(x, 1)  # [batch, channels, 1, 1]
        # Flatten for the linear layer. The view keeps the batch dimension
        # even for a batch of one, so the former squeeze was redundant.
        out = out.view(x.size(0), -1)

        spk_embedding = self.fc0(out)
        out = F.relu(self.bn0(spk_embedding))  # [batch, n_embed]
        out = self.last(out)
        return spk_embedding, out
\ No newline at end of file
......@@ -113,6 +113,7 @@ class ResNet(nn.Module):
self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
self.layer4 = self._make_layer(block, 128, layers[3], stride=2)
self.avgpool = nn.AvgPool2d(1, stride=1)
self.fc = nn.Linear(128 * block.expansion, num_classes)
......
"""Imported from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
and added support for the 1x32x32 mel spectrogram for the speech recognition.
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun: Deep Residual Learning for Image Recognition
https://arxiv.org/abs/1512.03385
"""
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
# Names exported by a star-import of this module.
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152']
# Checkpoint URLs handed to model_zoo.load_url by the factory functions below
# when they are called with pretrained=True.
# NOTE(review): these appear to be the stock torchvision checkpoints, while ResNet in this
# file uses narrower layers and a configurable input channel count - confirm they still load.
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_options)
class BasicBlock(nn.Module):
    """Residual block of two 3x3 convolutions, each followed by batch norm.

    The block input, passed through ``downsample`` when one is given, is added
    to the second batch-norm output before the final ReLU.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # Only the first convolution applies the stride.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Shortcut branch: identity unless a projection was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
class Bottleneck(nn.Module):
    """Residual block of a 1x1, a 3x3 and a 1x1 convolution.

    The last convolution widens to ``planes * 4`` channels. The block input,
    passed through ``downsample`` when one is given, is added before the final
    ReLU.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        widened = planes * 4
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        # The stride is applied by the 3x3 convolution.
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Shortcut branch: identity unless a projection was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
class ResNet(nn.Module):
    """ResNet variant with 16/32/64/128-channel stages and a stride-1 stem.

    Args:
        block: Residual block class exposing an ``expansion`` attribute and the
            constructor ``block(inplanes, planes, stride, downsample)``.
        layers: Number of blocks per stage; entries 0 to 3 are read.
        num_classes: Output size of the final linear layer.
        in_channels: Number of channels of the input.
    """

    def __init__(self, block, layers, num_classes=1000, in_channels=1):
        super(ResNet, self).__init__()
        # Running channel count, updated by _make_layer after every stage.
        self.inplanes = 16
        self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=7, stride=1, padding=3,
                               bias=False)  # ori : stride = 2
        self.bn1 = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 16, layers[0])
        self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 128, layers[3], stride=2)
        # NOTE(review): layer5 reuses layers[3] and is never called in forward.
        # It is left untouched so the state_dict keys of saved checkpoints
        # stay the same - confirm before removing it.
        self.layer5 = self._make_layer(block, 256, layers[3], stride=2)
        # Adaptive pooling collapses whatever spatial size layer4 produces.
        # The former AvgPool2d(1, stride=1) was an identity, so the flatten in
        # forward matched `fc` only for 1x1 feature maps. Pooling has no
        # parameters, so checkpoints are unaffected.
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(128 * block.expansion, num_classes)

        # Weight initialisation for every convolution and 2-D batch norm.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack ``blocks`` residual blocks; only the first may change stride or width."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # Projection shortcut so the residual matches the new shape.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x
def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 style model from ``BasicBlock`` with layers [2, 2, 2, 2, 2].

    Args:
        pretrained (bool): If True, the weights behind ``model_urls['resnet18']``
            are downloaded and loaded into the model.
        **kwargs: Forwarded to ``ResNet``.
    """
    net = ResNet(BasicBlock, [2, 2, 2, 2, 2], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return net
def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 style model from ``BasicBlock`` with layers [3, 4, 6, 3, 3].

    Args:
        pretrained (bool): If True, the weights behind ``model_urls['resnet34']``
            are downloaded and loaded into the model.
        **kwargs: Forwarded to ``ResNet``.
    """
    net = ResNet(BasicBlock, [3, 4, 6, 3, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    return net
def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 style model from ``Bottleneck`` with layers [3, 4, 6, 3].

    Args:
        pretrained (bool): If True, the weights behind ``model_urls['resnet50']``
            are downloaded and loaded into the model.
        **kwargs: Forwarded to ``ResNet``.
    """
    net = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return net
def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 style model from ``Bottleneck`` with layers [3, 4, 23, 3].

    Args:
        pretrained (bool): If True, the weights behind ``model_urls['resnet101']``
            are downloaded and loaded into the model.
        **kwargs: Forwarded to ``ResNet``.
    """
    net = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return net
def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 style model from ``Bottleneck`` with layers [3, 8, 36, 3].

    Args:
        pretrained (bool): If True, the weights behind ``model_urls['resnet152']``
            are downloaded and loaded into the model.
        **kwargs: Forwarded to ``ResNet``.
    """
    net = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return net
\ No newline at end of file
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import time
import os
import numpy as np
import configure as c
import pandas as pd
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, TruncatedInputfromMFB, ToTensorInput, ToTensorDevInput, DvectorDataset, collate_fn_feat_padded
from model.model1 import background_resnet
import matplotlib.pyplot as plt
import pandas as pd
def load_dataset(val_ratio):
    """Create the training and validation datasets from ``c.TRAIN_FEAT_DIR``.

    The feature listing is split by ``split_train_dev`` according to
    ``val_ratio``. Speaker ids of the training split are sorted and numbered
    to form the class indices shared by both datasets.

    Args:
        val_ratio: Validation share, forwarded to ``split_train_dev``.

    Returns:
        ``(train_dataset, valid_dataset, n_classes)`` where ``n_classes`` is
        the number of distinct speakers in the training split.
    """
    train_DB, valid_DB = split_train_dev(c.TRAIN_FEAT_DIR, val_ratio)

    # Training inputs are truncated, then converted to tensors. Validation
    # inputs are only converted. Per the original notes the loader yields
    # numpy arrays of shape (n_frames, n_dims).
    train_transform = transforms.Compose([TruncatedInputfromMFB(), ToTensorInput()])
    valid_transform = ToTensorDevInput()

    speaker_list = sorted(set(train_DB['speaker_id']))
    spk_to_idx = dict((spk, idx) for idx, spk in enumerate(speaker_list))

    train_dataset = DvectorDataset(DB=train_DB, loader=read_MFB,
                                   transform=train_transform, spk_to_idx=spk_to_idx)
    valid_dataset = DvectorDataset(DB=valid_DB, loader=read_MFB,
                                   transform=valid_transform, spk_to_idx=spk_to_idx)
    return train_dataset, valid_dataset, len(speaker_list)
def split_train_dev(train_feat_dir, valid_ratio, seed=None):
    """Shuffle the feature listing and split it into training and validation parts.

    Args:
        train_feat_dir: Directory handed to ``read_feats_structure``.
        valid_ratio: Validation share in percent (e.g. ``10`` for 10 %).
        seed: Optional random state for the shuffle. ``None`` (the default)
            keeps the previous non-deterministic behaviour.

    Returns:
        ``(train_DB, valid_DB)``, both re-indexed from 0.

    Raises:
        ValueError: If no feature files are found.
    """
    train_valid_DB = read_feats_structure(train_feat_dir)
    total_len = len(train_valid_DB)
    if total_len == 0:
        # The percentage report below would otherwise divide by zero.
        raise ValueError('no feature files found under: {}'.format(train_feat_dir))

    valid_len = int(total_len * valid_ratio/100.)
    train_len = total_len - valid_len
    shuffled_train_valid_DB = train_valid_DB.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Split the DB into train and valid set, then reset the index
    train_DB = shuffled_train_valid_DB.iloc[:train_len].reset_index(drop=True)
    valid_DB = shuffled_train_valid_DB.iloc[train_len:].reset_index(drop=True)

    print('\nTraining set %d utts (%0.1f%%)' %(train_len, (train_len/total_len)*100))
    print('Validation set %d utts (%0.1f%%)' %(valid_len, (valid_len/total_len)*100))
    print('Total %d utts' %(total_len))
    return train_DB, valid_DB
def main():
    """Train ``background_resnet`` on ``c.TRAIN_FEAT_DIR`` and checkpoint every epoch.

    All hyperparameters are hard-coded below. After each epoch the model and
    optimizer state are written to ``new_model1/checkpoint_<epoch>.pth``. At
    the end the per-epoch losses are handed to ``visualize_the_losses``.
    """
    # Set hyperparameters
    use_cuda = True # use gpu or cpu
    val_ratio = 10 # Percentage of validation set
    embedding_size = 128
    start = 1 # Start epoch
    n_epochs = 30 # How many epochs?
    end = start + n_epochs # Last epoch

    lr = 1e-1 # Initial learning rate
    wd = 1e-4 # Weight decay (L2 penalty)
    optimizer_type = 'sgd' # ex) sgd, adam, adagrad

    batch_size = 64 # Batch size for training
    valid_batch_size = 16 # Batch size for validation
    use_shuffle = True # Shuffle for training or not

    # Load dataset
    train_dataset, valid_dataset, n_classes = load_dataset(val_ratio)

    # print the experiment configuration
    print('\nNumber of classes (speakers):\n{}\n'.format(n_classes))

    log_dir = 'new_model1' # where to save checkpoints
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(log_dir, exist_ok=True)

    # instantiate model and initialize weights
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()

    # define loss function (criterion), optimizer and scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = create_optimizer(optimizer_type, model, lr, wd)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, min_lr=1e-4, verbose=1)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=use_shuffle)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                               batch_size=valid_batch_size,
                                               shuffle=False,
                                               collate_fn=collate_fn_feat_padded)

    # to track the average training loss per epoch as the model trains
    avg_train_losses = []
    # to track the average validation loss per epoch as the model trains
    avg_valid_losses = []

    for epoch in range(start, end):
        # train for one epoch
        train_loss = train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes)

        # evaluate on validation set
        valid_loss = validate(valid_loader, model, criterion, use_cuda, epoch)

        # Only the metric is passed: the extra `epoch` argument of step() is
        # deprecated, and the scheduler counts epochs itself.
        scheduler.step(valid_loss)

        # calculate average loss over an epoch
        avg_train_losses.append(train_loss)
        avg_valid_losses.append(valid_loss)

        # do checkpointing
        torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()},
                   '{}/checkpoint_{}.pth'.format(log_dir, epoch))

    # find position of lowest validation loss, offset by the first epoch number
    # so the report stays right when `start` is not 1
    minposs = start + avg_valid_losses.index(min(avg_valid_losses))
    print('Lowest validation loss at epoch %d' %minposs)

    # visualize the loss and learning rate as the network trained
    visualize_the_losses(avg_train_losses, avg_valid_losses)
def train(train_loader, model, criterion, optimizer, use_cuda, epoch, n_classes):
    """Run one training epoch and return the average loss.

    Args:
        train_loader: Iterable yielding ``(inputs, targets)`` batches.
        model: Network returning ``(embedding, class_scores)``.
        criterion: Loss applied to ``(class_scores, targets)``.
        optimizer: Optimizer stepped once per batch.
        use_cuda: Move every batch to the GPU.
        epoch: Epoch number, used only in the progress line.
        n_classes: Not used inside this function.

    Returns:
        Sample-weighted average of the batch losses.
    """
    log_interval = 84
    batch_time, losses, train_acc = AverageMeter(), AverageMeter(), AverageMeter()
    n_correct = n_total = 0

    # switch to train mode
    model.train()

    end = time.time()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        targets = targets.view(-1)  # flatten the targets to one entry per sample
        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        n_samples = inputs.size(0)

        _, output = model(inputs)  # class scores, for softmax

        # The meter is fed the running accuracy over all batches seen so far.
        _, predicted = torch.max(output, 1)
        n_correct += (predicted.long().view(targets.size()) == targets).sum().item()
        n_total += n_samples
        train_acc.update(100. * n_correct / n_total, n_samples)

        loss = criterion(output, targets)
        losses.update(loss.item(), n_samples)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        now = time.time()
        batch_time.update(now - end)
        end = time.time()

        if batch_idx % log_interval == 0:
            message = ('Train Epoch: {:3d} [{:8d}/{:8d} ({:3.0f}%)]\t'
                       'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                       'Loss {loss.avg:.4f}\t'
                       'Acc {train_acc.avg:.4f}')
            print(message.format(
                epoch, batch_idx * len(inputs), len(train_loader.dataset),
                100. * batch_idx / len(train_loader),
                batch_time=batch_time, loss=losses, train_acc=train_acc))
    return losses.avg
def validate(val_loader, model, criterion, use_cuda, epoch):
    """Evaluate the model on the validation set and return the average loss.

    Args:
        val_loader: Iterable yielding ``(inputs, targets)`` batches.
        model: Network returning ``(embedding, class_scores)``.
        criterion: Loss applied to ``(class_scores, targets)``.
        use_cuda: Move every batch to the GPU.
        epoch: Not used inside this function.

    Returns:
        Sample-weighted average of the batch losses.
    """
    batch_time, losses, val_acc = AverageMeter(), AverageMeter(), AverageMeter()
    n_correct = n_total = 0

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (inputs, targets) in enumerate(val_loader):
            if use_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()
            n_samples = inputs.size(0)

            # compute output
            _, output = model(inputs)

            # The meter is fed the running accuracy over all batches seen so far.
            _, predicted = torch.max(output, 1)
            n_correct += (predicted.long().view(targets.size()) == targets).sum().item()
            n_total += n_samples
            val_acc.update(100. * n_correct / n_total, n_samples)

            loss = criterion(output, targets)
            losses.update(loss.item(), n_samples)

            # measure elapsed time
            now = time.time()
            batch_time.update(now - end)
            end = time.time()

        summary = (' * Validation: '
                   'Loss {loss.avg:.4f}\t'
                   'Acc {val_acc.avg:.4f}')
        print(summary.format(loss=losses, val_acc=val_acc))
    return losses.avg
class AverageMeter(object):
    """Keeps the latest value together with a weighted running sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
def create_optimizer(optimizer, model, new_lr, wd):
    """Create the optimizer named by ``optimizer`` for the parameters of ``model``.

    Args:
        optimizer: ``'sgd'`` (momentum 0.9, dampening 0), ``'adam'`` or ``'adagrad'``.
        model: Module whose ``parameters()`` are optimised.
        new_lr: Learning rate.
        wd: Weight decay.

    Returns:
        The configured ``torch.optim`` optimizer.

    Raises:
        ValueError: For an unknown optimizer name. Previously the name string
            itself was returned and the failure surfaced later, in the scheduler.
    """
    # setup optimizer
    if optimizer == 'sgd':
        return optim.SGD(model.parameters(), lr=new_lr,
                         momentum=0.9, dampening=0,
                         weight_decay=wd)
    if optimizer == 'adam':
        return optim.Adam(model.parameters(), lr=new_lr,
                          weight_decay=wd)
    if optimizer == 'adagrad':
        return optim.Adagrad(model.parameters(),
                             lr=new_lr,
                             weight_decay=wd)
    raise ValueError("unknown optimizer '{}'; expected 'sgd', 'adam' or 'adagrad'".format(optimizer))
def visualize_the_losses(train_loss, valid_loss, out_path="file.txt"):
    """Write the epoch numbers and both loss histories to a text file.

    Three lines are written: the epoch list ``[1, ..., len(train_loss)]``,
    ``str(train_loss)`` and ``str(valid_loss)``. The plotting code is kept
    commented out, as in the original.

    Args:
        train_loss: Per-epoch training losses.
        valid_loss: Per-epoch validation losses.
        out_path: Destination file (default ``"file.txt"``, the former fixed name).
    """
    # The epoch axis follows the recorded losses. It used to be a fixed
    # 1..30 range regardless of how many epochs were run.
    epoch = list(range(1, len(train_loss) + 1))

    with open(out_path, "w") as output:
        output.write(str(epoch))
        output.write('\n')
        output.write(str(train_loss))
        output.write('\n')
        output.write(str(valid_loss))

    # fig = plt.figure(figsize=(10,8))
    # plt.plot(range(1,len(train_loss)+1),train_loss, label='Training Loss')
    # plt.plot(range(1,len(valid_loss)+1),valid_loss, label='Validation Loss')
    # find position of lowest validation loss
    # minposs = valid_loss.index(min(valid_loss))+1
    # plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint')
    # plt.xlabel('epochs')
    # plt.ylabel('loss')
    # plt.ylim(0, 3.5) # consistent scale
    # plt.xlim(0, len(train_loss)+1) # consistent scale
    # plt.grid(True)
    # plt.legend()
    # plt.tight_layout()
    #plt.show()
    # fig.savefig('loss_plot.png', bbox_inches='tight')
if __name__ == '__main__':
main()
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd
import math
import os
import configure as c
from DB_wav_reader import read_feats_structure
from SR_Dataset import read_MFB, ToTensorTestInput
from model.model1 import background_resnet
def load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes):
    """Build a ``background_resnet`` and restore its weights from a checkpoint.

    Args:
        use_cuda: When true the model is moved to the GPU before loading.
        log_dir: Directory holding the ``checkpoint_<cp_num>.pth`` files.
        cp_num: Number of the checkpoint to load.
        embedding_size: Forwarded to ``background_resnet``.
        n_classes: Forwarded to ``background_resnet`` as ``num_classes``.

    Returns:
        The model, switched to evaluation mode, with ``checkpoint['state_dict']`` loaded.
    """
    model = background_resnet(embedding_size=embedding_size, num_classes=n_classes)
    if use_cuda:
        model.cuda()

    print('=> loading checkpoint')
    checkpoint_path = os.path.join(log_dir, 'checkpoint_' + str(cp_num) + '.pth')
    # Without map_location a checkpoint written on a GPU cannot be read on a
    # CPU-only run; on the CUDA path the stored device is kept as before.
    map_location = None if use_cuda else 'cpu'
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model
def split_enroll_and_test(dataroot_dir):
    """Split the feature listing of ``dataroot_dir`` into enrollment and test rows.

    Args:
        dataroot_dir: Directory handed to ``read_feats_structure``.

    Returns:
        ``(enroll_DB, test_DB)``: the rows whose ``filename`` contains
        ``'enroll.p'`` and ``'test.p'`` respectively, each re-indexed from 0.
    """
    DB_all = read_feats_structure(dataroot_dir)
    filenames = DB_all['filename']
    # regex=False: the markers are literal file names. As a regular expression
    # the '.' would match any character (e.g. 'enroll_p').
    enroll_DB = DB_all[filenames.str.contains('enroll.p', regex=False)]
    test_DB = DB_all[filenames.str.contains('test.p', regex=False)]
    return enroll_DB.reset_index(drop=True), test_DB.reset_index(drop=True)
def load_enroll_embeddings(embedding_dir, map_location=None):
    """Load every ``<speaker>.pth`` file of a directory into a dictionary.

    Args:
        embedding_dir: Directory that contains the saved embeddings.
        map_location: Optional device remapping forwarded to ``torch.load``
            (default ``None`` keeps the device stored in the file).

    Returns:
        Dictionary mapping the file name without its ``.pth`` extension to
        the loaded object.
    """
    embeddings = {}
    for entry in sorted(os.listdir(embedding_dir)):
        # splitext removes only the final extension; str.replace('.pth', '')
        # also removed '.pth' from the middle of a speaker name.
        spk, extension = os.path.splitext(entry)
        if extension != '.pth':
            # Stray files (notes, OS metadata) are not embeddings.
            continue
        embedding_path = os.path.join(embedding_dir, entry)
        embeddings[spk] = torch.load(embedding_path, map_location=map_location)
    return embeddings
def get_embeddings(use_cuda, filename, model, test_frames):
    """Compute one L2-normalised embedding for a feature file.

    The features returned by ``read_MFB`` are cut into consecutive chunks of
    ``test_frames`` frames (the last chunk may be shorter). Every chunk is run
    through ``model``, the first model output is summed over all chunks, and
    the sum is normalised with ``l2_norm(..., 1)``.

    Args:
        use_cuda: Move every chunk to the GPU before the forward pass.
        filename: Feature file passed to ``read_MFB``.
        model: Callable returning ``(activation, other)``; only the first is used.
        test_frames: Number of frames per chunk.

    Returns:
        The normalised sum of activations.

    Raises:
        ValueError: If the feature file contains no frames.
    """
    features, _ = read_MFB(filename)  # per the original notes: (n_frames, n_dims)
    tot_segments = math.ceil(len(features) / test_frames)
    if tot_segments == 0:
        # Previously the accumulator stayed the int 0 and l2_norm crashed on it.
        raise ValueError('no frames found in feature file: {}'.format(filename))

    activation = 0
    with torch.no_grad():
        for seg_idx in range(tot_segments):
            first = seg_idx * test_frames
            segment = ToTensorTestInput()(features[first:first + test_frames])
            if use_cuda:
                segment = segment.cuda()
            segment_activation, _ = model(segment)
            activation += torch.sum(segment_activation, dim=0, keepdim=True)
    return l2_norm(activation, 1)
def l2_norm(input, alpha):
    """Divide every row of ``input`` by its L2 norm, then multiply by ``alpha``.

    The norm is taken over dimension 1, and 1e-10 is added to the squared sum
    before the square root so all-zero rows do not divide by zero. The original
    author's note refers to https://arxiv.org/pdf/1703.09507.pdf, which
    suggests alpha = 10.

    Args:
        input: 2-D tensor (the per-row norm is expanded back to its shape).
        alpha: Scale applied after normalisation.

    Returns:
        A new tensor with the same size as ``input``.
    """
    original_size = input.size()
    row_norm = input.pow(2).sum(1).add_(1e-10).sqrt()
    normalised = input.div(row_norm.view(-1, 1).expand_as(input))
    return normalised.view(original_size) * alpha
def perform_verification(use_cuda, model, embeddings, enroll_speaker, test_filename, test_frames, thres):
    """Compare a test utterance with one enrolled speaker and print the decision.

    The cosine similarity between the test embedding and
    ``embeddings[enroll_speaker]`` is compared with ``thres``: a score strictly
    above it prints ``Accept``, anything else ``Reject``.

    Args:
        use_cuda, model, test_frames: Forwarded to ``get_embeddings``.
        embeddings: Dictionary mapping speaker id to enrolled embedding.
        enroll_speaker: Key of the enrolled embedding to compare against.
        test_filename: Feature file of the test utterance. The name of its
            parent directory, up to the first ``_``, is printed as the test speaker.
        thres: Decision threshold on the cosine similarity.
    """
    enroll_embedding = embeddings[enroll_speaker]
    test_embedding = get_embeddings(use_cuda, test_filename, model, test_frames)

    # .item() yields a plain float; formatting a 1-element array with %f
    # relies on an implicit array-to-scalar conversion that NumPy deprecates.
    score = F.cosine_similarity(test_embedding, enroll_embedding).item()
    result = 'Accept' if score > thres else 'Reject'

    # Normalise separators first: os.path.join on Windows inserts '\\', and
    # splitting on '/' alone then picked the wrong path component.
    test_spk = test_filename.replace('\\', '/').split('/')[-2].split('_')[0]
    print("\n=== Speaker verification ===")
    print("True speaker: %s\nClaimed speaker : %s\n\nResult : %s\n" %(enroll_speaker, test_spk, result))
    print("Score : %0.4f\nThreshold : %0.2f\n" %(score, thres))
def main():
    """Verify the hard-coded claimed speaker against the hard-coded enrolled speaker.

    Loads checkpoint 29 from ``new_model1``, reads the embeddings saved in
    ``enroll_embeddings1`` and scores
    ``feat_logfbank_nfilt40/test/<test_speaker>/test.p`` against the enrolled
    embedding with a threshold of 0.95.
    """
    # Settings
    use_cuda = True          # Use cuda or not
    embedding_size = 128     # Dimension of speaker embeddings
    n_classes = 241          # How many speakers in training data?
    cp_num = 29              # Which checkpoint to use?
    test_frames = 100        # Split the test utterance

    # Locations
    log_dir = 'new_model1'                    # Where the checkpoints are saved
    embedding_dir = 'enroll_embeddings1'      # Where embeddings are saved
    test_dir = 'feat_logfbank_nfilt40/test/'  # Where test features are saved

    # Load model from checkpoint
    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)

    # Get the dataframe for test DB (the result is not used further below)
    enroll_DB, test_DB = split_enroll_and_test(c.TEST_FEAT_DIR)

    # Load enroll embeddings
    embeddings = load_enroll_embeddings(embedding_dir)

    # Test speaker list
    # '103F3021', '207F2088', '213F5100', '217F3038', '225M4062',
    # '229M2031', '230M4087', '233F4013', '236M3043', '240M3063'

    # Set the true speaker
    enroll_speaker = 'zerothfloac'

    # Set the claimed speaker
    test_speaker = 'zerothfloac'
    test_path = os.path.join(test_dir, test_speaker, 'test.p')

    # Threshold
    thres = 0.95

    # Perform the test
    perform_verification(use_cuda, model, embeddings, enroll_speaker, test_path, test_frames, thres)


if __name__ == '__main__':
    main()