heeseon cheon

add code

# masknn.py: Conv2d whose weights are element-wise masked for pruning
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
class Masker(torch.autograd.Function):
    """Straight-through mask: zeroes weights in forward, passes gradients through."""
    @staticmethod
    def forward(ctx, x, mask):
        return x * mask

    @staticmethod
    def backward(ctx, grad):
        # Gradient flows to x unchanged; the mask itself gets no gradient.
        return grad, None


class MaskConv2d(nn.Conv2d):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros'):
        super(MaskConv2d, self).__init__(in_channels, out_channels, kernel_size, stride,
                                         padding, dilation, groups, bias, padding_mode)
        # Binary (0/1) mask with the same shape as the weight; not trained.
        self.mask = Parameter(torch.ones(self.weight.size()), requires_grad=False)

    def forward(self, inputs):
        masked_weight = Masker.apply(self.weight, self.mask)
        # Newer PyTorch expects bias as the third argument of _conv_forward;
        # very old versions took only (input, weight).
        return self._conv_forward(inputs, masked_weight, self.bias)
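

# A minimal sanity check (illustrative, not part of the original file): the
# straight-through Masker zeroes masked weights in the forward pass, while the
# backward pass returns the incoming gradient unchanged, so weights that are
# currently masked still receive gradient and can recover if later unmasked.
if __name__ == '__main__':
    conv = MaskConv2d(3, 8, kernel_size=3, padding=1, bias=False)
    conv.mask.data[0].zero_()                                # "prune" the first filter
    out = conv(torch.randn(1, 3, 8, 8))
    out.sum().backward()
    print((conv.weight * conv.mask)[0].abs().sum().item())   # 0.0: filter inactive
    print(conv.weight.grad[0].abs().sum().item() > 0)        # True: gradient still flows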
# modeling.py: CIFAR-10 training entry point (see run.sh for invocations)
import time
import random
import pathlib
from os.path import isfile
import copy
import sys
import numpy as np
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
import torchvision
import torchvision.transforms as transforms
from resnet_mask import *
from utils import *
def main(args):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    torch.manual_seed(777)
    if device == 'cuda':
        torch.cuda.manual_seed_all(777)

    ## args
    layers = int(args.layers)
    prune_type = args.prune_type
    prune_rate = float(args.prune_rate)
    prune_imp = args.prune_imp
    reg = args.reg
    epochs = int(args.epochs)
    batch_size = int(args.batch_size)
    lr = float(args.lr)
    momentum = float(args.momentum)
    wd = float(args.wd)
    odecay = float(args.odecay)

    if prune_type:
        prune = {'type': prune_type, 'rate': prune_rate}
    else:
        prune = None
    if reg == 'reg_cov':
        reg = reg_cov
    cfgs = {
        '18': (BasicBlock, [2, 2, 2, 2]),
        '34': (BasicBlock, [3, 4, 6, 3]),
        '50': (Bottleneck, [3, 4, 6, 3]),
        '101': (Bottleneck, [3, 4, 23, 3]),
        '152': (Bottleneck, [3, 8, 36, 3]),
    }
    cfgs_cifar = {
        '20': [3, 3, 3],
        '32': [5, 5, 5],
        '44': [7, 7, 7],
        '56': [9, 9, 9],
        '110': [18, 18, 18],
    }

    train_data_mean = (0.5, 0.5, 0.5)
    train_data_std = (0.5, 0.5, 0.5)
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.ToTensor(),
        transforms.Normalize(train_data_mean, train_data_std)
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(train_data_mean, train_data_std)
    ])

    # Use the --batch_size argument rather than a hard-coded loader batch size.
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=4)
    testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=4)
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

    # Build the CIFAR ResNet for the requested depth (--layers, default 56).
    model = ResNet_CIFAR(BasicBlock, cfgs_cifar[str(layers)], 10).to(device)
    image_size = 32

    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=wd)  # nesterov=args.nesterov)
    lr_sche = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
    ##### Training loop, written by referring to the reference main():
    ##### train, validate, and track the best top-1 accuracy.
    best_acc1 = 0.0
    print('prune rate', prune_rate, 'regularization odecay', odecay)
    for epoch in range(epochs):
        acc1_train_cor, acc5_train_cor = train(trainloader, epoch=epoch, model=model,
                                               criterion=criterion, optimizer=optimizer,
                                               prune=prune, reg=reg, odecay=odecay)
        acc1_valid_cor, acc5_valid_cor = validate(testloader, epoch=epoch, model=model, criterion=criterion)
        lr_sche.step()  # step the LR schedule once per epoch

        acc1_train = round(acc1_train_cor.item(), 4)
        acc5_train = round(acc5_train_cor.item(), 4)
        acc1_valid = round(acc1_valid_cor.item(), 4)
        acc5_valid = round(acc5_valid_cor.item(), 4)

        # remember best Acc@1 and save checkpoint and summary csv file
        # summary = [epoch, acc1_train, acc5_train, acc1_valid, acc5_valid]
        is_best = acc1_valid > best_acc1
        best_acc1 = max(acc1_valid, best_acc1)
        if is_best:
            summary = [epoch, acc1_train, acc5_train, acc1_valid, acc5_valid]
            print(summary)
            # save_model(arch_name, args.dataset, state, args.save)
            # save_summary(arch_name, args.dataset, args.save.split('.pth')[0], summary)
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description="")
    parser.add_argument('--layers', default=56)
    parser.add_argument('--prune_type', default=None, help='None / structured / unstructured')
    parser.add_argument('--prune_rate', default=0.9)
    parser.add_argument('--prune_imp', default='L2')
    parser.add_argument('--reg', default=None, help='None / reg_cov')
    parser.add_argument('--epochs', default=300)
    parser.add_argument('--batch_size', default=128)
    parser.add_argument('--lr', default=0.2)
    parser.add_argument('--momentum', default=0.9)
    parser.add_argument('--wd', default=1e-4)
    parser.add_argument('--odecay', default=1)
    args = parser.parse_args()
    main(args)
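    # Example invocations (mirroring run.sh; illustrative only):
    #   python modeling.py --prune_type structured --prune_rate 0.6 --reg reg_cov --odecay 0.9
    #   python modeling.py --prune_type unstructured --prune_rate 0.9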
# resnet_mask.py: ResNet variants built from MaskConv2d (adapted from torchvision)
import torch
import torch.nn as nn
import masknn as mnn


def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return mnn.MaskConv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                          padding=dilation, groups=groups, bias=False, dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return mnn.MaskConv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
class BasicBlock(nn.Module):
    expansion = 1
    __constants__ = ['downsample']

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(BasicBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out
class Bottleneck(nn.Module):
    expansion = 4
    __constants__ = ['downsample']

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(Bottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out
class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(ResNet, self).__init__()
        self.block_name = str(block.__name__)
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = mnn.MaskConv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                                    bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, mnn.MaskConv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        # See note [TorchScript super()]
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

    def forward(self, x):
        return self._forward_impl(x)
class ResNet_CIFAR(nn.Module):
    def __init__(self, block, layers, num_classes=10, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(ResNet_CIFAR, self).__init__()
        self.block_name = str(block.__name__)
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 16
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = mnn.MaskConv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1,
                                    bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(block, 16, layers[0])
        self.layer2 = self._make_layer(block, 32, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 64, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(64 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, mnn.MaskConv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        # See note [TorchScript super()]
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

    def forward(self, x):
        return self._forward_impl(x)
# Model configurations
cfgs = {
    '18': (BasicBlock, [2, 2, 2, 2]),
    '34': (BasicBlock, [3, 4, 6, 3]),
    '50': (Bottleneck, [3, 4, 6, 3]),
    '101': (Bottleneck, [3, 4, 23, 3]),
    '152': (Bottleneck, [3, 8, 36, 3]),
}
cfgs_cifar = {
    '20': [3, 3, 3],
    '32': [5, 5, 5],
    '44': [7, 7, 7],
    '56': [9, 9, 9],
    '110': [18, 18, 18],
}


def resnet(data='cifar10', **kwargs):
    r"""ResNet models from "[Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)"

    Args:
        data (str): the name of the dataset
    """
    num_layers = str(kwargs.get('num_layers'))

    # set pruner: the conv helpers above resolve the module-level `mnn`,
    # so the caller-supplied pruning module is installed globally here.
    global mnn
    mnn = kwargs.get('mnn')
    assert mnn is not None, "Please specify proper pruning method"

    if data in ['cifar10', 'cifar100']:
        if num_layers in cfgs_cifar.keys():
            # 'cifar10'[5:] == '10', 'cifar100'[5:] == '100'
            model = ResNet_CIFAR(BasicBlock, cfgs_cifar[num_layers], int(data[5:]))
        else:
            model = None
        image_size = 32
    elif data == 'imagenet':
        if num_layers in cfgs.keys():
            block, layers = cfgs[num_layers]
            model = ResNet(block, layers, 1000)
        else:
            model = None
        image_size = 224
    else:
        model = None
        image_size = None

    return model, image_size
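
# Usage sketch (not in the original file): the factory expects the pruning
# module to be passed through the `mnn` keyword, which it installs as the
# module-level global used by conv1x1/conv3x3 above.
if __name__ == '__main__':
    import masknn
    model, image_size = resnet(data='cifar10', num_layers=56, mnn=masknn)
    x = torch.randn(2, 3, image_size, image_size)
    print(model(x).shape)  # torch.Size([2, 10])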
#!/bin/bash
RESULT_DIR=result_201203
if [ ! -d "$RESULT_DIR" ]; then
    mkdir "$RESULT_DIR"
fi
#python modeling_default.py > $RESULT_DIR/default.txt #&
#python modeling_pruning.py > $RESULT_DIR/pruning_prune90.txt &
#python modeling_decorrelation.py > $RESULT_DIR/decorrelation_lambda1.txt &
#python modeling_pruning+decorrelation.py > $RESULT_DIR/pruning+decorrelation_lambda1+prune90.txt
#python modeling.py --prune_type structured --prune_rate 0.5 > $RESULT_DIR/prune_05.txt
#python modeling.py --prune_type structured --prune_rate 0.6 > $RESULT_DIR/prune_06.txt
#python modeling.py --prune_type structured --prune_rate 0.8 > $RESULT_DIR/prune_08.txt &
#python modeling.py --prune_type structured --prune_rate 0.7 > $RESULT_DIR/prune_07.txt
#python modeling.py --reg reg_cov --odecay 0.9 > $RESULT_DIR/reg_9.txt
#python modeling.py --reg reg_cov --odecay 0.8 > $RESULT_DIR/reg_8.txt
#python modeling.py --reg reg_cov --odecay 0.7 > $RESULT_DIR/reg_7.txt
#python modeling.py --reg reg_cov --odecay 0.6 > $RESULT_DIR/reg_6.txt
#python modeling.py --reg reg_cov --odecay 0.5 > $RESULT_DIR/reg_5.txt
#python modeling.py --prune_type structured --prune_rate 0.5 --reg reg_cov --odecay 0.7 > $RESULT_DIR/prune_05_reg_07.txt &
#python modeling.py --prune_type structured --prune_rate 0.5 --reg reg_cov --odecay 0.8 > $RESULT_DIR/prune_05_reg_08.txt &
#python modeling.py --prune_type structured --prune_rate 0.5 --reg reg_cov --odecay 0.9 > $RESULT_DIR/prune_05_reg_09.txt &
#python modeling.py --prune_type structured --prune_rate 0.6 --reg reg_cov --odecay 0.7 > $RESULT_DIR/prune_06_reg_07.txt &
#python modeling.py --prune_type structured --prune_rate 0.6 --reg reg_cov --odecay 0.8 > $RESULT_DIR/prune_06_reg_08.txt &
python modeling.py --prune_type structured --prune_rate 0.6 --reg reg_cov --odecay 0.9 > $RESULT_DIR/prune_06_reg_09.txt
# utils.py: pruning, regularization, and train/validate helpers
import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

import masknn
import resnet_mask
def get_weight_threshold(model, rate, prune_imp='L1'):
    """Global importance threshold over all 4-D (conv) weights."""
    importance_all = None
    for name, item in model.named_parameters():
        if len(item.size()) == 4 and 'mask' not in name:
            weights = item.data.view(-1).cpu()
            if prune_imp == 'L1':
                importance = weights.abs().numpy()
            elif prune_imp == 'L2':
                importance = weights.pow(2).numpy()
            elif prune_imp == 'grad':
                # gradient-based importance requires a preceding backward pass
                grads = item.grad.data.view(-1).cpu()
                importance = grads.abs().numpy()
            elif prune_imp == 'syn':
                grads = item.grad.data.view(-1).cpu()
                importance = (weights * grads).abs().numpy()

            if importance_all is None:
                importance_all = importance
            else:
                importance_all = np.append(importance_all, importance)

    threshold = np.sort(importance_all)[int(len(importance_all) * rate)]
    return threshold
def weight_prune(model, threshold, prune_imp='L1'):
    """Write binary masks: keep weights whose importance exceeds the threshold."""
    state = model.state_dict()
    for name, item in model.named_parameters():
        if 'weight' in name:
            key = name.replace('weight', 'mask')
            if key in state.keys():
                if prune_imp == 'L1':
                    mat = item.data.abs()
                elif prune_imp == 'L2':
                    mat = item.data.pow(2)
                elif prune_imp == 'grad':
                    mat = item.grad.data.abs()
                elif prune_imp == 'syn':
                    mat = (item.data * item.grad.data).abs()
                state[key].data.copy_(torch.gt(mat, threshold).float())
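

# Illustrative helper (hypothetical, not in the original code): the
# unstructured path computes one global threshold across all conv weights,
# then writes the binary masks, so roughly `rate` of the weights are zeroed
# in every subsequent forward pass. With 'grad'/'syn' importance a backward
# pass must run first; 'L1' works on a fresh model. cal_sparsity (defined
# below in this file) verifies the result.
def _demo_unstructured_prune(model, rate=0.5):
    threshold = get_weight_threshold(model, rate, prune_imp='L1')
    weight_prune(model, threshold, prune_imp='L1')
    return cal_sparsity(model)  # (total weights, zeroed weights, sparsity %)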
def get_filter_mask(model, rate, prune_imp='L1'):
    """Per-filter keep/drop decisions for structured pruning."""
    importance_all = None
    for name, item in model.named_parameters():
        if len(item.size()) == 4 and 'weight' in name:
            filters = item.data.view(item.size(0), -1).cpu()
            weight_len = filters.size(1)
            if prune_imp == 'L1':
                importance = filters.abs().sum(dim=1).numpy() / weight_len
            elif prune_imp == 'L2':
                importance = filters.pow(2).sum(dim=1).numpy() / weight_len

            if importance_all is None:
                importance_all = importance
            else:
                importance_all = np.append(importance_all, importance)

    threshold = np.sort(importance_all)[int(len(importance_all) * rate)]
    #threshold = np.percentile(importance_all, rate)
    filter_mask = np.greater(importance_all, threshold)
    return filter_mask
def filter_prune(model, filter_mask):
    """Broadcast each filter's keep/drop bit over its mask slice."""
    idx = 0
    for name, item in model.named_parameters():
        if len(item.size()) == 4 and 'mask' in name:
            for i in range(item.size(0)):
                item.data[i, :, :, :] = 1 if filter_mask[idx] else 0
                idx += 1
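

# Illustrative helper (hypothetical): structured pruning scores whole filters,
# keeps those above the global threshold, and broadcasts the 0/1 decision over
# each filter's mask slice, so entire output channels are disabled at once.
def _demo_structured_prune(model, rate=0.5):
    filter_mask = get_filter_mask(model, rate, prune_imp='L1')
    filter_prune(model, filter_mask)
    return int(filter_mask.sum()), filter_mask.size  # filters kept, total filters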
def reg_ortho(mdl):
    """Orthogonality penalty: squared spectral norm of (W^T W - I) per layer,
    with the top singular value estimated by one round of power iteration."""
    l2_reg = None
    for W in mdl.parameters():
        if W.ndimension() < 2:
            continue
        cols = W[0].numel()
        w1 = W.view(-1, cols)
        wt = torch.transpose(w1, 0, 1)
        m = torch.matmul(wt, w1)
        ident = torch.eye(cols, device=w1.device)

        w_tmp = m - ident
        height = w_tmp.size(0)
        u = F.normalize(w_tmp.new_empty(height).normal_(0, 1), dim=0, eps=1e-12)
        v = F.normalize(torch.matmul(w_tmp.t(), u), dim=0, eps=1e-12)
        u = F.normalize(torch.matmul(w_tmp, v), dim=0, eps=1e-12)
        sigma = torch.dot(u, torch.matmul(w_tmp, v))

        if l2_reg is None:
            l2_reg = sigma ** 2
        else:
            l2_reg = l2_reg + sigma ** 2
    return l2_reg
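

# Quick check (illustrative): for an orthogonal matrix W, W^T W = I, so the
# penalty should be near zero. This resembles the SRIP regularizer of
# Bansal et al. (2018), with the spectral norm estimated by power iteration.
def _demo_reg_ortho():
    lin = nn.Linear(8, 8, bias=False)
    nn.init.orthogonal_(lin.weight)
    print(reg_ortho(lin))  # ~0 for an orthogonal weight matrix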
def reg_cov(mdl):
    """Covariance penalty: sum of |upper-triangular| covariance entries of
    kernel slices. Two caveats of this implementation: the numpy round trip
    detaches the value from the autograd graph, and the `len(w_) == 2` test
    means only kernel slices with exactly two rows are scored."""
    cov_reg = 0
    for W in mdl.parameters():
        if W.ndimension() < 2:
            continue
        for w in W:
            for w_ in w:
                if w_.dim() > 0 and len(w_) == 2:
                    cov_ = np.cov(w_.detach().cpu().numpy())
                    cov_upper = np.triu(cov_)
                    cov_upper_abs = np.absolute(cov_upper)
                    cov_upper_abs_sum = np.sum(cov_upper_abs)
                    cov_reg += cov_upper_abs_sum
    return cov_reg
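

# Numeric check (illustrative): as written, reg_cov only scores kernel slices
# with exactly two rows, so a 2x3 kernel contributes while a 3x3 one does not.
def _demo_reg_cov():
    conv = nn.Conv2d(1, 1, kernel_size=(2, 3), bias=False)  # weight: (1, 1, 2, 3)
    print(reg_cov(conv))  # sum of |upper-triangular| covariance entries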
class AverageMeter(object):
    r"""Computes and stores the average and current value"""

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)
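

# Usage sketch: meters keep a running weighted average; the format string
# controls how __str__ renders the latest value and the average.
def _demo_average_meter():
    m = AverageMeter('Loss', ':.4f')
    m.update(0.9, n=32)  # batch of 32 with mean loss 0.9
    m.update(0.7, n=32)
    print(m)             # Loss 0.7000 (0.8000)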
def accuracy(output, target, topk=(1,)):
    r"""Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # reshape (not view): a slice of the transposed tensor is non-contiguous
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
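

# Quick check of the top-k helper on a toy batch (hypothetical logits):
# two of the three samples are classified correctly, so Acc@1 is 66.67%.
def _demo_accuracy():
    logits = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])
    targets = torch.tensor([1, 0, 0])
    acc1, = accuracy(logits, targets, topk=(1,))
    print(acc1)  # tensor([66.6667])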
def cal_sparsity(model):
    """Counts zeroed mask entries; sparsity is relative to all weights and biases."""
    mask_nonzeros = 0
    mask_length = 0
    total_weights = 0
    for name, item in model.named_parameters():
        if 'mask' in name:
            flatten = item.data.view(-1)
            np_flatten = flatten.cpu().numpy()
            mask_nonzeros += np.count_nonzero(np_flatten)
            mask_length += item.numel()
        if 'weight' in name or 'bias' in name:
            total_weights += item.numel()

    num_zero = mask_length - mask_nonzeros
    sparsity = (num_zero / total_weights) * 100
    return total_weights, num_zero, sparsity
def train(train_loader, epoch, model, criterion, optimizer, reg=None, prune=None, prune_freq=4, odecay=0, device='cuda'):
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')

    model.train()
    for i, (inputs, targets) in enumerate(train_loader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Re-apply pruning every prune_freq iterations, and only up to epoch 225.
        # Both branches use the rate stored under prune['rate'] (see modeling.py).
        if prune:
            if (i + 1) % prune_freq == 0 and epoch <= 225:
                if prune['type'] == 'structured':
                    filter_mask = get_filter_mask(model, prune['rate'])
                    filter_prune(model, filter_mask)
                elif prune['type'] == 'unstructured':
                    thres = get_weight_threshold(model, prune['rate'])
                    weight_prune(model, thres)

        outputs = model(inputs)
        if reg:
            oloss = reg(model)
            oloss = odecay * oloss
            loss = criterion(outputs, targets) + oloss
        else:
            loss = criterion(outputs, targets)

        acc1, acc5 = accuracy(outputs, targets, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(acc1[0], inputs.size(0))
        top5.update(acc5[0], inputs.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print('train {i} ====> Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(i=epoch, top1=top1, top5=top5))
    if prune:
        num_total, num_zero, sparsity = cal_sparsity(model)
        print('sparsity {} ====> {:.2f}% || num_zero/num_total: {}/{}'.format(epoch, sparsity, num_zero, num_total))
    return top1.avg, top5.avg
def validate(val_loader, epoch, model, criterion, device='cuda'):
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')

    model.eval()
    with torch.no_grad():
        for i, (inputs, targets) in enumerate(val_loader):
            inputs = inputs.to(device)
            targets = targets.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, targets)

            acc1, acc5 = accuracy(outputs, targets, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(acc1[0], inputs.size(0))
            top5.update(acc5[0], inputs.size(0))

    print('valid {i} ====> Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(i=epoch, top1=top1, top5=top5))
    return top1.avg, top5.avg
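

# Minimal end-to-end smoke test (illustrative; random data and arbitrary
# hyperparameters stand in for the CIFAR-10 pipeline in modeling.py): one
# training epoch with structured pruning enabled, then one validation pass.
if __name__ == '__main__':
    import torch.optim as optim
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = resnet_mask.ResNet_CIFAR(resnet_mask.BasicBlock, [3, 3, 3], 10).to(device)
    data = [(torch.randn(8, 3, 32, 32), torch.randint(0, 10, (8,))) for _ in range(8)]
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
    train(data, epoch=0, model=model, criterion=criterion, optimizer=optimizer,
          prune={'type': 'structured', 'rate': 0.5}, device=device)
    validate(data, epoch=0, model=model, criterion=criterion, device=device)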