каждый.
Я загрузил предварительно обученную аудио-антиспуфинговую модель SPEC в PyTorch.
Модель следующая:
class ResNetBlock(nn.Module):
    """Pre-activation residual block with a stride-3 downsampling shortcut.

    Main path: [BN -> LeakyReLU (skipped when first=True)] -> conv1 (3x3, stride 1)
    -> BN -> LeakyReLU -> Dropout(0.5) -> conv2 (3x3, stride 3).
    Shortcut: conv11 (3x3, stride 3), so both paths shrink the spatial dims
    by roughly a factor of 3 and the outputs can be summed.
    """

    def __init__(self, in_depth, depth, first=False):
        super(ResNetBlock, self).__init__()
        self.first = first
        self.conv1 = nn.Conv2d(in_depth, depth, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(depth)
        self.lrelu = nn.LeakyReLU(0.01)
        self.dropout = nn.Dropout(0.5)
        self.conv2 = nn.Conv2d(depth, depth, kernel_size=3, stride=3, padding=1)
        # Projection shortcut: matches channel count and the stride-3 downsampling.
        self.conv11 = nn.Conv2d(in_depth, depth, kernel_size=3, stride=3, padding=1)
        if not self.first:
            # Pre-activation BN; the very first block receives fresh conv output
            # and skips the BN/activation prologue.
            self.pre_bn = nn.BatchNorm2d(in_depth)

    def forward(self, x):
        # x is (B, in_depth, H, W)
        shortcut = self.conv11(x)  # (B, depth, ~H/3, ~W/3)
        if not self.first:
            out = self.pre_bn(x)
            out = self.lrelu(out)
        else:
            out = x
        # BUG FIX: the original called self.conv1(x), silently discarding the
        # pre-activation result computed just above. Feed `out` through instead.
        # NOTE(review): checkpoints trained with the old code will yield slightly
        # different activations in non-first blocks after this fix.
        out = self.conv1(out)   # (B, depth, H, W)
        out = self.bn1(out)
        out = self.lrelu(out)
        out = self.dropout(out)
        out = self.conv2(out)   # (B, depth, ~H/3, ~W/3)
        return out + shortcut
class SpectrogramModel(nn.Module):
    """CNN anti-spoofing classifier over log-power spectrogram features.

    Input: (B, freq, time) spectrogram tensor; a channel axis is added
    internally. Output: (B, 2) log-probabilities from a LogSoftmax head.

    NOTE(review): blocks 2/4/6/8/10 and the max-pool are registered but never
    used in forward(); they are kept so checkpoints saved with this layout
    still load via load_state_dict. fc1 assumes the flattened feature map has
    exactly 64 elements, so the input spatial size is fixed -- TODO confirm
    the expected spectrogram dimensions against the training pipeline.
    """

    def __init__(self):
        super(SpectrogramModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.block1 = ResNetBlock(32, 32, True)
        self.mp = nn.MaxPool2d(3, stride=3, padding=1)    # unused in forward()
        self.block2 = ResNetBlock(32, 32, False)          # unused in forward()
        self.block3 = ResNetBlock(32, 32, False)
        self.block4 = ResNetBlock(32, 32, False)          # unused in forward()
        self.block5 = ResNetBlock(32, 32, False)
        self.block6 = ResNetBlock(32, 32, False)          # unused in forward()
        self.block7 = ResNetBlock(32, 32, False)
        self.block8 = ResNetBlock(32, 32, False)          # unused in forward()
        self.block9 = ResNetBlock(32, 32, False)
        self.block10 = ResNetBlock(32, 32, False)         # unused in forward()
        self.block11 = ResNetBlock(32, 32, False)
        self.lrelu = nn.LeakyReLU(0.01)
        self.bn = nn.BatchNorm2d(32)
        self.dropout = nn.Dropout(0.5)
        self.logsoftmax = nn.LogSoftmax(dim=1)
        self.fc1 = nn.Linear(64, 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        n = x.size(0)
        # (B, freq, time) -> (B, 1, freq, time): single input channel.
        out = self.conv1(x.unsqueeze(dim=1))
        # Six stride-3 residual stages; each shrinks both spatial dims ~3x.
        for stage in (self.block1, self.block3, self.block5,
                      self.block7, self.block9, self.block11):
            out = stage(out)
        out = self.lrelu(self.bn(out))
        # Flatten and classify; dropout only bites in train() mode.
        out = self.dropout(out.view(n, -1))
        out = self.fc2(self.lrelu(self.fc1(out)))
        return self.logsoftmax(out)
Для извлечения характеристик используется логарифмическая величина STFT (log-magnitude STFT).
def get_log_spectrum(x):
    """Return the log-power (dB) STFT spectrogram of waveform `x`.

    Uses a 2048-sample FFT/window with a 512-sample hop, so the output
    shape is (1 + n_fft // 2, n_frames) = (1025, n_frames).
    """
    spectrum = librosa.core.stft(x, n_fft=2048, win_length=2048, hop_length=512)
    power = np.abs(spectrum) ** 2
    return librosa.power_to_db(power)
Чтобы получить необходимый вывод из необработанного аудиофайла, я попытался сделать следующее:
# Load the pretrained SPEC model and score one raw wav file.
model = SpectrogramModel()
checkpoint = torch.load('sample_models/SPEC/spec.pth')
model.load_state_dict(checkpoint)
model.eval()  # disable dropout; use BatchNorm running statistics

y, sr = librosa.load('pipe.wav')
feat = get_log_spectrum(y)
# BUG FIX: the model needs a float32 torch tensor with a leading batch
# dimension, not a raw numpy array. ndarray.size is an int attribute, which
# is why x.size(0) raised "TypeError: 'int' object is not callable".
feat = torch.from_numpy(feat).float().unsqueeze(0)  # (1, freq, time)
with torch.no_grad():  # inference only -- skip autograd bookkeeping
    output = model(feat)  # (1, 2) log-probabilities; class order depends on
                          # the training labels -- TODO confirm which index
                          # is "genuine" vs "spoof"
Но столкнулся со следующей ошибкой:
TypeError Traceback (most recent call last)
<ipython-input-20-75bc77ad7d73> in <module>
----> 1 output = model(feat)
~/python-venvs/asvspoof-2019/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
530 result = self._slow_forward(*input, **kwargs)
531 else:
--> 532 result = self.forward(*input, **kwargs)
533 for hook in self._forward_hooks.values():
534 hook_result = hook(self, input, result)
~/projects/asvspoof2019/models.py in forward(self, x)
106
107 def forward(self, x):
--> 108 batch_size = x.size(0)
109 x = x.unsqueeze(dim=1)
110 out = self.conv1(x)
TypeError: 'int' object is not callable
Что я хочу знать, это
- Что такое вход и выход этой модели?
- Как получить оценку этой модели из необработанного аудиофайла (wav) — подлинный он или фальшивый?
- CM балл
Пожалуйста, помогите новичку в области обнаружения аудио-спуфинга.