Дан следующий код. Я хочу понять, как вычисляется градиент, который печатает хук «conv1. x.register_hook»:
# Stand-alone 1x1 convolution used to try to reproduce the gradient by hand.
# NOTE(review): unlike conv1 this layer is built without ``bias=False``, so
# ``try_grad(img)`` presumably also adds a randomly initialised bias --
# confirm whether that is intended.
try_grad = nn.Conv2d(1,4,kernel_size=1)


def try_grad_assign(x):
    """Tensor hook: copy the incoming gradient into ``try_grad``'s kernel.

    The previous version assigned to ``try_grad.data``. That only creates an
    unrelated attribute on the module object; the convolution kernel kept its
    random initial values, so ``try_grad(img)`` never used the gradient.

    Args:
        x: Gradient handed over by autograd. It must contain exactly as many
            elements as ``try_grad.weight``.

    Returns:
        None, so autograd keeps propagating the unmodified gradient.
    """
    # Detach and clone so the kernel does not alias autograd's gradient
    # buffer; reshape because the gradient arrives with the channel axis in
    # a different position than the kernel's output-channel axis.
    try_grad.weight.data = x.detach().clone().reshape(try_grad.weight.shape)
class try_conv_model(nn.Module):
    """Toy network: bias-free 1x1 convolution, adaptive average pool, flatten.

    ``forward`` prints every intermediate tensor and attaches gradient hooks
    to the intermediate results so the backward pass can be inspected.
    """

    def __init__(self):
        super(try_conv_model, self).__init__()
        self.conv1 = nn.Conv2d(1, 4, kernel_size=1, bias=False)
        #self.conv2 = nn.Conv2d(1, 4, kernel_size=1, bias=False)
        self.gap = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        """Run the network on ``x`` while printing intermediate tensors.

        Args:
            x: Input tensor accepted by ``self.conv1``.

        Returns:
            The pooled activations viewed as ``(-1, num_flat_features)``.
        """
        print("-"*150)
        print("before conv1: ",x)
        x = self.conv1(x)
        # Tensor hooks can only be attached to tensors that require grad.
        # Guarding on that keeps the model usable under torch.no_grad(),
        # where the unconditional register_hook call used to raise.
        if x.requires_grad:
            x.register_hook(lambda grad: print("conv1. x.register_hook: ",grad))
        #print("before conv2: ",x)
        #x = self.conv2(x)
        print("before AdaptiveAvgPool2D: ",x)
        x = self.gap(x)
        if x.requires_grad:
            # Forwards the gradient of the pooled output to try_grad_assign.
            x.register_hook(try_grad_assign)
        print("after AdaptiveAvgPool2D: ",x)
        x = x.view(-1, self.num_flat_features(x))
        return x

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first."""
        num_features = 1
        # Skip dimension 0, which is the batch dimension.
        for dim in x.size()[1:]:
            num_features *= dim
        return num_features
# Single shared model instance; the hook registration, train() and the final
# prediction all operate on this object.
conv_model = try_conv_model()
def my_func(z):
    """Tensor hook that adds 2 to the gradient entry of class index 2.

    The previous version indexed ``z[0]`` twice and relied on broadcasting
    against a ``(1, 4)`` constant, so it always returned a ``(1, 4)`` tensor
    built from the first row only. That is the same result for a single-row
    gradient but drops every other row for larger batches.

    Args:
        z: Gradient whose last dimension has 4 entries.

    Returns:
        A tensor with the shape and dtype of ``z`` in which ``[0, 0, 2, 0]``
        has been added to every row.
    """
    print("z[0]: ",z[0])
    # +1 followed by -(-1) in the original amounts to a net offset of +2.
    offset = torch.tensor([0, 0, 2, 0], dtype=z.dtype, device=z.device)
    z = z + offset
    print("new_grad: ",z)
    return z
# --- Module hooks on conv1: print the module and the tensors it receives ---
conv_model.conv1.register_forward_hook(
    lambda module, inputs, output: print(" \n conv_model.conv1.register_forward_hook: ", module, inputs, output)
)
conv_model.conv1.register_backward_hook(
    lambda module, grad_input, grad_output: print(" \n conv_model.conv1.register_backward_hook: ", module, grad_input, grad_output)
)
# Disabled experiments kept for reference:
#conv_model.conv1.register_hook(lambda x: print("conv_model.conv1.register_hook: ",conv_model.conv1.register_hook))
#conv_model.conv2.register_forward_hook(lambda x,y,z: print(" \n conv_model.conv2.register_forward_hook: ",x,y,z))
#conv_model.conv2.register_backward_hook(lambda x,y,z: print(" \n conv_model.conv2.register_backward_hook: ",x,y,z))

# --- Same pair of printing hooks on the pooling layer ---
conv_model.gap.register_forward_hook(
    lambda module, inputs, output: print(" \n conv_model.gap.register_forward_hook: ", module, inputs, output)
)
conv_model.gap.register_backward_hook(
    lambda module, grad_input, grad_output: print(" \n conv_model.gap.register_backward_hook: ", module, grad_input, grad_output)
)
#conv_model.gap.register_hook(lambda x: print("conv_model.gap.register_hook: ",x))

# Start from an all-ones kernel (four output channels, one input channel, 1x1)
# so the numbers in the printed trace are easy to follow.
conv_model.conv1.weight.data = torch.ones(4, 1, 1, 1, dtype=torch.float)

# One 5x5 single-channel image of ones and its target class.
img = torch.ones((1, 1, 5, 5), dtype=torch.float)
print(img)
labels = torch.tensor([2], dtype=torch.long)
print(labels)

# Loss function, optimizer and iteration count read by train().
criterion = F.cross_entropy
#criterion.register_hook(lambda x: print("babasjbdkajskjs"))
optimizer = optim.SGD(conv_model.parameters(), lr=0.01, momentum=0.9)
epoch = 1
def train(epoch):
    """Run ``epoch`` verbose optimisation steps on the module-level ``img``/``labels``.

    Each iteration prints the parameters, runs the forward pass, attaches
    gradient hooks to the outputs and to the loss, back-propagates, prints
    the resulting gradients, applies one optimizer step and prints the
    parameters again.

    Args:
        epoch: Number of iterations to run.
    """
    conv_model.train()
    # NOTE(review): the pasted source lost its indentation, so it is unclear
    # whether the final parameter dump belonged inside or after this loop.
    # It is kept inside; for the single iteration used here both are identical.
    for _ in range(epoch):
        print("&"*300)
        print("*"*25)
        for param in conv_model.parameters():
            print(param)
        print("^"*25)
        conv_model.train()
        # backward() adds into .grad, so clear the previous iteration's
        # gradients first; without this they pile up when epoch > 1.
        optimizer.zero_grad()
        outputs = conv_model(img)
        # The former `outputs.require_grad = True` was a misspelling that only
        # created an unused attribute; it has been dropped. `outputs` already
        # carries a grad_fn, so hooks can be registered on it directly.
        #outputs.register_hook(lambda x: print("outputs.register_hook: ", x))
        outputs.register_hook(my_func)
        print("outputs: ",outputs)
        loss = criterion(outputs, labels)
        loss.register_hook(lambda x: print(" \n before backward loss hook: ",x))
        print(" \n before backward () conv_model.conv1.weight.grad: ",conv_model.conv1.weight.grad)
        #print(" \n before backward () conv_model.conv2.weight.grad: ",conv_model.conv2.weight.grad)
        #print(" \n before backward () linear_model.fc1.bias.grad: ",linear_model.fc1.bias.grad)
        loss.backward()
        # Registered after backward() has already run, so this hook does not
        # fire during the current iteration.
        loss.register_hook(lambda x: print(" \n after backward loss hook: ",x))
        # loss and outputs are intermediate results; in the recorded run both
        # of these .grad attributes print as None.
        print("loss.grad: ",loss.grad)
        #print(" \n after backward () conv_model.conv2.weight.grad: ",conv_model.conv2.weight.grad)
        print(" \n after backward () conv_model.conv1.weight.grad: ",conv_model.conv1.weight.grad)
        #print(" \n after backward () linear_model.fc1.bias.grad: ",linear_model.fc1.bias.grad)
        print("outputs.grad: ", outputs.grad)
        optimizer.step()
        print("*"*25)
        for param in conv_model.parameters():
            print(param)
        print("^"*25)
# Run the verbose training loop for the configured number of iterations.
train(epoch)
# Forward the same image once more and print the index of the largest output
# along dimension 1 (the second element returned by max()).
print("model prediction ",conv_model(img).max(1, keepdim=True)[1])
Результат выполнения:
tensor([[[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]]]])
tensor([2])
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
*************************
Parameter containing:
tensor([[[[1.]]],
[[[1.]]],
[[[1.]]],
[[[1.]]]], requires_grad=True)
^^^^^^^^^^^^^^^^^^^^^^^^^
------------------------------------------------------------------------------------------------------------------------------------------------------
before conv1: tensor([[[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]]]])
conv_model.conv1.register_forward_hook: Conv2d(1, 4, kernel_size=(1, 1), stride=(1, 1), bias=False) (tensor([[[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]]]]),) tensor([[[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]],
[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]],
[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]],
[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]]]], grad_fn=<MkldnnConvolutionBackward>)
before AdaptiveAvgPool2D: tensor([[[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]],
[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]],
[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]],
[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]]]], grad_fn=<MkldnnConvolutionBackward>)
conv_model.gap.register_forward_hook: AdaptiveAvgPool2d(output_size=(1, 1)) (tensor([[[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]],
[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]],
[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]],
[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]]]], grad_fn=<MkldnnConvolutionBackward>),) tensor([[[[1.]],
[[1.]],
[[1.]],
[[1.]]]], grad_fn=<ViewBackward>)
after AdaptiveAvgPool2D: tensor([[[[1.]],
[[1.]],
[[1.]],
[[1.]]]], grad_fn=<ViewBackward>)
outputs: tensor([[1., 1., 1., 1.]], grad_fn=<ViewBackward>)
before backward () conv_model.conv1.weight.grad: None
before backward loss hook: tensor(1.)
z[0]: tensor([ 0.2500, 0.2500, -0.7500, 0.2500])
new_grad: tensor([[0.2500, 0.2500, 1.2500, 0.2500]])
conv_model.gap.register_backward_hook: AdaptiveAvgPool2d(output_size=(1, 1)) (tensor([0.2500, 0.2500, 1.2500, 0.2500]),) (tensor([[[[0.2500]],
[[0.2500]],
[[1.2500]],
[[0.2500]]]]),)
conv1. x.register_hook: tensor([[[[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100]],
[[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100]],
[[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
[0.0500, 0.0500, 0.0500, 0.0500, 0.0500]],
[[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100]]]])
conv_model.conv1.register_backward_hook: Conv2d(1, 4, kernel_size=(1, 1), stride=(1, 1), bias=False) (None, tensor([[[[0.2500]]],
[[[0.2500]]],
[[[1.2500]]],
[[[0.2500]]]]), None) (tensor([[[[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100]],
[[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100]],
[[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
[0.0500, 0.0500, 0.0500, 0.0500, 0.0500]],
[[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100]]]]),)
loss.grad: None
after backward () conv_model.conv1.weight.grad: tensor([[[[0.2500]]],
[[[0.2500]]],
[[[1.2500]]],
[[[0.2500]]]])
outputs.grad: None
*************************
Parameter containing:
tensor([[[[0.9975]]],
[[[0.9975]]],
[[[0.9875]]],
[[[0.9975]]]], requires_grad=True)
^^^^^^^^^^^^^^^^^^^^^^^^^
------------------------------------------------------------------------------------------------------------------------------------------------------
before conv1: tensor([[[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]]]])
conv_model.conv1.register_forward_hook: Conv2d(1, 4, kernel_size=(1, 1), stride=(1, 1), bias=False) (tensor([[[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]]]]),) tensor([[[[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975]],
[[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975]],
[[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
[0.9875, 0.9875, 0.9875, 0.9875, 0.9875]],
[[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975]]]],
grad_fn=<MkldnnConvolutionBackward>)
before AdaptiveAvgPool2D: tensor([[[[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975]],
[[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975]],
[[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
[0.9875, 0.9875, 0.9875, 0.9875, 0.9875]],
[[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975]]]],
grad_fn=<MkldnnConvolutionBackward>)
conv_model.gap.register_forward_hook: AdaptiveAvgPool2d(output_size=(1, 1)) (tensor([[[[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975]],
[[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975]],
[[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
[0.9875, 0.9875, 0.9875, 0.9875, 0.9875],
[0.9875, 0.9875, 0.9875, 0.9875, 0.9875]],
[[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975],
[0.9975, 0.9975, 0.9975, 0.9975, 0.9975]]]],
grad_fn=<MkldnnConvolutionBackward>),) tensor([[[[0.9975]],
[[0.9975]],
[[0.9875]],
[[0.9975]]]], grad_fn=<ViewBackward>)
after AdaptiveAvgPool2D: tensor([[[[0.9975]],
[[0.9975]],
[[0.9875]],
[[0.9975]]]], grad_fn=<ViewBackward>)
model prediction tensor([[3]])
Если градиент, обратно распространяемый слоем AdaptiveAvgPool2d, равен
(tensor([[[[0.2500]],
[[0.2500]],
[[1.2500]],
[[0.2500]]]]),)
, то с чем он сворачивается, чтобы получилось следующее:
tensor([[[[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100]],
[[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100]],
[[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
[0.0500, 0.0500, 0.0500, 0.0500, 0.0500],
[0.0500, 0.0500, 0.0500, 0.0500, 0.0500]],
[[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100],
[0.0100, 0.0100, 0.0100, 0.0100, 0.0100]]]])
Если свернуть его с img с помощью print(try_grad(img)), получается следующий результат:
tensor([[[[-0.1372, -0.1372, -0.1372, -0.1372, -0.1372],
[-0.1372, -0.1372, -0.1372, -0.1372, -0.1372],
[-0.1372, -0.1372, -0.1372, -0.1372, -0.1372],
[-0.1372, -0.1372, -0.1372, -0.1372, -0.1372],
[-0.1372, -0.1372, -0.1372, -0.1372, -0.1372]],
[[ 0.3811, 0.3811, 0.3811, 0.3811, 0.3811],
[ 0.3811, 0.3811, 0.3811, 0.3811, 0.3811],
[ 0.3811, 0.3811, 0.3811, 0.3811, 0.3811],
[ 0.3811, 0.3811, 0.3811, 0.3811, 0.3811],
[ 0.3811, 0.3811, 0.3811, 0.3811, 0.3811]],
[[-1.4801, -1.4801, -1.4801, -1.4801, -1.4801],
[-1.4801, -1.4801, -1.4801, -1.4801, -1.4801],
[-1.4801, -1.4801, -1.4801, -1.4801, -1.4801],
[-1.4801, -1.4801, -1.4801, -1.4801, -1.4801],
[-1.4801, -1.4801, -1.4801, -1.4801, -1.4801]],
[[-0.2196, -0.2196, -0.2196, -0.2196, -0.2196],
[-0.2196, -0.2196, -0.2196, -0.2196, -0.2196],
[-0.2196, -0.2196, -0.2196, -0.2196, -0.2196],
[-0.2196, -0.2196, -0.2196, -0.2196, -0.2196],
[-0.2196, -0.2196, -0.2196, -0.2196, -0.2196]]]],
grad_fn=<MkldnnConvolutionBackward>)
Итак, мой вопрос: как conv2d вычисляет градиент по входным данным и почему он отличается от моего? Я неправильно вычисляю градиент вручную или что-то упускаю?