RuntimeError: One of the differentiated Tensors does not require grad
Hi there! I am building a neural network with MAML. Everything works fine until I adapt the learner to the loss, at which point the RuntimeError above is raised. Here is the code (slightly modified from the examples):
import random
import time

import numpy as np
import torch as th
from torch import nn
from torch import optim
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms

from freezenet import FreezeNet
import learn2learn as l2l

writer = SummaryWriter('/info')


def accuracy(predictions, targets):
    predictions = predictions.argmax(dim=1).view(targets.shape)
    return (predictions == targets).sum().float() / targets.size(0)


def nll(z, log_det):
    # Parenthesize (28 * 28) so the mean log-determinant is divided by the
    # pixel count; written as '/ 28*28' the division and multiplication cancel.
    return (th.mean(z ** 2) / 2 - th.mean(log_det) / (28 * 28)) * 0.0005


def fast_adapt(adaptation_data, evaluation_data, learner, loss, adaptation_steps):
    # Adapt the clone on the support set.
    for step in range(adaptation_steps):
        data = [d for d in adaptation_data]
        X = th.cat([d[0] for d in data], dim=0)
        X = X.view(X.shape[0], 1, X.shape[1], X.shape[2])
        y = th.cat([th.tensor(d[1]).view(-1) for d in data], dim=0).to(dtype=th.int64)
        y_pred, z, log_det = learner(X)
        train_error = loss(y_pred, y) + nll(z, log_det)
        train_error /= len(adaptation_data)
        learner.adapt(train_error)  # <- this is where the RuntimeError is raised

    # Evaluate the adapted clone on the query set.
    data = [d for d in evaluation_data]
    X = th.cat([d[0] for d in data], dim=0)
    X = X.view(X.shape[0], 1, X.shape[1], X.shape[2])
    y = th.cat([th.tensor(d[1]).view(-1) for d in data], dim=0).to(dtype=th.int64)
    y_pred, z, log_det = learner(X)
    valid_error = loss(y_pred, y) + nll(z, log_det)
    valid_error /= len(evaluation_data)
    valid_accuracy = accuracy(y_pred, y)
    return valid_error, valid_accuracy


def main(
        ways=5,
        shots=1,
        meta_lr=0.003,
        fast_lr=0.5,
        meta_batch_size=32,
        adaptation_steps=1,
        num_iterations=6000,
        cuda=True,
        seed=42,
):
    random.seed(seed)
    np.random.seed(seed)
    th.manual_seed(seed)

    omniglot = l2l.vision.datasets.FullOmniglot(root='./data',
                                                transform=transforms.Compose([
                                                    l2l.vision.transforms.RandomDiscreteRotation(
                                                        [0.0, 90.0, 180.0, 270.0]),
                                                    transforms.Resize(28),
                                                    transforms.ToTensor(),
                                                    lambda x: 1.0 - x,
                                                ]),
                                                download=True)
    omniglot = l2l.data.MetaDataset(omniglot)
    classes = list(range(1623))
    random.shuffle(classes)
    train_generator = l2l.data.TaskGenerator(dataset=omniglot,
                                             ways=ways,
                                             classes=classes[:1100],
                                             tasks=20000)
    valid_generator = l2l.data.TaskGenerator(dataset=omniglot,
                                             ways=ways,
                                             classes=classes[1100:1200],
                                             tasks=1024)
    test_generator = l2l.data.TaskGenerator(dataset=omniglot,
                                            ways=ways,
                                            classes=classes[1200:],
                                            tasks=1024)

    # Create model
    model = FreezeNet((1, 28, 28), ways, num_steps=3)
    maml = l2l.algorithms.MAML(model, lr=fast_lr, first_order=True)
    opt = optim.Adam(maml.parameters(), meta_lr)
    loss = nn.CrossEntropyLoss(reduction='mean')

    start = time.time()
    for iteration in range(num_iterations):
        opt.zero_grad()
        meta_train_error = 0.0
        meta_train_accuracy = 0.0
        meta_valid_error = 0.0
        meta_valid_accuracy = 0.0
        meta_test_error = 0.0
        meta_test_accuracy = 0.0
        for task in range(meta_batch_size):
            # Compute meta-training loss
            learner = maml.clone()
            adaptation_data = train_generator.sample(shots=shots)
            evaluation_data = train_generator.sample(shots=shots,
                                                     task=adaptation_data.sampled_task)
            evaluation_error, evaluation_accuracy = fast_adapt(adaptation_data,
                                                               evaluation_data,
                                                               learner,
                                                               loss,
                                                               adaptation_steps)
            evaluation_error.backward()
            meta_train_error += evaluation_error.item()
            meta_train_accuracy += evaluation_accuracy.item()

            # Compute meta-validation loss
            learner = maml.clone()
            adaptation_data = valid_generator.sample(shots=shots)
            evaluation_data = valid_generator.sample(shots=shots,
                                                     task=adaptation_data.sampled_task)
            evaluation_error, evaluation_accuracy = fast_adapt(adaptation_data,
                                                               evaluation_data,
                                                               learner,
                                                               loss,
                                                               adaptation_steps)
            meta_valid_error += evaluation_error.item()
            meta_valid_accuracy += evaluation_accuracy.item()

            # Compute meta-testing loss
            learner = maml.clone()
            adaptation_data = test_generator.sample(shots=shots)
            evaluation_data = test_generator.sample(shots=shots,
                                                    task=adaptation_data.sampled_task)
            evaluation_error, evaluation_accuracy = fast_adapt(adaptation_data,
                                                               evaluation_data,
                                                               learner,
                                                               loss,
                                                               adaptation_steps)
            meta_test_error += evaluation_error.item()
            meta_test_accuracy += evaluation_accuracy.item()

        # Print some metrics
        print('\n')
        print('Iteration', iteration)
        print('Meta Train Error', meta_train_error / meta_batch_size)
        print('Meta Train Accuracy', meta_train_accuracy / meta_batch_size)
        print('Meta Valid Error', meta_valid_error / meta_batch_size)
        print('Meta Valid Accuracy', meta_valid_accuracy / meta_batch_size)
        print('Meta Test Error', meta_test_error / meta_batch_size)
        print('Meta Test Accuracy', meta_test_accuracy / meta_batch_size)

        # Add information for evidence in project
        writer.add_scalar('Train Error', meta_train_error, iteration)
        writer.add_scalar('Validation Error', meta_valid_error, iteration)
        writer.add_scalar('Test Error', meta_test_error, iteration)
        writer.add_scalar('Train Accuracy', meta_train_accuracy, iteration)
        writer.add_scalar('Valid Accuracy', meta_valid_accuracy, iteration)
        writer.add_scalar('Test Accuracy', meta_test_accuracy, iteration)

        # Average the accumulated gradients and optimize
        for p in maml.parameters():
            p.grad.data.mul_(1.0 / meta_batch_size)
        opt.step()

    print('Total Training Time is {} seconds'.format(time.time() - start))
    writer.close()
    th.save(learner.state_dict(), '/saved_model')


if __name__ == '__main__':
    main()
Here is the network:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models.squeezenet import Fire
import FrEIA.framework as Ff
import FrEIA.modules as Fm


def subnet(in_channels, out_channels):
    # Coupling-block subnetwork built from SqueezeNet Fire modules.
    return nn.Sequential(
        Fire(in_channels, 8, 16, 16),
        nn.BatchNorm2d(32),
        nn.ReLU(),
        Fire(32, 16, out_channels // 2, out_channels // 2),
    )


class FreezeNet(nn.Module):
    def __init__(self, shape, num_outputs, num_steps=3):
        super(FreezeNet, self).__init__()
        self.C, self.H, self.W = shape
        self.num_steps = num_steps
        self.model = self.build_model()
        self.final_conv = nn.Conv2d(2 * 2 * self.C, num_outputs, kernel_size=1)
        self.pool = nn.AvgPool2d(self.H // 2)

    def build_model(self):
        # Invertible network: Haar downsampling, then
        # num_steps x (ActNorm -> random permutation -> GLOW coupling).
        nodes = [Ff.InputNode(self.C, self.H, self.W, name='input')]
        nodes.append(Ff.Node(nodes[-1], Fm.HaarDownsampling, {}, name='Downsampling'))
        for i in range(self.num_steps):
            nodes.append(Ff.Node(nodes[-1], Fm.ActNorm, {}, name='actnorm_{}'.format(i)))
            nodes.append(Ff.Node(nodes[-1], Fm.PermuteRandom, {'seed': i}, name='permute_{}'.format(i)))
            nodes.append(Ff.Node(nodes[-1], Fm.GLOWCouplingBlock,
                                 {'subnet_constructor': subnet, 'clamp': 1.2},
                                 name='coupling_{}'.format(i)))
        nodes.append(Ff.OutputNode(nodes[-1], name='output_node'))
        return Ff.ReversibleGraphNet(nodes, verbose=False)

    def forward(self, x):
        z = self.model(x)
        y = F.relu(self.final_conv(z))
        y = torch.sigmoid(self.pool(y))  # F.sigmoid is deprecated in favor of torch.sigmoid
        y = y.view(y.shape[0], -1)
        log_det = self.model.log_jacobian(run_forward=False)
        return y, z, log_det
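For readers unfamiliar with learn2learn, the failing call is learner.adapt(train_error). Internally, a MAML adapt step differentiates the loss with respect to every parameter of the cloned module, roughly along these lines (a simplified first-order sketch, not the library's exact code; adapt_sketch and the in-place update are illustrative only):

import torch

def adapt_sketch(loss, learner, lr):
    # MAML-style inner step: differentiate the loss w.r.t. ALL of the
    # clone's parameters. If any parameter has requires_grad=False,
    # torch.autograd.grad raises:
    # "RuntimeError: One of the differentiated Tensors does not require grad"
    params = list(learner.parameters())
    grads = torch.autograd.grad(loss, params)
    # Then take one gradient step on the clone (first-order variant).
    with torch.no_grad():
        for p, g in zip(params, grads):
            p.add_(g, alpha=-lr)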
Top GitHub Comments
It turns out I misunderstood the issue: the problem is not that some parameters are unused when computing the loss, it's that some parameters have requires_grad = False, which makes autograd complain. I'll push a fix soon.
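That diagnosis is easy to reproduce in isolation. A minimal sketch (the toy nn.Linear model below is illustrative, not taken from the issue): freeze a single parameter, then differentiate with respect to all parameters, as the adapt step does.

import torch
from torch import nn

model = nn.Linear(2, 1)
model.bias.requires_grad = False  # one frozen parameter

loss = model(torch.randn(4, 2)).sum()

# Differentiating w.r.t. every parameter hits the frozen bias:
try:
    torch.autograd.grad(loss, list(model.parameters()), retain_graph=True)
except RuntimeError as e:
    print(e)  # One of the differentiated Tensors does not require grad

# Restricting to trainable parameters avoids the error.
trainable = [p for p in model.parameters() if p.requires_grad]
grads = torch.autograd.grad(loss, trainable)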
Okay, so it does not work. When I set allow_unused to True, I get the same error message. Perhaps I am doing something wrong?
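One detail that may explain why allow_unused did not help: allow_unused=True only covers parameters that require grad but receive no gradient from the loss; it does not lift the check on parameters with requires_grad=False, so those still raise the same error. A quick way to see whether FreezeNet contains frozen parameters (assuming the instance is named model, as in the script above):

# Any names printed here would trip the inner-loop autograd.grad call,
# with or without allow_unused=True.
frozen = [name for name, p in model.named_parameters() if not p.requires_grad]
print(frozen)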