
Implementing my own custom optimizer

See original GitHub issue

I was following the instructions in the readme, but I still ran into issues. This is the error I currently get:

Exception has occurred: ValueError
Optimizer type <class '__main__.TrainableSGD'> not supported by higher yet.

But I followed the instructions in the readme, which makes me think this is either a bug or that higher doesn't actually support custom optimizers.

The specific thing my optimizer does doesn't really matter, but I'll paste the code for reference (note it's just a really silly optimizer):

class MySGD(Optimizer):

    def __init__(self, params):
        defaults = {}
        super().__init__(params,defaults)

class TrainableSGD(DifferentiableOptimizer):

    def _update(self, grouped_grads, **kwargs):
        # single underscores: double-underscore names get mangled inside the
        # class body and would never match attributes set from outside
        old_eta = self._old_eta
        eta = self._eta
        # start differentiable & trainable update
        eta = eta(old_eta)
        zipped = zip(self.param_groups, grouped_grads)
        for group_idx, (group, grads) in enumerate(zipped):
            for p_idx, (p, g) in enumerate(zip(group['params'], grads)):
                if g is None:
                    continue
                #group['params'][p_idx] = _add(p, -group['lr'], g)
                group['params'][p_idx] = p + eta * g
        # stash the new step size so the next inner step can reuse it
        self._old_eta = eta

Full script:

'''
Single task MAML:

MAML: min_{theta} sum_t L^val( theta - eta* Grad L^train(theta) )

T-step MAML: min_{theta} sum_t L^val( theta^{T} - eta* Grad L^train(theta^{T}) )
outerloop: min_{theta} sum_t L^val( theta^{T} - eta* Grad L^train(theta^{T}) ) ~ min_{theta} sum_t L^val( argmin L^train(theta) )
Innerloop: theta^{T} - eta* Grad L^train(theta^{T}) ~ argmin L^train(theta)

single task MAML: min_{theta} L^val( theta - eta* Grad L^train(theta) )


based on MAML example: https://github.com/facebookresearch/higher/blob/master/examples/maml-omniglot.py
'''

import torch
import torch.nn as nn
from torch.optim.optimizer import Optimizer

import higher
from higher.optim import DifferentiableOptimizer
from higher.optim import DifferentiableSGD

import torchvision
import torchvision.transforms as transforms

from torchviz import make_dot

import copy

import itertools

from collections import OrderedDict

#mini class to add a flatten layer to the ordered dictionary
class Flatten(nn.Module):
    def forward(self, input):
        '''
        input.size(0) is usually the batch size, so given any input whose
        first dimension is the batch size, this flattens all remaining
        dimensions into a single one.
        '''
        batch_size = input.size(0)
        out = input.view(batch_size,-1)
        return out # (batch_size, *size)

def get_cifar10():
    transform = transforms.Compose(
        [transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                            shuffle=True, num_workers=2)

    testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                        download=True, transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=4,
                                            shuffle=False, num_workers=2)
    return trainloader, testloader

class MySGD(Optimizer):

    def __init__(self, params):
        defaults = {}
        super().__init__(params,defaults)

class TrainableSGD(DifferentiableOptimizer):

    def _update(self, grouped_grads, **kwargs):
        # single underscores: double-underscore names get mangled inside the
        # class body and would never match attributes set from outside
        old_eta = self._old_eta
        eta = self._eta
        # start differentiable & trainable update
        eta = eta(old_eta)
        zipped = zip(self.param_groups, grouped_grads)
        for group_idx, (group, grads) in enumerate(zipped):
            for p_idx, (p, g) in enumerate(zip(group['params'], grads)):
                if g is None:
                    continue
                #group['params'][p_idx] = _add(p, -group['lr'], g)
                group['params'][p_idx] = p + eta * g
        # stash the new step size so the next inner step can reuse it
        self._old_eta = eta

# get dataloaders
trainloader, testloader = get_cifar10()
criterion = nn.CrossEntropyLoss()

#
child_model = nn.Sequential(OrderedDict([
        ('conv1', nn.Conv2d(in_channels=3,out_channels=2,kernel_size=5)),
        ('relu1', nn.ReLU()),
        ('Flatten', Flatten()),
        ('fc', nn.Linear(in_features=28*28*2,out_features=10) )
    ]))
hidden = torch.randn(size=(1,1),requires_grad=True)
eta = nn.Sequential(OrderedDict([
    ('fc', nn.Linear(1,1)),
    ('sigmoid', nn.Sigmoid())
]))
other = MySGD(child_model.parameters())
inner_opt = TrainableSGD(other, child_model.parameters())
#meta_params = itertools.chain(child_model.parameters(),eta.parameters())
meta_params = itertools.chain(eta.parameters(),[hidden])
meta_opt = torch.optim.Adam(meta_params, lr=1e-3)
# do meta-training/outer training minimize outerloop: min_{theta} sum_t L^val( theta^{T} - eta* Grad L^train(theta^{T}) ) 
nb_outer_steps = 2 # note: in this case it's the same as the number of meta-train steps (but it might not be, depending on how you loop through the val set)
for outer_i, (outer_inputs, outer_targets) in enumerate(testloader, 0):
    meta_opt.zero_grad()
    if outer_i >= nb_outer_steps:
        break
    # do inner-training/MAML; minimize innerloop: theta^{T} - eta* Grad L^train(theta^{T}) ~ argmin L^train(theta)
    nb_inner_steps = 3
    inner_opt._old_eta = hidden
    inner_opt._eta = eta
    with higher.innerloop_ctx(child_model, inner_opt) as (fmodel, diffopt):
        for inner_i, (inner_inputs, inner_targets) in enumerate(trainloader, 0):
            print(f'inner_i = {inner_i}')
            print(f'eta^<{inner_i-1}> = {diffopt._old_eta}')
            if inner_i >= nb_inner_steps:
                break
            logits = fmodel(inner_inputs)
            inner_loss = criterion(logits, inner_targets)
            #print(f'inner i = {inner_i}')
            #print(f'inner_loss^<{inner_i}>: {inner_loss}')
            old_eta = diffopt.step(inner_loss) # changes params P[t+1] using P[t] and loss[t] in a differentiable manner
        # compute the meta-loss L^val( theta^{T} - eta* Grad L^train(theta^{T}) ) 
        outer_outputs = fmodel(outer_inputs)
        meta_loss = criterion(outer_outputs, outer_targets) # L^val
        #grad_of_grads = torch.autograd.grad(outputs=meta_loss, inputs=fmodel.parameters(time=0)) # dmeta_loss/dw0
        print(f'outer_i = {outer_i}')
        print(f'-> outer_loss/meta_loss^{outer_i}: {meta_loss}')
        print(f'hidden.grad = {hidden.grad}')
        print(f'eta.fc.weight = {eta.fc.weight}')
        meta_opt.step() # meta-optimizer step: more or less theta^<t> := theta^<t> - meta_eta * Grad L^val( theta^{T} - eta* Grad L^train(theta^{T}) )

Issue Analytics

  • State: closed
  • Created: 4 years ago
  • Comments: 5 (5 by maintainers)

Top GitHub Comments

5 reactions
egrefen commented, Feb 20, 2020

So sorry, but it seems I forgot to add one of the most important steps from that little guide. Please see the last step (step 7) of the newly expanded guide.
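
Concretely, the missing step is registering the new optimizer pair with higher, so that it can map the plain optimizer class to its differentiable counterpart. A minimal sketch for the classes above (assuming higher.register_optim is the registration hook that step 7 refers to):

higher.register_optim(MySGD, TrainableSGD)

# after registering, pass an instance of the plain (non-differentiable)
# optimizer; innerloop_ctx looks up and constructs TrainableSGD itself
inner_opt = MySGD(child_model.parameters())
with higher.innerloop_ctx(child_model, inner_opt) as (fmodel, diffopt):
    ...  # diffopt is a TrainableSGD here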

0 reactions
renesax14 commented, Feb 22, 2020

> Note that you are doing things in a way which seems a bit strange. TrainableSGD should be the differentiable version of a functionally equivalent class which inherits from torch.optim.Optimizer (which you should also write).

If they are not functionally equivalent, would that be a problem? I never actually use the non-differentiable version of it.
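
For context, a functionally equivalent non-differentiable version would apply the same update rule inside a standard torch.optim.Optimizer.step(), in place and without autograd tracking. A rough sketch (imports as in the script above), reusing the eta module and _old_eta state; the constructor signature here is illustrative, not from the original code:

class MySGD(Optimizer):
    def __init__(self, params, eta, old_eta):
        defaults = {}
        super().__init__(params, defaults)
        self._eta = eta          # small network producing the step size
        self._old_eta = old_eta  # step-size state carried between steps

    @torch.no_grad()
    def step(self, closure=None):
        # same update as TrainableSGD._update, but in place and untracked
        eta = self._eta(self._old_eta)
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                p.add_(eta.item() * p.grad)
        self._old_eta = eta.detach()

Whether the two classes must match exactly is the open question above; during meta-training only the differentiable version is actually stepped.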

Read more comments on GitHub >

