
Transfer Learning not working

See original GitHub issue

I’m trying to fine-tune the pretrained EfficientNet-B1 model on Places365, but training plateaus at around 25% accuracy. I used the ImageNet auto-augment policy found here, with the code below.

Dataloaders:


import datetime
import logging

import PIL
import torch
from torchvision import datasets, transforms
from autoaugment import ImageNetPolicy  # auto-augment policy linked above; import path may differ

logger = logging.getLogger(__name__)


def _get_train_data_loader(batch_size, training_dir, is_distributed, **kwargs):
    logger.info(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ") + "Get train data loader")
    base_dir = '/dev/shm/places365_standard/'
    defaults.device = torch.device('cuda')  # `defaults` comes from the surrounding script (not shown)

    # ImageNet-style preprocessing plus the auto-augment policy
    dataset = datasets.ImageFolder(base_dir + "train", transform=transforms.Compose(
                        [transforms.Resize(224, interpolation=PIL.Image.BICUBIC),
                         ImageNetPolicy(),
                         transforms.ToTensor(),
                         transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]))

    # Each process trains on a distinct shard of the dataset
    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, pin_memory=True,
                                       num_workers=8, sampler=train_sampler)


def _get_test_data_loader(test_batch_size, training_dir, **kwargs):
    logger.info(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ") + "Get test data loader")
    base_dir = '/dev/shm/places365_standard/'
    defaults.device = torch.device('cuda')

    # Same preprocessing as training, minus the augmentation policy
    dataset = datasets.ImageFolder(base_dir + "val", transform=transforms.Compose(
                        [transforms.Resize(224, interpolation=PIL.Image.BICUBIC),
                         transforms.ToTensor(),
                         transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]))
    return torch.utils.data.DataLoader(dataset, batch_size=test_batch_size, num_workers=8,
                                       shuffle=True, pin_memory=True)
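(Side note: the standard ImageNet evaluation pipeline resizes and then center-crops, which also guarantees square inputs if any source images are not already square. A minimal sketch of that variant, using the same normalization constants:)

eval_transform = transforms.Compose([
    transforms.Resize(256, interpolation=PIL.Image.BICUBIC),
    transforms.CenterCrop(224),  # fixed 224x224 crops so every batch stacks cleanly
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])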

Training code:

    # Load ImageNet-pretrained EfficientNet-B1 with a fresh 365-way classifier
    model = EfficientNet.from_pretrained('efficientnet-b1', num_classes=365).to(device)

    # Freeze everything except the final fully-connected layer
    for n, p in model.named_parameters():
        if '_fc' not in n:
            p.requires_grad = False

    model = torch.nn.parallel.DistributedDataParallel(model)

    optimizer = optim.RMSprop(model.parameters(), lr=3e-2, alpha=0.99,
                              eps=1e-08, weight_decay=1e-5, momentum=0.9)
    # Decay the learning rate by ~1.25% each epoch
    lmbda = lambda epoch: 0.98739
    scheduler = optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)
    criterion = nn.CrossEntropyLoss()
    
    best_loss = float('inf')  # sentinel so the first test loss always triggers a save
    
    for epoch in range(1, args.epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            # Log once per epoch, on the final batch
            if batch_idx % (len(train_loader) - 1) == 0 and batch_idx != 0:
                log = 'Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.sampler),
                    100. * batch_idx / len(train_loader), loss.item())
                logger.info(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ") + log)

        test_loss = test(model, test_loader, device)
        scheduler.step()
        if test_loss < best_loss:
            logger.info(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ") + "Best loss : Saving")
            save_model(model, args.model_dir)
            best_loss = test_loss
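One detail worth checking in the loop above: when a DistributedSampler is used, PyTorch expects sampler.set_epoch(epoch) to be called at the start of every epoch; otherwise each epoch replays the same shuffle order. A minimal sketch of the change:

    for epoch in range(1, args.epochs + 1):
        # Re-seed the distributed shuffle so each epoch sees a new ordering
        train_loader.sampler.set_epoch(epoch)
        model.train()
        # ... rest of the training loop unchanged ...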

Test function:


def test(model, test_loader, device):
    model.eval()
    test_loss = 0
    correct = 0
    crit = nn.CrossEntropyLoss(reduction='sum')  # size_average=False is deprecated; sum, then divide by dataset size
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
            output = model(data)
            test_loss += crit(output, target).item()  # sum up batch loss
            pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    logger.info(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S ") + 'Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    return test_loss
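(If it helps while debugging, Places365 results are often reported as top-5 as well as top-1 accuracy; a minimal top-k helper sketch, where the name and signature are illustrative:)

def topk_correct(output, target, k=5):
    # Indices of the k largest logits per sample, shape (batch, k)
    _, pred = output.topk(k, dim=1)
    # Count samples whose true label appears among the top-k predictions
    return pred.eq(target.unsqueeze(1)).any(dim=1).sum().item()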

I don’t know what I’m doing wrong. Any help?

Issue Analytics

  • State: open
  • Created: 3 years ago
  • Comments: 12 (1 by maintainers)

Top GitHub Comments

2 reactions
teraoka-hiroshi commented, Jan 12, 2021

@gost-sniper Less efficiently, we froze the whole network and then unfroze only the last few blocks and the classifier:

model = EfficientNet.from_pretrained(args.arch, advprop=args.advprop, num_classes=365)

# Freeze everything first...
for param in model.parameters():
    param.requires_grad = False

# ...then unfreeze the last three blocks and the final classifier
for name, module in model.named_modules():
    if name in ('_blocks.20', '_blocks.21', '_blocks.22', '_fc'):
        for param in module.parameters():
            param.requires_grad = True
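With layers frozen this way, only the parameters that still require gradients need to be handed to the optimizer; a minimal sketch, reusing the RMSprop settings from the question:

trainable = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.RMSprop(trainable, lr=3e-2, alpha=0.99,
                          eps=1e-08, weight_decay=1e-5, momentum=0.9)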
0 reactions
alancarlosml commented, Nov 11, 2021

Hi @gost-sniper, did you fix the problem?

Could you please share with me your training code? (alancarlosml@outlook.com)

I am facing problems with some code I wrote.

Thank you!

Read more comments on GitHub >

Top Results From Across the Web

Neural Network Model using Transfer Learning not learning
I am a beginner in Deep Learning and working on Road Crack detection using transfer learning. I am working on binary classification with...
Read more >
Why transfer learning works or fails? - Towards Data Science
Transfer learning and domain adaptation can fail in practice, and in this article I explain the intuition behind it.
Read more >
When Does Transfer Learning Fail in Deep Learning - Medium
When there is a mismatch between the domain of the pretext-task dataset and the downstream task, transfer learning may not...
Read more >
Transfer learning with Keras, validation accuracy does not ...
The reason for your low validation accuracy has to do with the way the model is built. It is reasonable to expect that...
Read more >
Exploring the limits of transfer learning - Allerin
Transfer learning only works if the initial and target problems of both models are similar enough. If the first round of training data ...
Read more >
