[tune] executing _train() takes too much time on Slurm
System information
- OS Platform and Distribution: Ubuntu 16.04.6
- Ray installed from: source
- Ray version: 0.6.5
- Python version: 3.6.8
- Exact command to reproduce: Run the source code
Describe the problem
When running this code on Tesla V100 GPUs, training one iteration takes about 70 seconds.
The analogous code for one model without Tune (just regular PyTorch) takes about 20 seconds per iteration, even though it gets the same amount of resources as every trial.
The 50 additional seconds do not seem to be ordinary distributed-training overhead: according to my measurements, almost all of the 70 seconds are spent on
result = self._train()
in ray/python/ray/tune/trainable.py.
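For reference, the measurement can be reproduced with a plain wall-clock wrapper around the call; the subclass below is only an illustrative sketch and not part of the original script (it assumes the TrainClassifier defined in the source code further down).

import time

# Illustrative only: time a single _train() call of TrainClassifier.
# With 1 GPU per trial, this is where the ~70 s per iteration show up.
class TimedTrainClassifier(TrainClassifier):
    def _train(self):
        start = time.time()
        result = super()._train()
        print("_train() took {:.1f} s".format(time.time() - start))
        return result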
When training 16 models in parallel instead of 4, giving 0.25 GPUs to each trial, one iteration takes about 300 seconds. This even more severe slowdown surprises me, since the plain PyTorch model is far from saturating a quarter of a GPU during training.
Does anyone have a clue why this happens? Thank you in advance!
Source code / logs
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torchvision import datasets, transforms

import cloudpickle
import ray
from ray import tune
from ray.tune import Trainable
from ray.tune.schedulers import PopulationBasedTraining


class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.fc1 = nn.Linear(20 * 5 * 5, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = x.view(-1, 20 * 5 * 5)
        x = F.relu(self.fc1(x))
        return self.fc2(x)


class TrainClassifier(Trainable):
    def _setup(self, config):
        args = {'seed': 1}
        torch.manual_seed(args['seed'])
        self.model = Classifier().cuda()
        torch.cuda.manual_seed(args['seed'])
        dataloader_args = {
            'num_workers': 1,
            'pin_memory': True,
            'drop_last': True,
            'batch_size': 256
        }
        # CIFAR-10 loaders; the dataset is downloaded up front in __main__.
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])
        self.train_loader = torch.utils.data.DataLoader(
            datasets.CIFAR10(
                '../../../data',
                train=True,
                download=False,
                transform=transform),
            shuffle=True,
            **dataloader_args)
        self.test_loader = torch.utils.data.DataLoader(
            datasets.CIFAR10(
                '../../../data',
                train=False,
                transform=transform),
            shuffle=False,
            **dataloader_args)
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=config['lr'],
        )
        self.args = args

    def _train_iteration(self):
        # One full pass over the training set, followed by a second pass
        # (in eval mode) to report training loss and accuracy.
        self.model.train()
        for batch_idx, (data, target) in enumerate(self.train_loader):
            data, target = data.cuda(), target.cuda()
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = F.cross_entropy(output, target)
            loss.backward()
            self.optimizer.step()
        self.model.eval()
        train_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in self.train_loader:
                data, target = data.cuda(), target.cuda()
                output = self.model(data)
                train_loss += F.cross_entropy(
                    output, target, reduction='sum').item()
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(
                    target.data.view_as(pred)).long().cpu().sum()
        train_loss /= len(self.train_loader.dataset)
        accuracy = correct.item() / len(self.train_loader.dataset)
        return {'train_loss': train_loss, 'train_accuracy': accuracy}

    def _test(self):
        self.model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in self.test_loader:
                data, target = data.cuda(), target.cuda()
                output = self.model(data)
                # sum up batch loss
                test_loss += F.cross_entropy(
                    output, target, reduction='sum').item()
                # get the index of the max log-probability
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(
                    target.data.view_as(pred)).long().cpu().sum()
        test_loss /= len(self.test_loader.dataset)
        accuracy = correct.item() / len(self.test_loader.dataset)
        return {'mean_loss': test_loss, 'mean_accuracy': accuracy}

    def _train(self):
        # One Tune iteration = one training epoch plus train/test evaluation.
        train_result = self._train_iteration()
        test_result = self._test()
        return {**train_result, **test_result}

    def _save(self, checkpoint_dir):
        path = os.path.join(checkpoint_dir, "checkpoint")
        with open(path, "wb") as f:
            cloudpickle.dump({
                "model": self.model.state_dict(),
                "optimizer": self.optimizer.state_dict()
            }, f)
        return path

    def _restore(self, checkpoint_path):
        with open(checkpoint_path, 'rb') as f:
            data = cloudpickle.loads(f.read())
        self.model.load_state_dict(data["model"])
        self.optimizer.load_state_dict(data["optimizer"])

    def reset_config(self, new_config):
        self.config = new_config
        return True


if __name__ == '__main__':
    datasets.CIFAR10('../../../data', train=True, download=True)
    ray.init(num_gpus=4)
    sched = PopulationBasedTraining(
        time_attr='training_iteration',
        reward_attr='mean_accuracy',
        perturbation_interval=1.0,
        resample_probability=0,
        hyperparam_mutations={
            'lr': lambda: np.random.uniform(0.001, 0.1),
            'weight_decay': lambda: np.random.uniform(0.00001, 0.001)
        },
    )
    # `config` (the trials' initial hyperparameters, at least 'lr') is defined
    # elsewhere and not shown in this snippet.
    tune.run_experiments(
        {
            'exp-1': {
                'stop': {
                    'mean_accuracy': 0.95,
                    'training_iteration': 150,
                },
                'resources_per_trial': {
                    'cpu': 1,
                    'gpu': 1
                },
                'run': TrainClassifier,
                'num_samples': 4,
                'checkpoint_at_end': True,
                'config': config
            }
        },
        verbose=2,
        scheduler=sched,
        reuse_actors=True,
    )
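The 16-trial variant mentioned in the problem description is not shown above; presumably only num_samples and resources_per_trial change, since Tune accepts fractional GPU requests. A sketch of that experiment spec (the experiment name is illustrative):

    # Presumed setup for the 16-trial run: four trials share each of the 4 GPUs.
    tune.run_experiments(
        {
            'exp-1-fractional': {
                'stop': {
                    'mean_accuracy': 0.95,
                    'training_iteration': 150,
                },
                'resources_per_trial': {
                    'cpu': 1,
                    'gpu': 0.25
                },
                'run': TrainClassifier,
                'num_samples': 16,
                'checkpoint_at_end': True,
                'config': config
            }
        },
        verbose=2,
        scheduler=sched,
        reuse_actors=True,
    )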
Top GitHub Comments
Ok, I solved the issue, though it had nothing to do with “OMP_NUM_THREADS”:
The command used to request a node on the cluster needed an explicit flag to request additional CPUs. I didn’t think of that before because Tune’s status output claimed to be using #trials of the 16 CPUs available on the node.
So I’d leave this as an open issue for now, though I’m not sure whether it is possible for Ray to detect whether a CPU/GPU is actually usable.
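The reported behavior suggests that Ray’s autodetected CPU count is based on the node rather than on the Slurm allocation (the usual fix is an explicit request such as sbatch/srun --cpus-per-task; the exact flag used here is not stated). A minimal check of the mismatch, assuming a Linux node; on recent Ray versions, ray.cluster_resources() shows what Ray believes it can schedule on:

import multiprocessing
import os

# What the node advertises (and what Ray's autodetection appears to pick up).
print("CPUs on the node:", multiprocessing.cpu_count())

# What the Slurm allocation actually lets this job run on (Linux only).
print("CPUs granted to this job:", len(os.sched_getaffinity(0)))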
Closing this for now since it is resolved; Ray currently does not check CPU usability or account for other tenants.