Crash with message node --- has been marked dead because the monitor has missed too many heartbeats from it
System information
- OS Platform and Distribution (e.g., Linux Ubuntu 16.04): Ubuntu 16.04
- Ray installed from (source or binary): from binary
- Ray version: 0.6.0
- Python version: 2.7.12
- Exact command to reproduce: python mnist.py
Describe the problem
I have a 3-node cluster and 3 remote workers: two of them require a GPU and the third only a CPU. Two of the nodes were started with 1 GPU and 1 CPU, and the node running redis has only CPUs (4 of them). With the attached code, the setup works for a few epochs and then crashes with a message that the monitor missed too many heartbeats:
The node with client ID 581656f84659be9bba58da993729644cfb554836 has been marked dead because the monitor has missed too many heartbeats from it.
Traceback (most recent call last):
  File "mnist_main.py", line 154, in <module>
    train()
  File "mnist_main.py", line 138, in train
    for actor in train_actors])
  File "/home/anurag/.local/lib/python2.7/site-packages/ray/worker.py", line 2358, in get
    raise RayGetError(object_ids[i], value)
ray.worker.RayGetError: Could not get objectid ObjectID(0100000008623a3fd1946a9132866d6f6113876e). It was created by remote function <unknown> which failed with:

Remote function <unknown> failed with:

Invalid return value: likely worker died or was killed while executing the task; check previous logs or dmesg for errors.
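For context, the cluster was started roughly as follows. This is only a sketch: the head-node IP placeholder, redis port, and resource counts are assumptions based on the description above, written with the Ray 0.6 command-line flags.

# On the head node (runs redis, CPU only):
ray start --head --redis-port=6379 --num-cpus=4

# On each of the two GPU nodes:
ray start --redis-address=<head_node_ip>:6379 --num-cpus=1 --num-gpus=1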
Source code / logs
from __future__ import print_function
import argparse
import numpy as np
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

import ray


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4*4*50, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4*4*50)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


@ray.remote(num_gpus=1)
class MNISTTrainActor(object):
    """Simple actor for MNIST trainer."""

    def __init__(self, id):
        print("Initialize Actor environment gpu id: ", os.environ["CUDA_VISIBLE_DEVICES"])
        self.device = torch.device("cuda")
        self.model = Net().to(self.device)
        kwargs = {'num_workers': 1, 'pin_memory': True}
        self.train_loader = torch.utils.data.DataLoader(
            datasets.MNIST('/data/mnist', train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ])),
            batch_size=64, shuffle=True, **kwargs)
        momentum = 0.5
        lr = 0.01
        self.optimizer = optim.SGD(self.model.parameters(), lr=lr, momentum=momentum)
        self.id = id
        print("ID: ", self.id)

    def run_train(self, weights):
        self.model.load_state_dict(weights)
        self.model.cuda()
        print("starting run_train for actor.id = ", self.id)
        for batch_idx, (data, target) in enumerate(self.train_loader):
            # send even batches to id == 1 and odd to id == 0
            if ((self.id % 2 == 0 and batch_idx % 2 == 0) or
                    (self.id % 2 == 1 and batch_idx % 2 == 1)):
                continue
            data, target = data.to(self.device), target.to(self.device)
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            self.optimizer.step()
            # if batch_idx % 200 == 0 or batch_idx % 201 == 0:
            if False:  # batch_idx % 200 == 0 or batch_idx % 201 == 0:
                print('Actor ID: {} batch_idx: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    self.id, batch_idx, batch_idx * len(data), len(self.train_loader.dataset),
                    100. * batch_idx / len(self.train_loader), loss.item()))
        weights = self.model.cpu().state_dict()
        return weights

    def get_weights(self):
        weights = self.model.cpu().state_dict()
        return weights


@ray.remote
class MNISTTestActor(object):
    def __init__(self):
        self.device = torch.device("cpu")
        self.model = Net().to(self.device)
        kwargs = {}
        self.test_loader = torch.utils.data.DataLoader(
            datasets.MNIST('/data/mnist', train=False, transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,))
            ])),
            batch_size=64, shuffle=True, **kwargs)

    def accuracy(self, weights, step):
        self.model.load_state_dict(weights)
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in self.test_loader:
                data, target = data.to(self.device), target.to(self.device)
                output = self.model(data)
                test_loss += F.nll_loss(output, target).item()  # sum up batch loss
                pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability
                correct += pred.eq(target.view_as(pred)).sum().item()
        test_loss /= len(self.test_loader.dataset)
        print('\nTest set: Step: {}, Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            step, test_loss, correct, len(self.test_loader.dataset),
            100. * correct / len(self.test_loader.dataset)))


def train():
    ray.init(redis_address="IP_address_of_head_removed:6379")
    num_actors = 2
    train_actors = [MNISTTrainActor.remote(i)
                    for i in range(num_actors)]
    test_actor = MNISTTestActor.remote()
    weight_id = train_actors[0].get_weights.remote()
    step = 0
    acc_id = test_actor.accuracy.remote(weight_id, step)
    print("Starting training loop. Use Ctrl-C to exit.")
    try:
        while True:
            all_weights = ray.get([actor.run_train.remote(weight_id)
                                   for actor in train_actors])
            mean_weights = {k: (sum(weights[k] for weights in all_weights) /
                                num_actors)
                            for k in all_weights[0]}
            weight_id = ray.put(mean_weights)
            step += 10
            if step % 10 == 0:
                acc = ray.get(acc_id)
                acc_id = test_actor.accuracy.remote(weight_id, step)
    except KeyboardInterrupt:
        pass


if __name__ == "__main__":
    train()
Top GitHub Comments
@robertnishihara My guess is that the monitor (running on the node with redis?) is not receiving heartbeats from one or both nodes before the timeout. Is there a way to make the heartbeat timeout longer through a Python API or a configuration file? I can try that to find out whether it helps.
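If your Ray version exposes the internal config hook (I am not sure 0.6.0 does; later releases accept an _internal_config argument when Ray is started), a sketch like the following could raise num_heartbeats_timeout, the number of consecutive missed heartbeats after which the monitor marks a node dead. The value 300 here is an arbitrary example, and the override only takes effect where the head-node processes are launched.

import json
import ray

# Assumption: this Ray build accepts _internal_config (a JSON string of
# internal settings). Raising num_heartbeats_timeout makes the monitor more
# tolerant of slow or overloaded nodes. This has to run where the head-node
# processes (including the monitor) are started; calling
# ray.init(redis_address=...) from a driver that merely connects to an
# existing cluster would not change the monitor's configuration.
ray.init(
    num_cpus=4,
    _internal_config=json.dumps({"num_heartbeats_timeout": 300}),
)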
I'm wondering whether it has something to do with running with the maximum number of cores on the master node while also running TensorBoard; perhaps not enough CPU is left for the training itself, so the timeout occurs.
I'm going to try reducing the number of CPUs available to Ray on the master node, e.g. --num-cpus 40 instead of the maximum of 46.
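That is, something along these lines when starting Ray on the head node (flag syntax as in Ray 0.6; the redis port here is an assumption):

ray start --head --redis-port=6379 --num-cpus=40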