Training failing on versions 0.0.14 and 0.0.15
Hi, I am testing model training on the new versions of the repo, and I have some trouble with 0.0.14 and 0.0.15. On 0.0.14 the model always returns NaN on the forward pass, and version 0.0.15 leads to a CUDA error:
RuntimeError: CUDA error: CUBLAS_STATUS_INTERNAL_ERROR when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`
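For reference, a minimal sketch of the kind of forward pass that hits this (the constructor arguments follow the README and are illustrative; exact argument names may differ between the 0.0.x versions):

import torch
from sinkhorn_transformer import SinkhornTransformerLM

# hypothetical small config, just to illustrate the call that fails
model = SinkhornTransformerLM(
    num_tokens = 256,
    dim = 512,
    depth = 4,
    heads = 4,
    max_seq_len = 1024
).cuda()

x = torch.randint(0, 256, (1, 1024)).cuda()
out = model(x)                      # on 0.0.14 the output comes back as NaN
print(torch.isnan(out).any())       # on 0.0.15 the forward call itself raises the cuBLAS error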
Full error listing:
<ipython-input-7-1329da5363de> in forward(self, inputs, labels)
7 def forward(self, inputs, labels=None):
8 loss_mx = labels != -100
----> 9 output = self.model(inputs)
10 output = output[loss_mx].view(-1, tokenizer.vocab_size)
11 labels = labels[loss_mx].view(-1)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/sinkhorn_transformer/sinkhorn_transformer.py in forward(self, x, input_mask)
376 x = self.to_token_emb(x)
377 x = self.pos_emb(torch.arange(t, device=device)) + x
--> 378 x = self.sinkhorn_transformer(x)
379 return self.to_logits(x)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/sinkhorn_transformer/sinkhorn_transformer.py in forward(self, x, input_mask)
359
360 def forward(self, x, input_mask = None):
--> 361 return self.layers(x)
362
363 class SinkhornTransformerLM(nn.Module):
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/sinkhorn_transformer/sinkhorn_transformer.py in forward(self, x, **kwargs)
330 def forward(self, x, **kwargs):
331 x = torch.cat([x, x], dim=-1)
--> 332 x = self.layers(x, **kwargs)
333 return torch.stack(x.chunk(2, dim=-1)).sum(dim=0)
334
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/sinkhorn_transformer/reversible.py in forward(self, x, arg_route, **kwargs)
128 block_kwargs = {'f_args': f_args, 'g_args': g_args}
129
--> 130 return _ReversibleFunction.apply(x, blocks, block_kwargs)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/sinkhorn_transformer/reversible.py in forward(ctx, x, blocks, kwargs)
98 ctx.kwargs = kwargs
99 for block in blocks:
--> 100 x = block(x, **kwargs)
101 ctx.y = x.detach()
102 ctx.blocks = blocks
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/sinkhorn_transformer/reversible.py in forward(self, x, f_args, g_args)
51 with torch.no_grad():
52 y1 = x1 + self.f(x2, record_rng=self.training, **f_args)
---> 53 y2 = x2 + self.g(y1, record_rng=self.training, **g_args)
54
55 return torch.cat([y1, y2], dim=2)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/sinkhorn_transformer/reversible.py in forward(self, record_rng, set_rng, *args, **kwargs)
25
26 if not set_rng:
---> 27 return self.net(*args, **kwargs)
28
29 rng_devices = []
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/sinkhorn_transformer/sinkhorn_transformer.py in forward(self, x)
91 def forward(self, x):
92 chunks = x.chunk(self.chunks, dim = self.dim)
---> 93 return torch.cat([self.fn(c) for c in chunks], dim = self.dim)
94
95 class FeedForward(nn.Module):
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/sinkhorn_transformer/sinkhorn_transformer.py in <listcomp>(.0)
91 def forward(self, x):
92 chunks = x.chunk(self.chunks, dim = self.dim)
---> 93 return torch.cat([self.fn(c) for c in chunks], dim = self.dim)
94
95 class FeedForward(nn.Module):
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/sinkhorn_transformer/sinkhorn_transformer.py in forward(self, x, **kwargs)
112 def forward(self, x, **kwargs):
113 x = self.norm(x)
--> 114 return self.fn(x, **kwargs)
115
116 class SortNet(nn.Module):
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/sinkhorn_transformer/sinkhorn_transformer.py in forward(self, x)
103
104 def forward(self, x):
--> 105 return self.net(x)
106
107 class PreNorm(nn.Module):
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/torch/nn/modules/container.py in forward(self, input)
98 def forward(self, input):
99 for module in self:
--> 100 input = module(input)
101 return input
102
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/torch/nn/modules/linear.py in forward(self, input)
85
86 def forward(self, input):
---> 87 return F.linear(input, self.weight, self.bias)
88
89 def extra_repr(self):
/opt/anaconda/envs/torch-nigtly-reformer/lib/python3.6/site-packages/torch/nn/functional.py in linear(input, weight, bias)
1591 ret = torch.addmm(bias, input, weight.t())
1592 else:
-> 1593 output = input.matmul(weight.t())
1594 if bias is not None:
1595 output += bias
RuntimeError: CUDA error: CUBLAS_STATUS_INTERNAL_ERROR when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`
Also, version 0.0.11 (and all other versions from 0.0.8 onward) works stably.
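A debugging aside, not part of the original report: CUBLAS_STATUS_INTERNAL_ERROR is usually a symptom of an earlier asynchronous failure, so a few rough ways to narrow it down (reusing the hypothetical model and x from the sketch above) are:

# 1) rerun with synchronous kernel launches so the traceback points at the op that actually fails:
#      CUDA_LAUNCH_BLOCKING=1 python train.py
# 2) run the same batch on CPU, where bad shapes/values surface as readable Python exceptions:
out = model.cpu()(x.cpu())
print(torch.isnan(out).any())        # True on 0.0.14 if the NaN is not CUDA-specific
# 3) pin a known-good release as the immediate workaround:
#      pip install sinkhorn_transformer==0.0.11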
Issue Analytics
- Created 3 years ago
- Comments: 30 (28 by maintainers)
Top GitHub Comments
You can find the minimal code example here. Unfortunately, I can only share a full-fledged script later, when it is deep night in the USA.
No, just PyTorch/XLA. Training runs on version 0.0.8 with minimal code changes and without any problems.
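For reference, a rough sketch of what training with PyTorch/XLA looks like here, assuming the standard torch_xla single-device API (the actual training script is not shown in the issue, and the model config is the hypothetical one from the sketch above):

import torch
import torch_xla.core.xla_model as xm
from sinkhorn_transformer import SinkhornTransformerLM

device = xm.xla_device()                                  # TPU core exposed as an XLA device
model = SinkhornTransformerLM(num_tokens = 256, dim = 512, depth = 4,
                              heads = 4, max_seq_len = 1024).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-4)

x = torch.randint(0, 256, (1, 1024), device = device)
loss = model(x).sum()                                     # placeholder loss for the sketch
loss.backward()
xm.optimizer_step(optimizer, barrier = True)              # optimizer step plus XLA step marker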