Crash while optimizing: RuntimeError: cholesky_cpu: U(1,1) is zero, singular U.
I got this recently while trying to tune the hyperparameters of an MLP.
Relevant versions:
python==3.7.1
ax-platform==0.1.2
botorch==0.1.0
gpytorch==0.3.2
scipy==1.1.0
torch==1.1.0
I’m using ax.optimize() as the entry point, roughly as sketched below. It was 45 trials into the experiment when it crashed; the full stack trace follows the sketch.
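A hedged sketch of the ax.optimize() call: the parameter names, bounds, and total_trials below are illustrative assumptions, since the traceback only shows evaluation_function=do_train and minimize=True.

from ax.service.managed_loop import optimize  # the managed_loop entry point shown in the trace

best_parameters, values, experiment, model = optimize(
    parameters=[
        # Hypothetical search space; the real one is not shown in the issue.
        {"name": "lr", "type": "range", "bounds": [1e-5, 1e-1], "log_scale": True},
        {"name": "hidden_size", "type": "range", "bounds": [32, 512]},
    ],
    evaluation_function=do_train,  # user-defined training/evaluation function returning the objective
    minimize=True,
    total_trials=100,  # assumed; the crash occurred during trial 45
)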
ax.service.managed_loop: Running optimization trial 45...
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-37-5c288f8ea2dd> in <module>
76 ],
77 evaluation_function=do_train,
---> 78 minimize=True,
79 )
~/anaconda3/lib/python3.7/site-packages/ax/service/managed_loop.py in optimize(parameters, evaluation_function, experiment_name, objective_name, minimize, parameter_constraints, outcome_constraints, total_trials, arms_per_trial, wait_time)
204 wait_time=wait_time,
205 )
--> 206 loop.full_run()
207 parameterization, values = loop.get_best_point()
208 return parameterization, values, loop.experiment, loop.get_current_model()
~/anaconda3/lib/python3.7/site-packages/ax/service/managed_loop.py in full_run(self)
148 logger.info(f"Started full optimization with {num_steps} steps.")
149 for _ in range(num_steps):
--> 150 self.run_trial()
151 return self
152
~/anaconda3/lib/python3.7/site-packages/ax/service/managed_loop.py in run_trial(self)
128 trial = self.experiment.new_trial(
129 generator_run=self.generation_strategy.gen(
--> 130 experiment=self.experiment, new_data=dat
131 )
132 )
~/anaconda3/lib/python3.7/site-packages/ax/modelbridge/generation_strategy.py in gen(self, experiment, new_data, n, **kwargs)
161 elif new_data is not None:
162 # We're sticking with the current model, but update with new data
--> 163 self._model.update(experiment=experiment, data=new_data)
164
165 gen_run = not_none(self._model).gen(n=n, **(self._curr.model_gen_kwargs or {}))
~/anaconda3/lib/python3.7/site-packages/ax/modelbridge/base.py in update(self, data, experiment)
385 obs_feats = t.transform_observation_features(obs_feats)
386 obs_data = t.transform_observation_data(obs_data, obs_feats)
--> 387 self._update(observation_features=obs_feats, observation_data=obs_data)
388 self.fit_time += time.time() - t_update_start
389 self.fit_time_since_gen += time.time() - t_update_start
~/anaconda3/lib/python3.7/site-packages/ax/modelbridge/array.py in _update(self, observation_features, observation_data)
110 # Update in-design status for these new points.
111 self.training_in_design[-len(observation_features) :] = in_design
--> 112 self._model_update(Xs=Xs_array, Ys=Ys_array, Yvars=Yvars_array)
113
114 def _model_update(
~/anaconda3/lib/python3.7/site-packages/ax/modelbridge/torch.py in _model_update(self, Xs, Ys, Yvars)
113 Ys: List[Tensor] = self._array_list_to_tensors(Ys)
114 Yvars: List[Tensor] = self._array_list_to_tensors(Yvars)
--> 115 self.model.update(Xs=Xs, Ys=Ys, Yvars=Yvars)
116
117 def _model_predict(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
~/anaconda3/lib/python3.7/site-packages/ax/models/torch/botorch.py in update(self, Xs, Ys, Yvars)
372 Yvars=self.Yvars,
373 task_features=self.task_features,
--> 374 state_dict=state_dict,
375 )
~/anaconda3/lib/python3.7/site-packages/ax/models/torch/botorch_defaults.py in get_and_fit_model(Xs, Ys, Yvars, task_features, state_dict, **kwargs)
84 # pyre-ignore: [16]
85 mll = ExactMarginalLogLikelihood(model.likelihood, model)
---> 86 mll = fit_gpytorch_model(mll, bounds=bounds)
87 else:
88 model.load_state_dict(state_dict)
~/anaconda3/lib/python3.7/site-packages/botorch/fit.py in fit_gpytorch_model(mll, optimizer, **kwargs)
33 """
34 mll.train()
---> 35 mll, _ = optimizer(mll, track_iterations=False, **kwargs)
36 mll.eval()
37 return mll
~/anaconda3/lib/python3.7/site-packages/botorch/optim/fit.py in fit_gpytorch_scipy(mll, bounds, method, options, track_iterations)
186 jac=True,
187 options=options,
--> 188 callback=cb,
189 )
190 iterations = []
~/anaconda3/lib/python3.7/site-packages/scipy/optimize/_minimize.py in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
601 elif meth == 'l-bfgs-b':
602 return _minimize_lbfgsb(fun, x0, args, jac, bounds,
--> 603 callback=callback, **options)
604 elif meth == 'tnc':
605 return _minimize_tnc(fun, x0, args, jac, bounds, callback=callback,
~/anaconda3/lib/python3.7/site-packages/scipy/optimize/lbfgsb.py in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, **unknown_options)
333 # until the completion of the current minimization iteration.
334 # Overwrite f and g:
--> 335 f, g = func_and_grad(x)
336 elif task_str.startswith(b'NEW_X'):
337 # new iteration
~/anaconda3/lib/python3.7/site-packages/scipy/optimize/lbfgsb.py in func_and_grad(x)
283 else:
284 def func_and_grad(x):
--> 285 f = fun(x, *args)
286 g = jac(x, *args)
287 return f, g
~/anaconda3/lib/python3.7/site-packages/scipy/optimize/optimize.py in function_wrapper(*wrapper_args)
291 def function_wrapper(*wrapper_args):
292 ncalls[0] += 1
--> 293 return function(*(wrapper_args + args))
294
295 return ncalls, function_wrapper
~/anaconda3/lib/python3.7/site-packages/scipy/optimize/optimize.py in __call__(self, x, *args)
61 def __call__(self, x, *args):
62 self.x = numpy.asarray(x).copy()
---> 63 fg = self.fun(x, *args)
64 self.jac = fg[1]
65 return fg[0]
~/anaconda3/lib/python3.7/site-packages/botorch/optim/fit.py in _scipy_objective_and_grad(x, mll, property_dict)
221 output = mll.model(*train_inputs)
222 args = [output, train_targets] + _get_extra_mll_args(mll)
--> 223 loss = -mll(*args).sum()
224 loss.backward()
225 param_dict = OrderedDict(mll.named_parameters())
~/anaconda3/lib/python3.7/site-packages/gpytorch/module.py in __call__(self, *inputs, **kwargs)
20
21 def __call__(self, *inputs, **kwargs):
---> 22 outputs = self.forward(*inputs, **kwargs)
23 if isinstance(outputs, list):
24 return [_validate_module_outputs(output) for output in outputs]
~/anaconda3/lib/python3.7/site-packages/gpytorch/mlls/exact_marginal_log_likelihood.py in forward(self, output, target, *params)
26 # Get the log prob of the marginal distribution
27 output = self.likelihood(output, *params)
---> 28 res = output.log_prob(target)
29
30 # Add terms for SGPR / when inducing points are learned
~/anaconda3/lib/python3.7/site-packages/gpytorch/distributions/multivariate_normal.py in log_prob(self, value)
127
128 # Get log determininat and first part of quadratic form
--> 129 inv_quad, logdet = covar.inv_quad_logdet(inv_quad_rhs=diff.unsqueeze(-1), logdet=True)
130
131 res = -0.5 * sum([inv_quad, logdet, diff.size(-1) * math.log(2 * math.pi)])
~/anaconda3/lib/python3.7/site-packages/gpytorch/lazy/lazy_tensor.py in inv_quad_logdet(self, inv_quad_rhs, logdet, reduce_inv_quad)
990 from .chol_lazy_tensor import CholLazyTensor
991
--> 992 cholesky = CholLazyTensor(self.cholesky())
993 return cholesky.inv_quad_logdet(inv_quad_rhs=inv_quad_rhs, logdet=logdet, reduce_inv_quad=reduce_inv_quad)
994
~/anaconda3/lib/python3.7/site-packages/gpytorch/lazy/lazy_tensor.py in cholesky(self, upper)
716 (LazyTensor) Cholesky factor (lower triangular)
717 """
--> 718 res = self._cholesky()
719 if upper:
720 res = res.transpose(-1, -2)
~/anaconda3/lib/python3.7/site-packages/gpytorch/utils/memoize.py in g(self, *args, **kwargs)
32 cache_name = name if name is not None else method
33 if not is_in_cache(self, cache_name):
---> 34 add_to_cache(self, cache_name, method(self, *args, **kwargs))
35 return get_from_cache(self, cache_name)
36
~/anaconda3/lib/python3.7/site-packages/gpytorch/lazy/lazy_tensor.py in _cholesky(self)
401 evaluated_mat.register_hook(_ensure_symmetric_grad)
402
--> 403 cholesky = psd_safe_cholesky(evaluated_mat.double()).to(self.dtype)
404 return NonLazyTensor(cholesky)
405
~/anaconda3/lib/python3.7/site-packages/gpytorch/utils/cholesky.py in psd_safe_cholesky(A, upper, out, jitter)
45 continue
46
---> 47 raise e
48
49
~/anaconda3/lib/python3.7/site-packages/gpytorch/utils/cholesky.py in psd_safe_cholesky(A, upper, out, jitter)
19 """
20 try:
---> 21 L = torch.cholesky(A, upper=upper, out=out)
22 # TODO: Remove once fixed in pytorch (#16780)
23 if A.dim() > 2 and A.is_cuda:
RuntimeError: cholesky_cpu: U(1,1) is zero, singular U.
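For context on the failure mode: the last few frames above are gpytorch’s psd_safe_cholesky, which attempts a plain Cholesky factorization of the evaluated covariance matrix and, if that fails, retries a few times with increasing jitter added to the diagonal before re-raising. The following is only a sketch of that retry idea, not the actual gpytorch implementation (its jitter schedule and error handling may differ), and the singular example matrix is made up to show how a zero pivot produces a “singular U” error:

import torch

def cholesky_with_jitter(A, max_tries=3, base_jitter=1e-8):
    # Sketch of the retry-with-jitter idea behind psd_safe_cholesky
    # (not the real gpytorch code).
    jitter = 0.0
    for _ in range(max_tries + 1):
        try:
            eye = torch.eye(A.shape[-1], dtype=A.dtype)
            # torch.cholesky as in torch 1.1; newer releases use torch.linalg.cholesky
            return torch.cholesky(A + jitter * eye)
        except RuntimeError:
            jitter = base_jitter if jitter == 0.0 else jitter * 10
    raise RuntimeError("matrix is not positive definite, even with jitter")

# A rank-deficient covariance matrix triggers this kind of failure:
A = torch.tensor([[1.0, 2.0], [2.0, 4.0]], dtype=torch.double)  # rank 1
# torch.cholesky(A) hits a zero pivot and raises a "... singular U" RuntimeError,
# while the jittered retry succeeds:
L = cholesky_with_jitter(A)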
Top GitHub Comments
@Balandat it hasn’t happened again since I lowered the clamping value down to 1. Also, I’m no longer using that blackbox function, because there was a bug in its data pre-processing stage. That bug caused the good objective values to all be very close together, which is consistent with the theory that the scaling of the objectives was the problem. The timing of the error is still odd, though (the clamping kicked in at iteration 31, but the failure came at 45). Since the jittering is stochastic, maybe it’s just bad luck. We should configure the jitter warnings to show up in the logs: if they start appearing right after iteration 31 and just happen to exhaust the three retries at iteration 45, that would be a strong signal.
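One way to surface those warnings with only the standard library (a minimal sketch; it assumes gpytorch reports the added jitter through Python’s warnings module, so check the version in use, and adjust the handler/level to the existing logging setup):

import logging
import warnings

logging.basicConfig(level=logging.WARNING)
logging.captureWarnings(True)    # route warnings into the "py.warnings" logger
warnings.simplefilter("always")  # report every occurrence, not just the first per location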
@leopd, I took a closer look at this and wasn’t able to reproduce the fitting issues (though I can’t rule out that I messed up in trying to re-create the experiment state from the logs; I’ll ask some other folks to double-check this).
Is this issue reproducible on your end?
Here is the notebook that I used; let me know if it looks sane to you: debug_leos_fitting_issue.ipynb.txt
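For anyone attempting the same kind of repro, here is a rough sketch against the botorch 0.1.0 API that the traceback goes through; train_X and train_Y are placeholders standing in for the parameterizations and objective values recovered from the logs, and the search-space dimension is an assumption:

import torch
from botorch.models import SingleTaskGP
from botorch.fit import fit_gpytorch_model
from gpytorch.mlls import ExactMarginalLogLikelihood

# Placeholder data standing in for the 45 logged trials (6-d search space assumed).
train_X = torch.rand(45, 6, dtype=torch.double)
train_Y = torch.randn(45, dtype=torch.double)  # train_Y shape conventions differ across botorch versions

model = SingleTaskGP(train_X, train_Y)
mll = ExactMarginalLogLikelihood(model.likelihood, model)
fit_gpytorch_model(mll)  # if the logged state reproduces the problem, the Cholesky error surfaces here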
Aside: the argument I stated above doesn’t convince me anymore, since the clamping happens much earlier (iteration 31) than the failure in the model fitting. If it were the cause, one would expect the failure to occur as soon as the model gets refit on the outlier value.