GPU Memory and Evaluator shape mismatch
So this should probably be two separate problems, but anyway, here they are:
My first problem is that StatisticsGen takes ~5700 MiB of my 6077 MiB GPU when I run it through InteractiveContext, and doesn't release the memory until I restart the notebook. I'm using CUDA V10.1.243, and the problem is independent of the Python or TFX version.
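This is most likely not a StatisticsGen leak as such, but TensorFlow's default allocator, which reserves nearly all free GPU memory for the process as soon as the GPU is first touched and never returns it until the process exits. A minimal workaround sketch, assuming the memory is being claimed by the TensorFlow runtime inside the notebook kernel (run it before any component executes):

```python
import tensorflow as tf

# Allocate GPU memory on demand instead of reserving almost all of it
# up front. This must run before any op touches the GPU, e.g. in the
# first cell of the notebook.
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)
```

Setting the environment variable `TF_FORCE_GPU_ALLOW_GROWTH=true` before importing TensorFlow has the same effect. Neither option releases memory that has already been allocated, so a kernel restart is still needed once it has been grabbed.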
The second problem is with an image classifier in TFX. When I run the Evaluator I get a shape mismatch from the metrics, whether I use CategoricalAccuracy, SparseCategoricalAccuracy, or no metric at all in the eval_config. My guess is that the Evaluator is falling back to Keras's generic accuracy backend by default, but I don't know why.
Here's the error:

```
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<timed eval> in <module>
~/.local/lib/python3.8/site-packages/tfx/orchestration/experimental/interactive/interactive_context.py in run_if_ipython(*args, **kwargs)
65 # __IPYTHON__ variable is set by IPython, see
66 # https://ipython.org/ipython-doc/rel-0.10.2/html/interactive/reference.html#embedding-ipython.
---> 67 return fn(*args, **kwargs)
68 else:
69 absl.logging.warning(
~/.local/lib/python3.8/site-packages/tfx/orchestration/experimental/interactive/interactive_context.py in run(self, component, enable_cache, beam_pipeline_args)
180 telemetry_utils.LABEL_TFX_RUNNER: runner_label,
181 }):
--> 182 execution_id = launcher.launch().execution_id
183
184 return execution_result.ExecutionResult(
~/.local/lib/python3.8/site-packages/tfx/orchestration/launcher/base_component_launcher.py in launch(self)
200 absl.logging.info('Running executor for %s',
201 self._component_info.component_id)
--> 202 self._run_executor(execution_decision.execution_id,
203 execution_decision.input_dict,
204 execution_decision.output_dict,
~/.local/lib/python3.8/site-packages/tfx/orchestration/launcher/in_process_component_launcher.py in _run_executor(self, execution_id, input_dict, output_dict, exec_properties)
65 executor_context) # type: ignore
66
---> 67 executor.Do(input_dict, output_dict, exec_properties)
~/.local/lib/python3.8/site-packages/tfx/components/evaluator/executor.py in Do(self, input_dict, output_dict, exec_properties)
250 tensor_adapter_config=tensor_adapter_config)
251
--> 252 (examples_list | 'FlattenExamples' >> beam.Flatten()
253 |
254 'ExtractEvaluateAndWriteResults' >> tfma.ExtractEvaluateAndWriteResults(
~/.local/lib/python3.8/site-packages/apache_beam/pipeline.py in __exit__(self, exc_type, exc_val, exc_tb)
566 try:
567 if not exc_type:
--> 568 self.result = self.run()
569 self.result.wait_until_finish()
570 finally:
~/.local/lib/python3.8/site-packages/apache_beam/pipeline.py in run(self, test_runner_api)
545 finally:
546 shutil.rmtree(tmpdir)
--> 547 return self.runner.run_pipeline(self, self._options)
548 finally:
549 shutil.rmtree(self.local_tempdir, ignore_errors=True)
~/.local/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py in run_pipeline(self, pipeline, options)
173 options.view_as(pipeline_options.ProfilingOptions))
174
--> 175 self._latest_run_result = self.run_via_runner_api(
176 pipeline.to_runner_api(default_environment=self._default_environment))
177 return self._latest_run_result
~/.local/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py in run_via_runner_api(self, pipeline_proto)
184 # TODO(pabloem, BEAM-7514): Create a watermark manager (that has access to
185 # the teststream (if any), and all the stages).
--> 186 return self.run_stages(stage_context, stages)
187
188 @contextlib.contextmanager
~/.local/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py in run_stages(self, stage_context, stages)
342 runner_execution_context, stage, self._num_workers)
343
--> 344 stage_results = self._run_stage(
345 runner_execution_context,
346 bundle_context_manager,
~/.local/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py in _run_stage(self, runner_execution_context, bundle_context_manager)
525
526 while True:
--> 527 last_result, deferred_inputs, fired_timers = self._run_bundle(
528 runner_execution_context,
529 bundle_context_manager,
~/.local/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py in _run_bundle(self, runner_execution_context, bundle_context_manager, data_input, data_output, input_timers, expected_timer_output, bundle_manager)
569 expected_timer_output)
570
--> 571 result, splits = bundle_manager.process_bundle(
572 data_input, data_output, input_timers, expected_timer_output)
573 # Now we collect all the deferred inputs remaining from bundle execution.
~/.local/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/fn_runner.py in process_bundle(self, inputs, expected_outputs, fired_timers, expected_output_timers, dry_run)
850 process_bundle_descriptor.id,
851 cache_tokens=[next(self._cache_token_generator)]))
--> 852 result_future = self._worker_handler.control_conn.push(process_bundle_req)
853
854 split_results = [] # type: List[beam_fn_api_pb2.ProcessBundleSplitResponse]
~/.local/lib/python3.8/site-packages/apache_beam/runners/portability/fn_api_runner/worker_handlers.py in push(self, request)
351 self._uid_counter += 1
352 request.instruction_id = 'control_%s' % self._uid_counter
--> 353 response = self.worker.do_instruction(request)
354 return ControlFuture(request.instruction_id, response)
355
~/.local/lib/python3.8/site-packages/apache_beam/runners/worker/sdk_worker.py in do_instruction(self, request)
481 if request_type:
482 # E.g. if register is set, this will call self.register(request.register))
--> 483 return getattr(self, request_type)(
484 getattr(request, request_type), request.instruction_id)
485 else:
~/.local/lib/python3.8/site-packages/apache_beam/runners/worker/sdk_worker.py in process_bundle(self, request, instruction_id)
517 with self.maybe_profile(instruction_id):
518 delayed_applications, requests_finalization = (
--> 519 bundle_processor.process_bundle(instruction_id))
520 monitoring_infos = bundle_processor.monitoring_infos()
521 monitoring_infos.extend(self.state_cache_metrics_fn())
~/.local/lib/python3.8/site-packages/apache_beam/runners/worker/bundle_processor.py in process_bundle(self, instruction_id)
988 for op in self.ops.values():
989 _LOGGER.debug('finish %s', op)
--> 990 op.finish()
991
992 # Close every timer output stream
~/.local/lib/python3.8/site-packages/apache_beam/runners/worker/operations.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.worker.operations.PGBKCVOperation.finish()
~/.local/lib/python3.8/site-packages/apache_beam/runners/worker/operations.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.worker.operations.PGBKCVOperation.finish()
~/.local/lib/python3.8/site-packages/apache_beam/runners/worker/operations.cpython-38-x86_64-linux-gnu.so in apache_beam.runners.worker.operations.PGBKCVOperation.output_key()
/usr/local/lib/python3.8/dist-packages/tensorflow_model_analysis/evaluators/metrics_and_plots_evaluator_v2.py in compact(self, accumulator)
347 def compact(self, accumulator: Any) -> Any:
348 self._num_compacts.inc(1)
--> 349 return super(_ComputationsCombineFn, self).compact(accumulator)
350
351 def extract_output(self, accumulator: Any) -> metric_types.MetricsDict:
~/.local/lib/python3.8/site-packages/apache_beam/transforms/combiners.py in compact(self, accumulator)
743
744 def compact(self, accumulator):
--> 745 return [c.compact(a) for c, a in zip(self._combiners, accumulator)]
746
747 def extract_output(self, accumulator):
~/.local/lib/python3.8/site-packages/apache_beam/transforms/combiners.py in <listcomp>(.0)
743
744 def compact(self, accumulator):
--> 745 return [c.compact(a) for c, a in zip(self._combiners, accumulator)]
746
747 def extract_output(self, accumulator):
/usr/local/lib/python3.8/dist-packages/tensorflow_model_analysis/metrics/tf_metric_wrapper.py in compact(self, accumulator)
586 self, accumulator: _CompilableMetricsAccumulator
587 ) -> _CompilableMetricsAccumulator:
--> 588 self._process_batch(accumulator)
589 return accumulator
590
/usr/local/lib/python3.8/dist-packages/tensorflow_model_analysis/metrics/tf_metric_wrapper.py in _process_batch(self, accumulator)
546 for metric_index, metric in enumerate(self._metrics[output_name]):
547 metric.reset_states()
--> 548 metric.update_state(*inputs)
549 accumulator.add_weights(output_index, metric_index,
550 metric.get_weights())
~/.local/lib/python3.8/site-packages/tensorflow/python/keras/utils/metrics_utils.py in decorated(metric_obj, *args, **kwargs)
88
89 with tf_utils.graph_context_for_symbolic_tensors(*args, **kwargs):
---> 90 update_op = update_state_fn(*args, **kwargs)
91 if update_op is not None: # update_op will be None in eager execution.
92 metric_obj.add_update(update_op)
~/.local/lib/python3.8/site-packages/tensorflow/python/keras/metrics.py in update_state_fn(*args, **kwargs)
174 control_status = ag_ctx.control_status_ctx()
175 ag_update_state = autograph.tf_convert(obj_update_state, control_status)
--> 176 return ag_update_state(*args, **kwargs)
177 else:
178 if isinstance(obj.update_state, def_function.Function):
~/.local/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py in wrapper(*args, **kwargs)
253 try:
254 with conversion_ctx:
--> 255 return converted_call(f, args, kwargs, options=options)
256 except Exception as e: # pylint:disable=broad-except
257 if hasattr(e, 'ag_error_metadata'):
~/.local/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py in converted_call(f, args, kwargs, caller_fn_scope, options)
455 if conversion.is_in_whitelist_cache(f, options):
456 logging.log(2, 'Whitelisted %s: from cache', f)
--> 457 return _call_unconverted(f, args, kwargs, options, False)
458
459 if ag_ctx.control_status_ctx().status == ag_ctx.Status.DISABLED:
~/.local/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py in _call_unconverted(f, args, kwargs, options, update_cache)
337
338 if kwargs is not None:
--> 339 return f(*args, **kwargs)
340 return f(*args)
341
~/.local/lib/python3.8/site-packages/tensorflow/python/keras/metrics.py in update_state(self, y_true, y_pred, sample_weight)
610
611 ag_fn = autograph.tf_convert(self._fn, ag_ctx.control_status_ctx())
--> 612 matches = ag_fn(y_true, y_pred, **self._fn_kwargs)
613 return super(MeanMetricWrapper, self).update_state(
614 matches, sample_weight=sample_weight)
~/.local/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py in wrapper(*args, **kwargs)
253 try:
254 with conversion_ctx:
--> 255 return converted_call(f, args, kwargs, options=options)
256 except Exception as e: # pylint:disable=broad-except
257 if hasattr(e, 'ag_error_metadata'):
~/.local/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py in converted_call(f, args, kwargs, caller_fn_scope, options)
530
531 if not options.user_requested and conversion.is_whitelisted(f):
--> 532 return _call_unconverted(f, args, kwargs, options)
533
534 # internal_convert_user_code is for example turned off when issuing a dynamic
~/.local/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py in _call_unconverted(f, args, kwargs, options, update_cache)
337
338 if kwargs is not None:
--> 339 return f(*args, **kwargs)
340 return f(*args)
341
~/.local/lib/python3.8/site-packages/tensorflow/python/keras/metrics.py in accuracy(y_true, y_pred)
3206 metrics_utils.ragged_assert_compatible_and_get_flat_values(
3207 [y_pred, y_true])
-> 3208 y_pred.shape.assert_is_compatible_with(y_true.shape)
3209 if y_true.dtype != y_pred.dtype:
3210 y_pred = math_ops.cast(y_pred, y_true.dtype)
~/.local/lib/python3.8/site-packages/tensorflow/python/framework/tensor_shape.py in assert_is_compatible_with(self, other)
1132 """
1133 if not self.is_compatible_with(other):
-> 1134 raise ValueError("Shapes %s and %s are incompatible" % (self, other))
1135
1136 def most_specific_compatible_shape(self, other):
ValueError: Shapes (8, 6) and (8, 1) are incompatible
```
(My fully connected output layer has 6 categories.)
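For reference, the traceback bottoms out in Keras's generic exact-match `accuracy` function, which asserts that labels and predictions have identical shapes. A minimal sketch reproducing the same error outside TFX, with tensors made up to match the shapes in the traceback:

```python
import tensorflow as tf

y_true = tf.zeros([8, 1])  # integer class labels, shape (8, 1)
y_pred = tf.zeros([8, 6])  # softmax over 6 classes, shape (8, 6)

m = tf.keras.metrics.Accuracy()  # exact-match accuracy
# m.update_state(y_true, y_pred)  # ValueError: Shapes (8, 6) and (8, 1) are incompatible

# SparseCategoricalAccuracy is the shape-compatible choice here: it
# compares argmax(y_pred) against the integer labels.
m = tf.keras.metrics.SparseCategoricalAccuracy()
m.update_state(tf.constant([[2]]), tf.constant([[0.1, 0.1, 0.6, 0.1, 0.05, 0.05]]))
print(m.result().numpy())  # 1.0
```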
Thanks!
Top GitHub Comments
For the second question: when you compile your model, the metric is set to 'accuracy', so the metric in the eval_config should match it. Here is an example:
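A minimal sketch of such an eval_config, assuming a Keras model with an integer label column named 'label' (the label key and metric choice here are illustrative, not from the original comment):

```python
import tensorflow_model_analysis as tfma

eval_config = tfma.EvalConfig(
    model_specs=[tfma.ModelSpec(label_key='label')],  # 'label' is a placeholder key
    slicing_specs=[tfma.SlicingSpec()],               # overall (unsliced) metrics
    metrics_specs=[
        tfma.MetricsSpec(metrics=[
            # For integer labels of shape (batch, 1) and a 6-way softmax
            # output, SparseCategoricalAccuracy avoids the
            # (8, 6) vs (8, 1) shape mismatch.
            tfma.MetricConfig(class_name='SparseCategoricalAccuracy'),
        ])
    ],
)
```

Spelling out a concrete metric class avoids the ambiguity of the string 'accuracy', which Keras resolves to different implementations depending on the label and prediction shapes.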