Bug report: constant column in a Spark DataFrame
Hi PandasProfiling Team,
I am testing your spark-branch and I encountered a bug with a Spark DataFrame holding constant columns. The full stack trace is here:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/IPython/core/formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/profile_report.py in _repr_html_(self)
422 def _repr_html_(self) -> None:
423 """The ipython notebook widgets user interface gets called by the jupyter notebook."""
--> 424 self.to_notebook_iframe()
425
426 def __repr__(self) -> str:
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/profile_report.py in to_notebook_iframe(self)
402 with warnings.catch_warnings():
403 warnings.simplefilter("ignore")
--> 404 display(get_notebook_iframe(self.config, self))
405
406 def to_widgets(self) -> None:
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/report/presentation/flavours/widget/notebook.py in get_notebook_iframe(config, profile)
73 output = get_notebook_iframe_src(config, profile)
74 elif attribute == IframeAttribute.srcdoc:
---> 75 output = get_notebook_iframe_srcdoc(config, profile)
76 else:
77 raise ValueError(
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/report/presentation/flavours/widget/notebook.py in get_notebook_iframe_srcdoc(config, profile)
27 width = config.notebook.iframe.width
28 height = config.notebook.iframe.height
---> 29 src = html.escape(profile.to_html())
30
31 iframe = f'<iframe width="{width}" height="{height}" srcdoc="{src}" frameborder="0" allowfullscreen></iframe>'
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/profile_report.py in to_html(self)
372
373 """
--> 374 return self.html
375
376 def to_json(self) -> str:
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/profile_report.py in html(self)
185 def html(self) -> str:
186 if self._html is None:
--> 187 self._html = self._render_html()
188 return self._html
189
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/profile_report.py in _render_html(self)
287 from pandas_profiling.report.presentation.flavours import HTMLReport
288
--> 289 report = self.report
290
291 with tqdm(
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/profile_report.py in report(self)
179 def report(self) -> Root:
180 if self._report is None:
--> 181 self._report = get_report_structure(self.config, self.description_set)
182 return self._report
183
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/profile_report.py in description_set(self)
161 def description_set(self) -> Dict[str, Any]:
162 if self._description_set is None:
--> 163 self._description_set = describe_df(
164 self.config,
165 self.df,
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/model/describe.py in describe(config, df, summarizer, typeset, sample)
177 table_stats = table_stats.dict()
178
--> 179 alerts = progress(get_alerts, pbar, "Get alerts")(
180 config, table_stats, series_description, correlations
181 )
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/utils/progress_bar.py in inner(*args, **kwargs)
9 def inner(*args, **kwargs) -> Any:
10 bar.set_postfix_str(message)
---> 11 ret = fn(*args, **kwargs)
12 bar.update()
13 return ret
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/model/alerts.py in get_alerts(config, table_stats, series_description, correlations)
319 alerts = check_table_alerts(table_stats)
320 for col, description in series_description.items():
--> 321 alerts += check_variable_alerts(config, col, description)
322 alerts += check_correlation_alerts(config, correlations)
323 alerts.sort(key=lambda alert: str(alert.alert_type))
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/model/alerts.py in check_variable_alerts(config, col, description)
287 alerts += categorical_alerts(config, description)
288 if description["type"] == "Numeric":
--> 289 alerts += numeric_alerts(config, description)
290
291 for idx in range(len(alerts)):
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/model/alerts.py in numeric_alerts(config, summary)
137
138 # Skewness
--> 139 if skewness_alert(summary["skewness"], config.vars.num.skewness_threshold):
140 alerts.append(
141 Alert(
~/Code/spark-utils/pysparq/.env/lib/python3.9/site-packages/pandas_profiling/model/alerts.py in skewness_alert(v, threshold)
330
331 def skewness_alert(v: float, threshold: int) -> bool:
--> 332 return not np.isnan(v) and (v < (-1 * threshold) or v > threshold)
333
334
TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
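The failing check is skewness_alert, which passes the column's skewness straight to np.isnan. For a constant column, Spark's skewness aggregate evaluates to null, which plausibly reaches the summary as Python None rather than float("nan"), and np.isnan(None) raises exactly this TypeError. A minimal sketch of the failure plus a hypothetical None-tolerant guard (assuming None really is what summary["skewness"] holds for a constant column; the fix actually merged may differ):

import numpy as np

# Spark's skewness aggregate yields null for a zero-variance (constant)
# column; in Python that arrives as None, not float("nan").
v = None  # plausible value of summary["skewness"] for a constant column

try:
    np.isnan(v)
except TypeError as exc:
    print(exc)  # "ufunc 'isnan' not supported for the input types, ..."

# Hypothetical defensive variant of skewness_alert that tolerates None:
def skewness_alert(v, threshold):
    return v is not None and not np.isnan(v) and abs(v) > threshold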
To reproduce:
import pandas as pd
from pandas_profiling import ProfileReport

# assumes an active SparkSession bound to `spark` (e.g. in a notebook)
df_fake = spark.createDataFrame(data=pd.DataFrame(data={"colA": [1, 1, 1, 1], "colB": [51, 2, 3, 4]}))
a = ProfileReport(df_fake, minimal=True)
a
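In the meantime, a stopgap that seems to sidestep the crash is to drop constant columns before profiling. A sketch, assuming df_fake and the active SparkSession from above (the countDistinct pass costs one extra Spark job):

from pyspark.sql import functions as F

# Keep only columns with more than one distinct value, so zero-variance
# columns never reach the numeric alert checks.
distinct_counts = df_fake.select(
    [F.countDistinct(F.col(c)).alias(c) for c in df_fake.columns]
).first().asDict()
non_constant = [c for c, n in distinct_counts.items() if n > 1]
a = ProfileReport(df_fake.select(non_constant), minimal=True)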
I’m using Python 3.9.9 on an M1 Mac.
Hope you’ll find a way to deal with constant columns.
François
Hi @chanedwin. It worked in a minimal virtualenv, but my main project is not OK yet. I will try running the update there too! I will keep you updated.
Closing the issue as it was resolved by #903.