Exception when converting int64 column to categorical then outputting to parquet
See original GitHub issue.
Reproduction code
import dask.dataframe as ddf
import pandas as pd
import fastparquet

pdf = pd.DataFrame({'A': [1, 1, 12, 3, 4]})
df = ddf.from_pandas(pdf, npartitions=1)
df.A = df.A.astype('category')       # categories are unknown to dask at this point

out = df.compute()
fastparquet.write('test.parq', out)  # writing the computed pandas frame works
df.to_parquet('test')                # raises TypeError (traceback below)
Exception
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-12-2ecc8ae72890> in <module>()
----> 1 df.to_parquet('test')
~/miniconda3/lib/python3.6/site-packages/dask/dataframe/core.py in to_parquet(self, path, *args, **kwargs)
1008 """ See dd.to_parquet docstring for more information """
1009 from .io import to_parquet
-> 1010 return to_parquet(self, path, *args, **kwargs)
1011
1012 def to_csv(self, filename, **kwargs):
~/miniconda3/lib/python3.6/site-packages/dask/dataframe/io/parquet.py in to_parquet(df, path, engine, compression, write_index, append, ignore_divisions, partition_on, storage_options, compute, **kwargs)
634
635 if compute:
--> 636 out.compute()
637 return None
638 return out
~/miniconda3/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
133 dask.base.compute
134 """
--> 135 (result,) = compute(self, traverse=False, **kwargs)
136 return result
137
~/miniconda3/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
331 postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
332 else (None, a) for a in args]
--> 333 results = get(dsk, keys, **kwargs)
334 results_iter = iter(results)
335 return tuple(a if f is None else f(next(results_iter), *a)
~/miniconda3/lib/python3.6/site-packages/dask/threaded.py in get(dsk, result, cache, num_workers, **kwargs)
73 results = get_async(pool.apply_async, len(pool._pool), dsk, result,
74 cache=cache, get_id=_thread_get_id,
---> 75 pack_exception=pack_exception, **kwargs)
76
77 # Cleanup pools associated to dead threads
~/miniconda3/lib/python3.6/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
519 _execute_task(task, data) # Re-execute locally
520 else:
--> 521 raise_exception(exc, tb)
522 res, worker_id = loads(res_info)
523 state['cache'][key] = res
~/miniconda3/lib/python3.6/site-packages/dask/compatibility.py in reraise(exc, tb)
58 if exc.__traceback__ is not tb:
59 raise exc.with_traceback(tb)
---> 60 raise exc
61
62 else:
~/miniconda3/lib/python3.6/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
288 try:
289 task, data = loads(task_info)
--> 290 result = _execute_task(task, data)
291 id = get_id()
292 result = dumps((result, id))
~/miniconda3/lib/python3.6/site-packages/dask/local.py in _execute_task(arg, cache, dsk)
269 func, args = arg[0], arg[1:]
270 args2 = [_execute_task(a, cache) for a in args]
--> 271 return func(*args2)
272 elif not ishashable(arg):
273 return arg
~/miniconda3/lib/python3.6/site-packages/dask/dataframe/io/parquet.py in _write_partition_fastparquet(df, fs, path, filename, fmd, compression, partition_on)
170 with fs.open(fs.sep.join([path, filename]), 'wb') as fil:
171 rgs = make_part_file(fil, df, fmd.schema, compression=compression,
--> 172 fmd=fmd)
173 return rgs
174
~/miniconda3/lib/python3.6/site-packages/fastparquet/writer.py in make_part_file(f, data, schema, compression, fmd)
604 with f as f:
605 f.write(MARKER)
--> 606 rg = make_row_group(f, data, schema, compression=compression)
607 if fmd is None:
608 fmd = parquet_thrift.FileMetaData(num_rows=len(data),
~/miniconda3/lib/python3.6/site-packages/fastparquet/writer.py in make_row_group(f, data, schema, compression)
592 comp = compression
593 chunk = write_column(f, data[column.name], column,
--> 594 compression=comp)
595 rg.columns.append(chunk)
596 rg.total_byte_size = sum([c.meta_data.total_uncompressed_size for c in
~/miniconda3/lib/python3.6/site-packages/fastparquet/writer.py in write_column(f, data, selement, compression)
460 num_values=len(data.cat.categories),
461 encoding=parquet_thrift.Encoding.PLAIN)
--> 462 bdata = encode['PLAIN'](pd.Series(data.cat.categories), selement)
463 bdata += 8 * b'\x00'
464 l0 = len(bdata)
~/miniconda3/lib/python3.6/site-packages/fastparquet/writer.py in encode_plain(data, se)
245 out = convert(data, se)
246 if se.type == parquet_thrift.Type.BYTE_ARRAY:
--> 247 return pack_byte_array(list(out))
248 else:
249 return out.tobytes()
~/miniconda3/lib/python3.6/site-packages/fastparquet/speedups.pyx in fastparquet.speedups.pack_byte_array (fastparquet/speedups.c:2810)()
TypeError: expected list of bytes
The exception is raised inside fastparquet, but it does not appear to be a fastparquet bug: if you compute to a pandas DataFrame first and write it to parquet without going through dask, the write succeeds (see the sketch below).
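For comparison, a minimal sketch of that pandas-first path (the file name test_pandas.parq is arbitrary); it mirrors the fastparquet.write line in the reproduction and is expected to complete without error:

import pandas as pd
import fastparquet

# Build the same toy frame and make the column categorical in pandas,
# where the integer categories are known up front.
pdf = pd.DataFrame({'A': [1, 1, 12, 3, 4]})
pdf['A'] = pdf['A'].astype('category')

# Writing the pandas frame directly with fastparquet succeeds.
fastparquet.write('test_pandas.parq', pdf)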
Issue Analytics
- Created 6 years ago
- Comments: 5 (5 by maintainers)
Read more >Top Related Medium Post
No results found
Top Related StackOverflow Question
No results found
Troubleshoot Live Code
Lightrun enables developers to add logs, metrics and snapshots to live code - no restarts or redeploys required.
Start FreeTop Related Reddit Thread
No results found
Top Related Hackernoon Post
No results found
Top Related Tweet
No results found
Top Related Dev.to Post
No results found
Top Related Hashnode Post
No results found
Top GitHub Comments
I think we can close this since the behavior is understood and it seems like an extreme edge case.
Is there anything that needs to be done here? Close the issue / add an explanation note to the docs / do something else?
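For anyone hitting this today, a possible workaround sketch (my own reading of the behavior, not something discussed in the comments above) is to make the categories known on the dask side before writing, for example via the .cat.as_known() accessor, so that the dask metadata carries the real int64 categories rather than the object-dtype placeholder used for unknown categoricals:

import dask.dataframe as ddf
import pandas as pd

pdf = pd.DataFrame({'A': [1, 1, 12, 3, 4]})
df = ddf.from_pandas(pdf, npartitions=1)

# as_known() computes the actual categories (one extra pass over the data),
# replacing dask's placeholder categories in the column metadata.
df['A'] = df['A'].astype('category').cat.as_known()

df.to_parquet('test', engine='fastparquet')

Categorizing the column in pandas before calling from_pandas should have the same effect, since the categories are then known from the start.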