dask.DataFrame.to_parquet fails with Category column
It looks like a dask DataFrame with a Category column will fail in to_parquet. Specifically, it fails when writing the Series of category values (the Category enumeration). The odd part is that writing the pandas DataFrame directly with fastparquet works fine. It could be a fastparquet issue, but I am reporting it to dask because it does not fail when fastparquet is used directly.
Here is an example that reproduces the problem:
from dask import dataframe as dd
import fastparquet as fpq
import numpy as np
import pandas as pd
# DataFrame with a Category column 'c' and other columns.
a = pd.DataFrame(np.random.randn(10000, 2), columns=['a','b'])
a['c'] = pd.Series(np.random.choice(['B','S'], size=len(a)), dtype='category')
# write using fpq directly is OK
fpq.write('/tmp/ah', a, file_scheme='hive')
# convert to dask.DataFrame and write parquet fail
da = dd.from_pandas(a, npartitions=4)
da.to_parquet('/tmp/da')
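A quick way to double-check that the partition data itself is writable (a diagnostic sketch, not part of the original report) is to compute one partition and write it directly with fastparquet:

# If this direct write succeeds, the per-partition data is fine and the
# failure is more likely in how dask prepares the shared file metadata.
part0 = da.get_partition(0).compute()
fpq.write('/tmp/part0', part0, file_scheme='hive')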
The data it tries to write but fails on is pd.Series(a['c'].cat.categories):

0    B
1    S
dtype: object

By this point, all of the DataFrame data except the metadata-related parts has been written successfully.
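Incidentally, a Series built from the categories index has no name, which appears to be why the final error below reports column "None":

cats = pd.Series(a['c'].cat.categories)
print(cats)       # 0    B
                  # 1    S
                  # dtype: object
print(cats.name)  # None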
The versions of the packages I use: {'dask': u'0.16.0', 'fastparquet': u'0.1.3', 'pandas': u'0.21.1'}
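For reference, these can be gathered with something like:

import dask, fastparquet, pandas
print({'dask': dask.__version__,
       'fastparquet': fastparquet.__version__,
       'pandas': pandas.__version__})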
Below is the full traceback:
ValueError                                Traceback (most recent call last)
<ipython-input-1-3bab8d0bab0a> in <module>()
     17 # convert to dask.DataFrame and write parquet fail
     18 da = dd.from_pandas(a, npartitions=4)
---> 19 da.to_parquet('/tmp/da')

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/dataframe/core.pyc in to_parquet(self, path, *args, **kwargs)
    983         """ See dd.to_parquet docstring for more information """
    984         from .io import to_parquet
--> 985         return to_parquet(self, path, *args, **kwargs)
    986
    987     def to_csv(self, filename, **kwargs):

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/dataframe/io/parquet.pyc in to_parquet(df, path, engine, compression, write_index, append, ignore_divisions, partition_on, storage_options, compute, **kwargs)
    616
    617     if compute:
--> 618         out.compute()
    619         return None
    620     return out

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/base.pyc in compute(self, **kwargs)
    133         dask.base.compute
    134         """
--> 135         (result,) = compute(self, traverse=False, **kwargs)
    136         return result
    137

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/base.pyc in compute(*args, **kwargs)
    331     postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
    332                     else (None, a) for a in args]
---> 333     results = get(dsk, keys, **kwargs)
    334     results_iter = iter(results)
    335     return tuple(a if f is None else f(next(results_iter), *a)

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/threaded.pyc in get(dsk, result, cache, num_workers, **kwargs)
     73     results = get_async(pool.apply_async, len(pool._pool), dsk, result,
     74                         cache=cache, get_id=_thread_get_id,
---> 75                         pack_exception=pack_exception, **kwargs)
     76
     77     # Cleanup pools associated to dead threads

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/local.pyc in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
    519                     _execute_task(task, data)  # Re-execute locally
    520                 else:
--> 521                     raise_exception(exc, tb)
    522             res, worker_id = loads(res_info)
    523             state['cache'][key] = res

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/local.pyc in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
    288     try:
    289         task, data = loads(task_info)
--> 290         result = _execute_task(task, data)
    291         id = get_id()
    292         result = dumps((result, id))

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/local.pyc in _execute_task(arg, cache, dsk)
    269         func, args = arg[0], arg[1:]
    270         args2 = [_execute_task(a, cache) for a in args]
--> 271         return func(*args2)
    272     elif not ishashable(arg):
    273         return arg

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/dataframe/io/parquet.pyc in _write_partition_fastparquet(df, fs, path, filename, fmd, compression, partition_on)
    171     with fs.open(fs.sep.join([path, filename]), 'wb') as fil:
    172         rgs = make_part_file(fil, df, fmd.schema, compression=compression,
--> 173                              fmd=fmd)
    174     return rgs
    175

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/fastparquet/writer.pyc in make_part_file(f, data, schema, compression, fmd)
    604     with f as f:
    605         f.write(MARKER)
--> 606         rg = make_row_group(f, data, schema, compression=compression)
    607         if fmd is None:
    608             fmd = parquet_thrift.FileMetaData(num_rows=len(data),

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/fastparquet/writer.pyc in make_row_group(f, data, schema, compression)
    592             comp = compression
    593         chunk = write_column(f, data[column.name], column,
--> 594                              compression=comp)
    595         rg.columns.append(chunk)
    596     rg.total_byte_size = sum([c.meta_data.total_uncompressed_size for c in

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/fastparquet/writer.pyc in write_column(f, data, selement, compression)
    460                 num_values=len(data.cat.categories),
    461                 encoding=parquet_thrift.Encoding.PLAIN)
--> 462         bdata = encode['PLAIN'](pd.Series(data.cat.categories), selement)
    463         bdata += 8 * b'\x00'
    464         l0 = len(bdata)

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/fastparquet/writer.pyc in encode_plain(data, se)
    243 def encode_plain(data, se):
    244     """PLAIN encoding; returns byte representation"""
--> 245     out = convert(data, se)
    246     if se.type == parquet_thrift.Type.BYTE_ARRAY:
    247         return pack_byte_array(list(out))

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/fastparquet/writer.pyc in convert(data, se)
    188         raise ValueError('Error converting column "%s" to bytes using '
    189                          'encoding %s. Original error: '
--> 190                          '%s' % (data.name, ct, e))
    191     elif converted_type == parquet_thrift.ConvertedType.TIMESTAMP_MICROS:
    192         out = np.empty(len(data), 'int64')

ValueError: Error converting column "None" to bytes using encoding UTF8. Original error: bad argument type for built-in operation
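If keeping the categorical dtype in the written file is not essential, one possible workaround (a sketch only, not verified against these exact package versions) is to cast the column back to object before writing:

# Hypothetical workaround: drop the categorical dtype before writing.
# '/tmp/da_obj' is just an illustrative output path.
da_obj = da.assign(c=da['c'].astype('object'))
da_obj.to_parquet('/tmp/da_obj')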
Issue Analytics
- Created: 6 years ago
- Comments: 8 (7 by maintainers)
Top GitHub Comments
I tried

pd.read_csv(path, low_memory=False)

while reading the table, and then there was no issue when saving in parquet format (see the sketch after these comments).

This issue seems to be well stale. @rraadd88, if you think you have a new issue, please post again.
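A minimal sketch of the workaround described in the first comment ('table.csv' and 'table.parquet' are hypothetical paths; low_memory=False makes pandas infer each column's dtype from the whole file rather than chunk by chunk, so columns do not end up with mixed types):

import pandas as pd
from dask import dataframe as dd

# Read the whole CSV with full-file dtype inference, then write to parquet.
pdf = pd.read_csv('table.csv', low_memory=False)
dd.from_pandas(pdf, npartitions=4).to_parquet('table.parquet')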