
dask.DataFrame.to_parquet fails with Category column

See original GitHub issue

It looks like a dask.DataFrame with a Category column fails in to_parquet. Specifically, it fails when writing the Category enumeration Series object. The weird part is that writing the pandas DataFrame directly with fastparquet works fine. It could be a fastparquet issue, but I'm reporting it to dask because it doesn't fail when using fastparquet directly.

Here's an example that reproduces the problem:

from dask import dataframe as dd
import fastparquet as fpq
import numpy as np
import pandas as pd

# DataFrame with a Category column 'c' and other columns.
a = pd.DataFrame(np.random.randn(10000, 2), columns=['a','b'])
a['c'] = pd.Series(np.random.choice(['B','S'], size=len(a)), dtype='category')

# write using fpq directly is OK
fpq.write('/tmp/ah', a, file_scheme='hive') 

# convert to dask.DataFrame and write parquet fail
da = dd.from_pandas(a, npartitions=4)
da.to_parquet('/tmp/da')

The data that it tries to write but fails on is pd.Series(a['c'].cat.categories):

0    B
1    S
dtype: object

By this point, all of the DataFrame data except the metadata-related pieces has been written successfully.
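That Series is built from the categories index, so it has no name, which matches the column reported as "None" in the ValueError at the bottom of the traceback. A quick check of my own (not part of the original report):

s = pd.Series(a['c'].cat.categories)
print(s.name)    # None -> shows up as column "None" in the error below
print(s.dtype)   # object
print(s.values)  # ['B' 'S']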

The versions of the packages I use: {'dask': u'0.16.0', 'fastparquet': u'0.1.3', 'pandas': u'0.21.1'}

Below is the full traceback:

ValueError                                Traceback (most recent call last)
<ipython-input-1-3bab8d0bab0a> in <module>()
     17 # convert to dask.DataFrame and write parquet fail
     18 da = dd.from_pandas(a, npartitions=4)
---> 19 da.to_parquet('/tmp/da')

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/dataframe/core.pyc in to_parquet(self, path, *args, **kwargs)
    983         """ See dd.to_parquet docstring for more information """
    984         from .io import to_parquet
--> 985         return to_parquet(self, path, *args, **kwargs)
    986 
    987     def to_csv(self, filename, **kwargs):

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/dataframe/io/parquet.pyc in to_parquet(df, path, engine, compression, write_index, append, ignore_divisions, partition_on, storage_options, compute, **kwargs)
    616 
    617     if compute:
--> 618         out.compute()
    619         return None
    620     return out

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/base.pyc in compute(self, **kwargs)
    133         dask.base.compute
    134         """
--> 135         (result,) = compute(self, traverse=False, **kwargs)
    136         return result
    137 

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/base.pyc in compute(*args, **kwargs)
    331     postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
    332                     else (None, a) for a in args]
--> 333     results = get(dsk, keys, **kwargs)
    334     results_iter = iter(results)
    335     return tuple(a if f is None else f(next(results_iter), *a)

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/threaded.pyc in get(dsk, result, cache, num_workers, **kwargs)
     73     results = get_async(pool.apply_async, len(pool._pool), dsk, result,
     74                         cache=cache, get_id=_thread_get_id,
---> 75                         pack_exception=pack_exception, **kwargs)
     76 
     77     # Cleanup pools associated to dead threads

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/local.pyc in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
    519                     _execute_task(task, data)  # Re-execute locally
    520                 else:
--> 521                     raise_exception(exc, tb)
    522             res, worker_id = loads(res_info)
    523             state['cache'][key] = res

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/local.pyc in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
    288     try:
    289         task, data = loads(task_info)
--> 290         result = _execute_task(task, data)
    291         id = get_id()
    292         result = dumps((result, id))

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/local.pyc in _execute_task(arg, cache, dsk)
    269         func, args = arg[0], arg[1:]
    270         args2 = [_execute_task(a, cache) for a in args]
--> 271         return func(*args2)
    272     elif not ishashable(arg):
    273         return arg

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/dask/dataframe/io/parquet.pyc in _write_partition_fastparquet(df, fs, path, filename, fmd, compression, partition_on)
    171     with fs.open(fs.sep.join([path, filename]), 'wb') as fil:
    172         rgs = make_part_file(fil, df, fmd.schema, compression=compression,
--> 173                              fmd=fmd)
    174     return rgs
    175 

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/fastparquet/writer.pyc in make_part_file(f, data, schema, compression, fmd)
    604     with f as f:
    605         f.write(MARKER)
--> 606         rg = make_row_group(f, data, schema, compression=compression)
    607         if fmd is None:
    608             fmd = parquet_thrift.FileMetaData(num_rows=len(data),

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/fastparquet/writer.pyc in make_row_group(f, data, schema, compression)
    592             comp = compression
    593         chunk = write_column(f, data[column.name], column,
--> 594                              compression=comp)
    595         rg.columns.append(chunk)
    596     rg.total_byte_size = sum([c.meta_data.total_uncompressed_size for c in

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/fastparquet/writer.pyc in write_column(f, data, selement, compression)
    460                                            num_values=len(data.cat.categories),
    461                                            encoding=parquet_thrift.Encoding.PLAIN)
--> 462         bdata = encode['PLAIN'](pd.Series(data.cat.categories), selement)
    463         bdata += 8 * b'\x00'
    464         l0 = len(bdata)

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/fastparquet/writer.pyc in encode_plain(data, se)
    243 def encode_plain(data, se):
    244     """PLAIN encoding; returns byte representation"""
--> 245     out = convert(data, se)
    246     if se.type == parquet_thrift.Type.BYTE_ARRAY:
    247         return pack_byte_array(list(out))

/mnt/beegfs/xwang/app/anaconda2/lib/python2.7/site-packages/fastparquet/writer.pyc in convert(data, se)
    188         raise ValueError('Error converting column "%s" to bytes using '
    189                          'encoding %s. Original error: '
--> 190                          '%s' % (data.name, ct, e))
    191     elif converted_type == parquet_thrift.ConvertedType.TIMESTAMP_MICROS:
    192         out = np.empty(len(data), 'int64')

ValueError: Error converting column "None" to bytes using encoding UTF8. Original error: bad argument type for built-in operation
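One possible way to sidestep the failure while the bug exists is to drop the categorical dtype just before writing; this is an untested sketch of mine (the output path is hypothetical), not something suggested in the issue:

# cast the categorical column back to plain object/str so fastparquet
# never has to serialize the category enumeration through dask
da_obj = dd.from_pandas(a, npartitions=4)
da_obj = da_obj.assign(c=da_obj['c'].astype('object'))
da_obj.to_parquet('/tmp/da_object')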

Issue Analytics

  • State: closed
  • Created: 6 years ago
  • Comments: 8 (7 by maintainers)

Top GitHub Comments

3 reactions
rraadd88 commented, Jun 20, 2019

I tried pd.read_csv(path, low_memory=False) when reading the table, and then there was no issue saving in parquet format.
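In other words, something along these lines, where path is just a placeholder for the CSV being read:

df = pd.read_csv(path, low_memory=False)  # let pandas infer consistent dtypes over the whole file
ddf = dd.from_pandas(df, npartitions=4)
ddf.to_parquet('/tmp/out')                # hypothetical output path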

0 reactions
martindurant commented, Jun 20, 2019

This issue seems to be well stale. @rraadd88 , if you think you have a new issue, please post again.
