
read_csv skiprows parameter bug (possible)

See original GitHub issue

Here is the code:

# import pandas as pd
import modin.pandas as pd
from distributed import Client

z = 1
print(f"{z }")

if __name__ == '__main__':
    if pd.__name__ == 'modin.pandas':
        client = Client(n_workers=3)

    us_columns_dtype = {
        'date/time': 'string',
        'sku': 'string',
        'description': 'string',
        'type': 'string',
        'settlement id': 'string',
        'order id': 'string',
        'quantity': 'Int32',
        'product sales': "float64",
        'fulfillment': str,
    }

    skiprows = 0
    file_path = r"D:\_\OD\DR_US_2021Jun6-2022Jun5CustomUnifiedTransaction.csv"
    separator = ','
    en_cod_ = 'utf-8'
    columns_dtype = us_columns_dtype.copy()

    readed_into_df = pd.read_csv(
        str(file_path),
        sep=separator,
        encoding=en_cod_,
        thousands=",",
        skiprows=skiprows,
        on_bad_lines="skip",
        usecols=columns_dtype.keys(),
        dtype=columns_dtype,
    )

Using the skiprows parameter causes the following error listing:


1
1
1
1
2022-06-07 14:45:15,373 - distributed.worker - WARNING - Compute Failed
Key:       parse-41bece4b074ebd6bc2c6f8a110fd29b1
Function:  parse
args:      ()
kwargs:    {'sep': ',', 'delimiter': None, 'header': 'infer', 'names': Index(['date/time', 'settlement id', 'type', 'order id', 'sku', 'description',
       'quantity', 'fulfillment', 'product sales'],
      dtype='object'), 'index_col': None, 'usecols': dict_keys(['date/time', 'sku', 'description', 'type', 'settlement id', 'order id', 'quantity', 'product sales', 
'fulfillment']), 'prefix': <no_default>, 'mangle_dupe_cols': True, 'dtype': {'date/time': 'string', 'sku': 'string', 'description': 'string', 'type': 'string', 'settlement id': 'string', 'order id': 'string', 'quantity': 'Int32', 'product sales': 'float64', 'fulfillment': <class 'str'>}, 'engine': None, 'converters': None, 'true_values': None, 'false_values': None, 'skipinitialspace': False, 'skiprows': None, 'nrows': None, 'na_values': None, 'keep_default_na': True, 'na_filter': True, 'verbose': False, 'skip_blank_lines': True, 'parse_dates': False, 'infer_datetime_format': False, 'keep_date_col': False, 'date_parser': None, 'dayfirst'
Exception: 'ValueError("could not convert string to float: \'Standard Orders\'")'

Traceback (most recent call last):
  File "d:\OD\OneDrive\Projects\Chud_Amaz\Soft_in_dev\moduled_way_OOP\modin_test_bug_report.py", line 30, in <module>
    readed_into_df = pd.read_csv(
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\modin\pandas\io.py", line 135, in read_csv
    return _read(**kwargs)
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\modin\pandas\io.py", line 60, in _read
    pd_obj = FactoryDispatcher.read_csv(**kwargs)
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\modin\data_management\factories\dispatcher.py", line 178, in read_csv
    return cls.__factory._read_csv(**kwargs)
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\modin\data_management\factories\factories.py", line 206, in _read_csv
    return cls.io_cls.read_csv(**kwargs)
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\modin\engines\base\io\file_dispatcher.py", line 68, in read
    query_compiler = cls._read(*args, **kwargs)
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\modin\engines\base\io\text\csv_dispatcher.py", line 157, in _read
    new_query_compiler = cls._get_new_qc(
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\modin\engines\base\io\text\csv_dispatcher.py", line 300, in _get_new_qc
    new_index, row_lengths = cls._define_index(index_ids, index_name)
2022-06-07 14:45:15,440 - distributed.worker - WARNING - Compute Failed
Key:       parse-12a66d09f9162c480b1d2f6c748fc0ec
Function:  parse
args:      ()
kwargs:    {'sep': ',', 'delimiter': None, 'header': 'infer', 'names': Index(['date/time', 'settlement id', 'type', 'order id', 'sku', 'description',
       'quantity', 'fulfillment', 'product sales'],
      dtype='object'), 'index_col': None, 'usecols': dict_keys(['date/time', 'sku', 'description', 'type', 'settlement id', 'order id', 'quantity', 'product sales', 
'fulfillment']), 'prefix': <no_default>, 'mangle_dupe_cols': True, 'dtype': {'date/time': 'string', 'sku': 'string', 'description': 'string', 'type': 'string', 'settlement id': 'string', 'order id': 'string', 'quantity': 'Int32', 'product sales': 'float64', 'fulfillment': <class 'str'>}, 'engine': None, 'converters': None, 'true_values': None, 'false_values': None, 'skipinitialspace': False, 'skiprows': None, 'nrows': None, 'na_values': None, 'keep_default_na': True, 'na_filter': True, 'verbose': False, 'skip_blank_lines': True, 'parse_dates': False, 'infer_datetime_format': False, 'keep_date_col': False, 'date_parser': None, 'dayfirst'
Exception: 'ValueError("could not convert string to float: \'Standard Orders\'")'

2022-06-07 14:45:15,424 - distributed.worker - WARNING - Compute Failed
Key:       parse-9d26a0a1270fdd144b14b9d766dcb368
Function:  parse
args:      ()
kwargs:    {'sep': ',', 'delimiter': None, 'header': 'infer', 'names': Index(['date/time', 'settlement id', 'type', 'order id', 'sku', 'description',
       'quantity', 'fulfillment', 'product sales'],
      dtype='object'), 'index_col': None, 'usecols': dict_keys(['date/time', 'sku', 'description', 'type', 'settlement id', 'order id', 'quantity', 'product sales', 
'fulfillment']), 'prefix': <no_default>, 'mangle_dupe_cols': True, 'dtype': {'date/time': 'string', 'sku': 'string', 'description': 'string', 'type': 'string', 'settlement id': 'string', 'order id': 'string', 'quantity': 'Int32', 'product sales': 'float64', 'fulfillment': <class 'str'>}, 'engine': None, 'converters': None, 'true_values': None, 'false_values': None, 'skipinitialspace': False, 'skiprows': None, 'nrows': None, 'na_values': None, 'keep_default_na': True, 'na_filter': True, 'verbose': False, 'skip_blank_lines': True, 'parse_dates': False, 'infer_datetime_format': False, 'keep_date_col': False, 'date_parser': None, 'dayfirst'
Exception: 'ValueError("could not convert string to float: \'Standard Orders\'")'


  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\modin\engines\base\io\text\csv_dispatcher.py", line 243, in _define_index
    index_objs = cls.materialize(index_ids)
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\modin\engines\dask\task_wrapper.py", line 64, in materialize
    return client.gather(future)
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\distributed\client.py", line 2178, in gather
    return self.sync(
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\distributed\utils.py", line 318, in sync
    return sync(
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\distributed\utils.py", line 385, in sync
    raise exc.with_traceback(tb)
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\distributed\utils.py", line 358, in f
    result = yield future
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\tornado\gen.py", line 762, in run
    value = future.result()
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\distributed\client.py", line 2041, in _gather
    raise exception.with_traceback(traceback)
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\modin\backends\pandas\parsers.py", line 272, in parse
    pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\pandas\util\_decorators.py", line 311, in wrapper
    return func(*args, **kwargs)
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\pandas\io\parsers\readers.py", line 586, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\pandas\io\parsers\readers.py", line 488, in _read
    return parser.read(nrows)
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\pandas\io\parsers\readers.py", line 1047, in read
    index, columns, col_dict = self._engine.read(nrows)
  File "D:\Games\Anaconda\envs\AnazModin\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py", line 223, in read
    chunks = self._reader.read_low_memory(nrows)
  File "pandas\_libs\parsers.pyx", line 801, in pandas._libs.parsers.TextReader.read_low_memory
  File "pandas\_libs\parsers.pyx", line 880, in pandas._libs.parsers.TextReader._read_rows
  File "pandas\_libs\parsers.pyx", line 1026, in pandas._libs.parsers.TextReader._convert_column_data
  File "pandas\_libs\parsers.pyx", line 1101, in pandas._libs.parsers.TextReader._convert_tokens
ValueError: could not convert string to float: 'Standard Orders'

If I just comment out the line with skiprows=skiprows, everything works. The same code with pure pandas works properly.

Environment:

  • Dask version: unknown
  • pandas version: 1.3.4

  • Python version: 3.9.12
  • Operating System: Windows10
  • Install method: conda

It seems the skiprows=skiprows parameter is processed the wrong way, because in pure pandas everything works.
But some files do work properly…

I have tried reading my file without the dtype=columns_dtype parameter, and the code finished without any messages…

I was trying to migrate my working pandas project to modin[dask], so please help me avoid this issue.

In short:

read_csv() works with the skiprows parameter or with the dtype parameter on its own; using these parameters together produces a crash.
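Until the bug is fixed, a possible workaround (a sketch, not an official recommendation; plain pandas is used here for illustration, with modin.pandas intended as the drop-in swap, and the inline CSV data is hypothetical) is to pass only one of skiprows/dtype to read_csv and recover the other step afterwards:

```python
import io
import pandas as pd  # with modin installed, `import modin.pandas as pd` is the drop-in swap

# Tiny inline sample standing in for the real CSV (hypothetical data).
csv_text = (
    "sku,product sales\n"
    "A-1,29.49\n"
    "A-2,149.99\n"
    "A-3,39.99\n"
)

dtypes = {"sku": "string", "product sales": "float64"}

# Workaround A: drop skiprows from read_csv and slice leading data rows afterwards.
# Note: skiprows counts raw file lines (header included), so this is only
# equivalent when the rows being skipped are data rows below the header.
df_a = pd.read_csv(io.StringIO(csv_text), dtype=dtypes)
df_a = df_a.iloc[1:].reset_index(drop=True)  # acts like skipping one data row

# Workaround B: drop dtype from read_csv and cast the columns afterwards.
df_b = pd.read_csv(io.StringIO(csv_text), skiprows=0)
df_b = df_b.astype(dtypes)
```

Workaround B matches the reporter's observation that dropping dtype lets the read succeed; astype then restores the intended column types.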


This issue is a copy from my Dask issue. They recommended that I repost my question here because they think:

It looks like you are running into an issue with modin.pandas. afaik, modin is re-implementing the pandas API so you’re likely hitting a bug in their implementation.

And indeed, when I used import dask.dataframe as pd, the error disappeared. So it looks like a real modin argument-passing error. Can anybody help? Or do I have to switch from modin to dask?

DR_US_2021Jun6-2022Jun5CustomUnifiedTransaction.csv

Issue Analytics

  • State:closed
  • Created a year ago
  • Comments:5 (3 by maintainers)

Top GitHub Comments

4 reactions
prutskov commented, Jun 8, 2022

Hello @VasilijKolomiets! Thank you for submitting the issue!

I downsized your reproducer a bit.

# import pandas as pd
import modin.pandas as pd

import modin.config as cfg
cfg.Engine.put("dask")


if __name__ == '__main__':
    us_columns_dtype = {
        'product sales': str,  # to see what is actually being read
        # 'product sales': "float64",
    }

    skiprows = 0
    file_path = "DR_US_2021Jun6-2022Jun5CustomUnifiedTransaction.csv"
    separator = ','
    en_cod_ = 'utf-8'
    columns_dtype = us_columns_dtype.copy()

    readed_into_df = pd.read_csv(
        str(file_path),
        sep=separator,
        encoding=en_cod_,
        thousands=",",
        skiprows=skiprows,
        on_bad_lines="skip",
        usecols=columns_dtype.keys(),
        dtype=columns_dtype,
    )
    print(readed_into_df)
Outputs:

Output for modin.pandas:

(modin) prutskov@prutskovPC:~/projects/modin$ python test2.py
UserWarning: Dask execution environment not yet initialized. Initializing...
To remove this warning, run the following python code before doing dataframe operations:

    from distributed import Client

    client = Client()

                     product sales
0       Jun 6, 2021 1:29:07 AM PDT
1       Jun 6, 2021 1:29:07 AM PDT
2       Jun 6, 2021 1:29:43 AM PDT
3       Jun 6, 2021 1:43:17 AM PDT
4       Jun 6, 2021 2:14:52 AM PDT
...                            ...
31753  Jun 5, 2022 10:38:01 PM PDT
31754  Jun 5, 2022 11:04:58 PM PDT
31755  Jun 5, 2022 11:42:56 PM PDT
31756  Jun 5, 2022 11:44:37 PM PDT
31757  Jun 5, 2022 11:54:52 PM PDT

[31758 rows x 1 columns]

Output for pandas:

(modin) prutskov@prutskovPC:~/projects/modin$ python test2.py
      product sales
0             29.49
1             29.49
2            149.99
3             39.99
4             58.98
...             ...
31753         33.99
31754             0
31755         29.99
31756             0
31757         59.99

[31758 rows x 1 columns]

Commenting out the skiprows parameter makes the output match pandas. The issue needs further investigation.

One additional problem is discovered in the case of skiprows=[]:

Outputs 2:

Output for modin.pandas:

(modin) prutskov@prutskovPC:~/projects/modin$ python test2.py
UserWarning: Dask execution environment not yet initialized. Initializing...
To remove this warning, run the following python code before doing dataframe operations:

    from distributed import Client

    client = Client()

Traceback (most recent call last):
  File "test2.py", line 20, in <module>
    readed_into_df = pd.read_csv(
  File "/home/prutskov/projects/modin/modin/logging/logger_function.py", line 65, in run_and_log
    return f(*args, **kwargs)
  File "/home/prutskov/projects/modin/modin/pandas/io.py", line 140, in read_csv
    return _read(**kwargs)
  File "/home/prutskov/projects/modin/modin/pandas/io.py", line 61, in _read
    pd_obj = FactoryDispatcher.read_csv(**kwargs)
  File "/home/prutskov/projects/modin/modin/core/execution/dispatching/factories/dispatcher.py", line 185, in read_csv
    return cls.__factory._read_csv(**kwargs)
  File "/home/prutskov/projects/modin/modin/core/execution/dispatching/factories/factories.py", line 217, in _read_csv
    return cls.io_cls.read_csv(**kwargs)
  File "/home/prutskov/projects/modin/modin/logging/logger_function.py", line 65, in run_and_log
    return f(*args, **kwargs)
  File "/home/prutskov/projects/modin/modin/core/io/file_dispatcher.py", line 153, in read
    query_compiler = cls._read(*args, **kwargs)
  File "/home/prutskov/projects/modin/modin/logging/logger_function.py", line 65, in run_and_log
    return f(*args, **kwargs)
  File "/home/prutskov/projects/modin/modin/core/io/text/text_file_dispatcher.py", line 1004, in _read
    ) = cls._manage_skiprows_parameter(skiprows, header_size)
  File "/home/prutskov/projects/modin/modin/logging/logger_function.py", line 65, in run_and_log
    return f(*args, **kwargs)
  File "/home/prutskov/projects/modin/modin/core/io/text/text_file_dispatcher.py", line 800, in _manage_skiprows_parameter
    skiprows_md[0] - header_size if skiprows_md[0] > header_size else 0
IndexError: index 0 is out of bounds for axis 0 with size 0

Output for pandas:

(modin) prutskov@prutskovPC:~/projects/modin$ python test2.py
      product sales
0             29.49
1             29.49
2            149.99
3             39.99
4             58.98
...             ...
31753         33.99
31754             0
31755         29.99
31756             0
31757         59.99

[31758 rows x 1 columns]

The root cause of the second issue: https://github.com/modin-project/modin/blob/dcee13d57ebf9a006460deedb734c15791acae7a/modin/core/io/text/text_file_dispatcher.py#L795-L803
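The IndexError in the second reproducer comes from unconditionally indexing the first element of an empty skiprows array. The guard below is a hypothetical illustration of the failure mode, not the actual modin patch; the function name and signature are made up for this sketch:

```python
import numpy as np

def leading_skip_offset(skiprows_md, header_size):
    """Return how many data rows to skip after the header.

    `skiprows_md` is a (possibly empty) sorted array of row indices
    to skip. Indexing skiprows_md[0] unconditionally, as the code at
    the linked location does, raises IndexError when skiprows=[]
    produces an empty array.
    """
    if len(skiprows_md) == 0:
        return 0  # nothing to skip; avoids IndexError on empty input
    return skiprows_md[0] - header_size if skiprows_md[0] > header_size else 0

# An empty skiprows list no longer crashes:
print(leading_skip_offset(np.array([], dtype=int), header_size=1))     # 0
print(leading_skip_offset(np.array([3, 4], dtype=int), header_size=1)) # 2
```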

Both issues reproduce on both the Ray and Dask engines. We will start working on this!

2 reactions
prutskov commented, Jun 8, 2022

So now I just have to reinstall modin in my conda environment and everything will be fine? Or do I have to wait?

The fix is in progress right now in #4544. We will notify you when the fix is merged into the master branch. After that, you will just need to install modin from the master branch.
