Stuck on an issue?

Lightrun Answers was designed to reduce the constant googling that comes with debugging third-party libraries. It collects links to all the places you might look while hunting down a tough bug.

And, if you’re still stuck at the end, we’re happy to hop on a call to see how we can help out.

Error in `infer_feature_types` for seoul bike dataset

See original GitHub issue

Running `infer_feature_types` on the Seoul bike dataset errors out. Based on the stack trace, I think this might be related to the partial schema PR (#2774), but I haven’t had a chance to dig too deeply.

SeoulBikeData.csv

import pandas as pd
import woodwork as ww
data_set = pd.read_csv("Regression/SeoulBikeData.csv")
infer_feature_types(data_set)

Produces:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/core/tools/datetimes.py in _to_datetime_with_format(arg, orig_arg, name, tz, fmt, exact, errors, infer_datetime_format)
    508         try:
--> 509             values, tz = conversion.datetime_to_datetime64(arg)
    510             dta = DatetimeArray(values, dtype=tz_to_dtype(tz))

~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.datetime_to_datetime64()

TypeError: Unrecognized value type: <class 'str'>

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
~/Desktop/evalml_venv/lib/python3.7/site-packages/woodwork/logical_types.py in transform(self, series)
    212                 else:
--> 213                     series = pd.to_datetime(series, format=self.datetime_format)
    214             except (TypeError, ValueError):

~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/core/tools/datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, format, exact, unit, infer_datetime_format, origin, cache)
    882     elif isinstance(arg, ABCSeries):
--> 883         cache_array = _maybe_cache(arg, format, cache, convert_listlike)
    884         if not cache_array.empty:

~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/core/tools/datetimes.py in _maybe_cache(arg, format, cache, convert_listlike)
    194         if len(unique_dates) < len(arg):
--> 195             cache_dates = convert_listlike(unique_dates, format)
    196             cache_array = Series(cache_dates, index=unique_dates)

~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/core/tools/datetimes.py in _convert_listlike_datetimes(arg, format, name, tz, unit, errors, infer_datetime_format, dayfirst, yearfirst, exact)
    393         res = _to_datetime_with_format(
--> 394             arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format
    395         )

~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/core/tools/datetimes.py in _to_datetime_with_format(arg, orig_arg, name, tz, fmt, exact, errors, infer_datetime_format)
    512         except (ValueError, TypeError):
--> 513             raise err
    514 

~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/core/tools/datetimes.py in _to_datetime_with_format(arg, orig_arg, name, tz, fmt, exact, errors, infer_datetime_format)
    500         res = _array_strptime_with_fallback(
--> 501             arg, name, tz, fmt, exact, errors, infer_datetime_format
    502         )

~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/core/tools/datetimes.py in _array_strptime_with_fallback(arg, name, tz, fmt, exact, errors, infer_datetime_format)
    435     try:
--> 436         result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors)
    437         if "%Z" in fmt or "%z" in fmt:

~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/_libs/tslibs/strptime.pyx in pandas._libs.tslibs.strptime.array_strptime()

ValueError: time data '13/12/2017' does not match format '%m/%d/%Y' (match)

During handling of the above exception, another exception occurred:

TypeConversionError                       Traceback (most recent call last)
<ipython-input-17-0cd25bce426e> in <module>
      1 # X_train, X_holdout, y_train, y_holdout = evalml.preprocessing.split_data(X, y, problem_type='regression', test_size=0.25, random_seed=0)
----> 2 infer_feature_types(data_set)

~/Desktop/evalml/evalml/utils/woodwork_utils.py in infer_feature_types(data, feature_types)
    110     else:
    111         ww_data = data.copy()
--> 112         ww_data.ww.init(logical_types=feature_types)
    113         return convert_all_nan_unknown_to_double(ww_data)
    114 

~/Desktop/evalml_venv/lib/python3.7/site-packages/woodwork/table_accessor.py in init(self, **kwargs)
     96                 Any errors resulting from skipping validation with invalid inputs may not be easily understood.
     97         """
---> 98         self.init_with_partial_schema(**kwargs)
     99 
    100     def init_with_full_schema(self, schema: TableSchema, validate: bool = True, **kwargs) -> None:

~/Desktop/evalml_venv/lib/python3.7/site-packages/woodwork/table_accessor.py in init_with_partial_schema(self, schema, index, time_index, logical_types, already_sorted, name, semantic_tags, table_metadata, column_metadata, use_standard_tags, column_descriptions, column_origins, validate, **kwargs)
    203 
    204         # overwrite schema parameters with specified kwargs
--> 205         logical_types = _infer_missing_logical_types(self._dataframe, logical_types, existing_logical_types)
    206         column_descriptions = {**existing_col_descriptions, **(column_descriptions or {})}
    207         column_metadata = {**existing_col_metadata, **(column_metadata or {})}

~/Desktop/evalml_venv/lib/python3.7/site-packages/woodwork/table_accessor.py in _infer_missing_logical_types(dataframe, force_logical_types, existing_logical_types)
   1060         logical_type = force_logical_types.get(name) if name in force_logical_types else existing_logical_types.get(name)
   1061         parsed_logical_types[name] = _get_column_logical_type(series, logical_type, name)
-> 1062         updated_series = parsed_logical_types[name].transform(series)
   1063         if updated_series is not series:
   1064             dataframe[name] = updated_series

~/Desktop/evalml_venv/lib/python3.7/site-packages/woodwork/logical_types.py in transform(self, series)
    213                     series = pd.to_datetime(series, format=self.datetime_format)
    214             except (TypeError, ValueError):
--> 215                 raise TypeConversionError(series, new_dtype, type(self))
    216         return super().transform(series)
    217 

TypeConversionError: Error converting datatype for Date from type object to type datetime64[ns]. Please confirm the underlying data is consistent with logical type Datetime.

Issue Analytics

State:
Created 2 years ago
Comments: 9 (9 by maintainers)

Top GitHub Comments

2 reactions

thehomebrewnerd commented, Sep 22, 2021

Created Woodwork Issue 1152 to resolve this problem.

2 reactions

thehomebrewnerd commented, Sep 22, 2021

@freddyaboulton @angela97lin I’m pretty sure I know what is happening here. Let me dig a little deeper to confirm, and then I will see if there is a workaround or if we need to update Woodwork to resolve this.