Error in `infer_feature_types` for seoul bike dataset
See original GitHub issue
Running `infer_feature_types` on the Seoul bike dataset errors out. Based on the stack trace, I think this might be related to the partial schema PR (#2774), but I haven’t gotten a chance to dig too deeply.
import pandas as pd
import woodwork as ww
data_set = pd.read_csv("Regression/SeoulBikeData.csv")
infer_feature_types(data_set)
Produces:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/core/tools/datetimes.py in _to_datetime_with_format(arg, orig_arg, name, tz, fmt, exact, errors, infer_datetime_format)
508 try:
--> 509 values, tz = conversion.datetime_to_datetime64(arg)
510 dta = DatetimeArray(values, dtype=tz_to_dtype(tz))
~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.datetime_to_datetime64()
TypeError: Unrecognized value type: <class 'str'>
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
~/Desktop/evalml_venv/lib/python3.7/site-packages/woodwork/logical_types.py in transform(self, series)
212 else:
--> 213 series = pd.to_datetime(series, format=self.datetime_format)
214 except (TypeError, ValueError):
~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/core/tools/datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, format, exact, unit, infer_datetime_format, origin, cache)
882 elif isinstance(arg, ABCSeries):
--> 883 cache_array = _maybe_cache(arg, format, cache, convert_listlike)
884 if not cache_array.empty:
~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/core/tools/datetimes.py in _maybe_cache(arg, format, cache, convert_listlike)
194 if len(unique_dates) < len(arg):
--> 195 cache_dates = convert_listlike(unique_dates, format)
196 cache_array = Series(cache_dates, index=unique_dates)
~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/core/tools/datetimes.py in _convert_listlike_datetimes(arg, format, name, tz, unit, errors, infer_datetime_format, dayfirst, yearfirst, exact)
393 res = _to_datetime_with_format(
--> 394 arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format
395 )
~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/core/tools/datetimes.py in _to_datetime_with_format(arg, orig_arg, name, tz, fmt, exact, errors, infer_datetime_format)
512 except (ValueError, TypeError):
--> 513 raise err
514
~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/core/tools/datetimes.py in _to_datetime_with_format(arg, orig_arg, name, tz, fmt, exact, errors, infer_datetime_format)
500 res = _array_strptime_with_fallback(
--> 501 arg, name, tz, fmt, exact, errors, infer_datetime_format
502 )
~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/core/tools/datetimes.py in _array_strptime_with_fallback(arg, name, tz, fmt, exact, errors, infer_datetime_format)
435 try:
--> 436 result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors)
437 if "%Z" in fmt or "%z" in fmt:
~/Desktop/evalml_venv/lib/python3.7/site-packages/pandas/_libs/tslibs/strptime.pyx in pandas._libs.tslibs.strptime.array_strptime()
ValueError: time data '13/12/2017' does not match format '%m/%d/%Y' (match)
During handling of the above exception, another exception occurred:
TypeConversionError Traceback (most recent call last)
<ipython-input-17-0cd25bce426e> in <module>
1 # X_train, X_holdout, y_train, y_holdout = evalml.preprocessing.split_data(X, y, problem_type='regression', test_size=0.25, random_seed=0)
----> 2 infer_feature_types(data_set)
~/Desktop/evalml/evalml/utils/woodwork_utils.py in infer_feature_types(data, feature_types)
110 else:
111 ww_data = data.copy()
--> 112 ww_data.ww.init(logical_types=feature_types)
113 return convert_all_nan_unknown_to_double(ww_data)
114
~/Desktop/evalml_venv/lib/python3.7/site-packages/woodwork/table_accessor.py in init(self, **kwargs)
96 Any errors resulting from skipping validation with invalid inputs may not be easily understood.
97 """
---> 98 self.init_with_partial_schema(**kwargs)
99
100 def init_with_full_schema(self, schema: TableSchema, validate: bool = True, **kwargs) -> None:
~/Desktop/evalml_venv/lib/python3.7/site-packages/woodwork/table_accessor.py in init_with_partial_schema(self, schema, index, time_index, logical_types, already_sorted, name, semantic_tags, table_metadata, column_metadata, use_standard_tags, column_descriptions, column_origins, validate, **kwargs)
203
204 # overwrite schema parameters with specified kwargs
--> 205 logical_types = _infer_missing_logical_types(self._dataframe, logical_types, existing_logical_types)
206 column_descriptions = {**existing_col_descriptions, **(column_descriptions or {})}
207 column_metadata = {**existing_col_metadata, **(column_metadata or {})}
~/Desktop/evalml_venv/lib/python3.7/site-packages/woodwork/table_accessor.py in _infer_missing_logical_types(dataframe, force_logical_types, existing_logical_types)
1060 logical_type = force_logical_types.get(name) if name in force_logical_types else existing_logical_types.get(name)
1061 parsed_logical_types[name] = _get_column_logical_type(series, logical_type, name)
-> 1062 updated_series = parsed_logical_types[name].transform(series)
1063 if updated_series is not series:
1064 dataframe[name] = updated_series
~/Desktop/evalml_venv/lib/python3.7/site-packages/woodwork/logical_types.py in transform(self, series)
213 series = pd.to_datetime(series, format=self.datetime_format)
214 except (TypeError, ValueError):
--> 215 raise TypeConversionError(series, new_dtype, type(self))
216 return super().transform(series)
217
TypeConversionError: Error converting datatype for Date from type object to type datetime64[ns]. Please confirm the underlying data is consistent with logical type Datetime.
Issue Analytics
- State:
- Created 2 years ago
- Comments:9 (9 by maintainers)
Top Results From Across the Web
Seoul Bike Usage prediction - Kaggle
Use the Regression to predict the count of bike count required at each hour for the stable supply of rental bikes.
Read more >
Seoul Bike Sharing Demand Data Set
Abstract: The dataset contains count of public bikes rented at each hour in Seoul Bike Sharing System with the corresponding Weather data and ...
Read more >
syedsharin/Seoul-Bike-Sharing-Demand-Prediction - GitHub
This dataset contains information about the total count of rented bikes at each hour, as well as the date of observation and meteorological...
Read more >
A rule-based model for Seoul Bike sharing demand prediction ...
This research paper presents a rule-based regression predictive model for bike sharing demand prediction. In recent days, Public rental bike ...
Read more >
Predicting Bikes Rental Demand Using Weather and Holiday ...
This Dataset is taken from kaggle open datasets: Seoul Bike Sharing Demand ... Then statistical tests such as the non-constant error variance test...
Read more >
Top Related Medium Post
No results found
Top Related StackOverflow Question
No results found
Troubleshoot Live Code
Lightrun enables developers to add logs, metrics and snapshots to live code - no restarts or redeploys required.
Start Free
Top Related Reddit Thread
No results found
Top Related Hackernoon Post
No results found
Top Related Tweet
No results found
Top Related Dev.to Post
No results found
Top Related Hashnode Post
No results found
Top GitHub Comments
Created Woodwork Issue 1152 to resolve this problem.
@freddyaboulton @angela97lin I’m pretty sure I know what is happening here. Let me dig a little deeper to confirm, and then I will see if there is a workaround or if we need to update Woodwork to resolve this.