无法 fit() Scikit-Learn 管道：总是抛出 ValueError

# Custom transformer that computes the difference between the survey 1 and
# survey 2 timestamps.
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    """Compute the time difference between two date columns.

    Parameters
    ----------
    t1_col : column label of the first survey date.
    t2_col : column label of the second survey date.

    NOTE(review): this is the version from the question and is what raises
    the ValueError shown in the traceback. ``transform`` returns a 1-D array
    of timedeltas, which ``FeatureUnion`` cannot stack next to the 2-D
    outputs of the other transformers (the traceback reports the third block
    as having 1 row instead of 13892). It also ignores the ``X`` passed to
    ``transform`` and reuses the columns captured during ``fit``.
    """

    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        # Parse both date columns and keep them on the instance.
        self.col_1 = X[self.t1_col].apply(pd.to_datetime)
        self.col_2 = X[self.t2_col].apply(pd.to_datetime)
        return self

    def transform(self, X):
        # Difference of the columns stored by fit(); X itself is not read.
        difference = self.col_1 - self.col_2
        return difference.values


# Creating the TimedeltaTransformer object
cycle_1_date = 'CYCLE_1_SURVEY_DATE'
cycle_2_date = 'CYCLE_2_SURVEY_DATE'
time_feature = TimedeltaTransformer(cycle_1_date, cycle_2_date)

# Using a custom column selector transformer to extract the cycle_1 features
cycle_1_cols = [
    'CYCLE_1_DEFS',
    'CYCLE_1_NFROMDEFS',
    'CYCLE_1_NFROMCOMP',
    'CYCLE_1_DEFS_SCORE',
    'CYCLE_1_NUMREVIS',
    'CYCLE_1_REVISIT_SCORE',
    'CYCLE_1_TOTAL_SCORE',
]
cycle_1_features = Pipeline([
    ('cst2', ColumnSelectTransformer(cycle_1_cols)),
])

# Creating the survey_model Pipeline object.
# The pipeline is a 2 step process: first a feature union transforming and
# combining the business features, the cycle_1 features and the time feature;
# followed by fitting the transformed features into a RandomForestRegressor.
survey_model = Pipeline([
    ('features', FeatureUnion([
        ('business', business_features),
        ('survey', cycle_1_features),
        ('time', time_feature),
    ])),
    ('forest', RandomForestRegressor()),
])

# Trying to fit the Pipeline throws the ValueError described above
survey_model.fit(data, cycle_2_score.astype(int))

# Custom transformer that selects columns from a dataframe and returns the
# selection as an array.
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    """Select a fixed list of columns and return them as a NumPy array.

    Parameters
    ----------
    columns : list of column labels to keep.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        # Non-DataFrame input is wrapped so label-based selection works.
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        return X[self.columns].values


# Numeric columns: select, then fill missing values with the column mean.
# `simple_cols` is not defined in this snippet; it must exist beforehand.
simple_features = Pipeline([
    ('cst', ColumnSelectTransformer(simple_cols)),
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical columns: select, fill missing values with the most frequent
# value, then one-hot encode.
owner_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['OWNERSHIP'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

cert_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['CERTIFICATION'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

categorical_features = FeatureUnion([
    ('owner_onehot', owner_onehot),
    ('cert_onehot', cert_onehot),
])

# All business features: numeric and one-hot encoded columns side by side.
business_features = FeatureUnion([
    ('simple', simple_features),
    ('categorical', categorical_features)
])

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-218-046724d81b69> in <module>()
----> 1 survey_model.fit(data, cycle_2_score.astype(int))

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    350             This estimator
    351         """
--> 352         Xt, fit_params = self._fit(X, y, **fit_params)
    353         with _print_elapsed_time('Pipeline',
    354                                  self._log_message(len(self.steps) - 1)):

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    315                 message_clsname='Pipeline',
    316                 message=self._log_message(step_idx),
--> 317                 **fit_params_steps[name])
    318             # Replace the transformer of the step with the fitted
    319             # transformer. This is necessary when loading the transformer

/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    353
    354     def __call__(self, *args, **kwargs):
--> 355         return self.func(*args, **kwargs)
    356
    357     def call_and_shelve(self, *args, **kwargs):

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    714     with _print_elapsed_time(message_clsname, message):
    715         if hasattr(transformer, 'fit_transform'):
--> 716             res = transformer.fit_transform(X, y, **fit_params)
    717         else:
    718             res = transformer.fit(X, y, **fit_params).transform(X)

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    919
    920         if any(sparse.issparse(f) for f in Xs):
--> 921             Xs = sparse.hstack(Xs).tocsr()
    922         else:
    923             Xs = np.hstack(Xs)

/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
    463
    464     """
--> 465     return bmat([blocks], format=format, dtype=dtype)
    466
    467

/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
    584                                                     exp=brow_lengths[i],
    585                                                     got=A.shape[0]))
--> 586                     raise ValueError(msg)
    587
    588                 if bcol_lengths[j] == 0:

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 1, expected 13892.

%%bash
# Download the training data and its metadata.
# -nc skips a file that has already been downloaded; -P sets the directory
# the file is saved into.
# NOTE(review): `mkdir data` creates ./data, but both downloads are saved to
# ./ml-data -- confirm which directory the notebook actually reads from.
mkdir data
wget http://dataincubator-wqu.s3.amazonaws.com/mldata/providers-train.csv -nc -P ./ml-data
wget http://dataincubator-wqu.s3.amazonaws.com/mldata/providers-metadata.csv -nc -P ./ml-data

1条回答

网友

1楼 · 发布于 2024-04-24 10:04:41

修改我的时间转换器似乎有帮助：先把时间差转换成一列数值（秒数），再将其 reshape 成形状为 (-1, 1) 的二维数组。

class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    """Turn two date columns into one numeric feature: their gap in seconds.

    The result is a 2-D array with a single column, so it can be stacked
    next to other transformers' outputs inside a ``FeatureUnion``.

    Parameters
    ----------
    t1_col : column label of the first date column.
    t2_col : column label of the second date column.
    """

    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn.

        The difference is computed in ``transform`` from the data passed to
        it. Caching the training columns here would make ``transform``
        return the training rows for every input, including test data.
        """
        return self

    def transform(self, X):
        """Return ``t1_col - t2_col`` in seconds, shaped ``(n_rows, 1)``.

        Non-DataFrame input is wrapped in a DataFrame first. Raises
        ``KeyError`` if either column is missing from ``X``.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        first = pd.to_datetime(X[self.t1_col])
        second = pd.to_datetime(X[self.t2_col])
        # Vectorized conversion; missing dates become NaN.
        seconds = (first - second).dt.total_seconds()
        # One column per feature: FeatureUnion needs a 2-D block.
        return seconds.values.reshape(-1, 1)

相关问题更多 >

编程相关推荐

热门问题

热门文章