无法对Scikit-Learn管道调用fit()而不引发ValueError

2024-04-24 10:04:41 发布

您现在位置:Python中文网/ 问答频道 /正文

我需要你的帮助!

我在尝试拟合(fit)我的管道时遇到了一个ValueError。

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 1, expected 13892.

我的任务是建立一个模型,将养老院的业务特征、第一周期的调查结果,以及第一周期和第二周期调查之间的时间间隔结合起来,预测第二周期的总分。

这是我用来完成上述任务的代码。

# Creating a custom transformer to calculate the difference between survey
# 1 & survey 2 times
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    """Transformer that subtracts one date column from another.

    ``t1_col`` and ``t2_col`` are the labels of the two date columns. This is
    the version that triggers the ValueError reported in the question.
    """
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        # Both columns are parsed element by element with pd.to_datetime and
        # cached on the instance; transform() works from these cached copies.
        self.col_1 = X[self.t1_col].apply(pd.to_datetime)
        self.col_2 = X[self.t2_col].apply(pd.to_datetime)
        return self

    def transform(self, X):
        # NOTE(review): X is never read here, so the result always reflects
        # the data that was passed to fit().
        difference = self.col_1 - self.col_2
        # NOTE(review): assuming the cached columns are Series, `.values` is a
        # 1-D array; that matches the reported "blocks[0,2].shape[0] == 1"
        # error, where the feature union sees this output as a single row.
        return difference.values

# Names of the two survey-date columns, and the transformer that derives the
# time feature from them
cycle_1_date = 'CYCLE_1_SURVEY_DATE'
cycle_2_date = 'CYCLE_2_SURVEY_DATE'
time_feature = TimedeltaTransformer(t1_col=cycle_1_date, t2_col=cycle_2_date)

# Cycle 1 survey columns, pulled out of the input by the custom column
# selector
cycle_1_cols = [
    'CYCLE_1_DEFS',
    'CYCLE_1_NFROMDEFS',
    'CYCLE_1_NFROMCOMP',
    'CYCLE_1_DEFS_SCORE',
    'CYCLE_1_NUMREVIS',
    'CYCLE_1_REVISIT_SCORE',
    'CYCLE_1_TOTAL_SCORE',
]
# Single-step pipeline: select the cycle 1 columns and nothing else
cycle_1_features = Pipeline(steps=[
    ('cst2', ColumnSelectTransformer(cycle_1_cols)),
])

# survey_model is a two-step Pipeline:
#   1. a FeatureUnion that transforms and concatenates the business features,
#      the cycle 1 features and the time feature;
#   2. a RandomForestRegressor fitted on the combined features.
combined_features = FeatureUnion([
    ('business', business_features),
    ('survey', cycle_1_features),
    ('time', time_feature),
])
survey_model = Pipeline([
    ('features', combined_features),
    ('forest', RandomForestRegressor()),
])

# Trying to fit my Pipeline throws the ValueError described above
survey_model.fit(data, cycle_2_score.astype(int))

一些额外的上下文:我构建这个模型,是为了把它的predict_proba方法传给项目的自定义评分器(grader)。评分器传给我的估计器的predict或predict_proba方法的是字典列表,而不是DataFrame,这意味着模型必须能够同时处理这两种数据类型。因此,我需要提供一个自定义的ColumnSelectTransformer来代替scikit-learn自带的ColumnTransformer。

下面是与业务特征和ColumnSelectTransformer相关的附加代码

# Custom transformer that picks a fixed set of columns out of the input and
# returns them as an array
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    """Select ``columns`` from tabular input and return their values.

    Input that is not already a DataFrame is passed to ``pd.DataFrame``
    before the columns are selected.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``.values`` of the selected columns of ``X``."""
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        return frame[self.columns].values

def _most_frequent_onehot(column):
    """Build a select -> most-frequent impute -> one-hot pipeline for one column."""
    return Pipeline([
        ('cst', ColumnSelectTransformer([column])),
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder()),
    ])


# Simple columns: select them and fill gaps with the column mean
simple_features = Pipeline([
    ('cst', ColumnSelectTransformer(simple_cols)),
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical columns: one one-hot pipeline per column
owner_onehot = _most_frequent_onehot('OWNERSHIP')
cert_onehot = _most_frequent_onehot('CERTIFICATION')

categorical_features = FeatureUnion([
    ('owner_onehot', owner_onehot),
    ('cert_onehot', cert_onehot),
])

# All business features: simple and categorical side by side
business_features = FeatureUnion([
    ('simple', simple_features),
    ('categorical', categorical_features),
])

最后,下面是抛出的完整错误信息

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-218-046724d81b69> in <module>()
----> 1 survey_model.fit(data, cycle_2_score.astype(int))

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    350             This estimator
    351         """
--> 352         Xt, fit_params = self._fit(X, y, **fit_params)
    353         with _print_elapsed_time('Pipeline',
    354                                  self._log_message(len(self.steps) - 1)):

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    315                 message_clsname='Pipeline',
    316                 message=self._log_message(step_idx),
--> 317                 **fit_params_steps[name])
    318             # Replace the transformer of the step with the fitted
    319             # transformer. This is necessary when loading the transformer

/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    353 
    354     def __call__(self, *args, **kwargs):
--> 355         return self.func(*args, **kwargs)
    356 
    357     def call_and_shelve(self, *args, **kwargs):

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    714     with _print_elapsed_time(message_clsname, message):
    715         if hasattr(transformer, 'fit_transform'):
--> 716             res = transformer.fit_transform(X, y, **fit_params)
    717         else:
    718             res = transformer.fit(X, y, **fit_params).transform(X)

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    919 
    920         if any(sparse.issparse(f) for f in Xs):
--> 921             Xs = sparse.hstack(Xs).tocsr()
    922         else:
    923             Xs = np.hstack(Xs)

/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
    463 
    464     """
--> 465     return bmat([blocks], format=format, dtype=dtype)
    466 
    467 

/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
    584                                                     exp=brow_lengths[i],
    585                                                     got=A.shape[0]))
--> 586                     raise ValueError(msg)
    587 
    588                 if bcol_lengths[j] == 0:

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 1, expected 13892.

此外,数据和元数据可以在这里获得

%%bash
# -p makes the directory step safe to re-run, matching the -nc (no-clobber)
# downloads below, which skip files that already exist.
# NOTE(review): the files are saved under ./ml-data, not ./data -- confirm
# which directory the rest of the notebook reads from.
mkdir -p data
wget http://dataincubator-wqu.s3.amazonaws.com/mldata/providers-train.csv -nc -P ./ml-data
wget http://dataincubator-wqu.s3.amazonaws.com/mldata/providers-metadata.csv -nc -P ./ml-data
Tags: theinselfmessagepipelinedefcolparams
1条回答
网友
1楼 · 发布于 2024-04-24 10:04:41

修改我的时间转换器似乎解决了问题:先把时间差转换为一组数值(总秒数),再用 reshape(-1, 1) 把它重塑为二维的列向量。

class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    """Turn two date columns into one elapsed-time feature.

    Parameters
    ----------
    t1_col : label of the column holding the first date.
    t2_col : label of the column holding the second date.

    The transformer is stateless: nothing is learned in ``fit`` and the
    difference is computed from whatever data is handed to ``transform``, so
    it works on new data (e.g. at predict time) as well as on the training
    set.
    """

    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Return ``t1_col - t2_col`` in seconds.

        Parameters
        ----------
        X : DataFrame, or anything ``pd.DataFrame`` accepts (for example a
            list of dicts), containing both date columns.

        Returns
        -------
        Float array of shape ``(n_rows, 1)``. The 2-D column shape is what
        lets the result be stacked next to other feature blocks.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        first = pd.to_datetime(X[self.t1_col])
        second = pd.to_datetime(X[self.t2_col])
        # Vectorised replacement for calling total_seconds() row by row.
        seconds = (first - second).dt.total_seconds()
        return seconds.values.reshape(-1, 1)

相关问题 更多 >