
2024-05-13 22:18:05 发布

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import LinearRegression

# Toy frame: two string (categorical) columns and two numeric columns.
# Note that 'NaN' in `brand` is the literal string, not a missing value.
df = pd.DataFrame({'brand'      : ['aaaa', 'asdfasdf', 'sadfds', 'NaN'],
                   'category'   : ['asdf','asfa','asdfas','as'], 
                   'num1'       : [1, 1, 0, 0] ,
                   'target'     : [0.2,0.11,1.34,1.123]})

# Split the column names by dtype so each group gets its own transformer.
train_continuous_cols = df.select_dtypes(include=["int64","float64"]).columns.tolist()
train_categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()

# Scale the numeric columns and one-hot encode the categorical ones.
# The scaler entry and the closing parenthesis were missing, which made the
# call a syntax error; the error message quoted below this snippet refers to
# the "standardscaler" step, so the scaler is part of the intended transformer.
preprocess = make_column_transformer(
    (StandardScaler(), train_continuous_cols),
    (OneHotEncoder(), train_categorical_cols),
)
# `df` is rebound to the transformed matrix here (column labels are lost).
df = preprocess.fit_transform(df)




Transformer standardscaler (type StandardScaler) does not provide get_feature_names


所有的功劳都归于 Johannes Haupt,他提供了 get_feature_names() 函数,该函数能够兼容那些没有提供此方法的转换器(transformer)(参见博客文章《Extracting Column Names from the ColumnTransformer》)。我注释掉了其中的警告(因为我不需要它们),也注释掉了在列名前添加转换步骤名称前缀的代码;如果需要,可以很容易地取消这些注释。

#import warnings
import sklearn
import pandas as pd

class ColumnTransformerWithNames(ColumnTransformer):
    """ColumnTransformer whose ``transform`` returns a labelled DataFrame.

    ``get_feature_names`` is a variant of the ColumnTransformer method that
    falls back to the input column names for transformers that do not
    provide ``get_feature_names`` themselves, instead of raising.

    NOTE(review): this relies on private scikit-learn members (``_iter``,
    ``_df_columns``, ``_n_features``) and on the ``get_feature_names`` API of
    the individual transformers; confirm they exist in the installed
    scikit-learn version.
    """

    def get_feature_names(column_transformer):
        """Get feature names from all transformers.

        Parameters
        ----------
        column_transformer : ColumnTransformer or sklearn.pipeline.Pipeline
            The fitted object to inspect. When called as a bound method this
            is the instance itself; the recursive call passes a Pipeline.

        Returns
        -------
        feature_names : list of strings
            Names of the features produced by transform.
        """
        # Lookup for a single transformer. It reads `column` from the
        # enclosing loop below, so it must only be called from inside it.
        def get_names(trans):
            # >> Original get_feature_names() method
            if trans == 'drop' or (
                    hasattr(column, '__len__') and not len(column)):
                return []
            if trans == 'passthrough':
                if hasattr(column_transformer, '_df_columns'):
                    if ((not isinstance(column, slice))
                            and all(isinstance(col, str) for col in column)):
                        return column
                    else:
                        return column_transformer._df_columns[column]
                else:
                    indices = np.arange(column_transformer._n_features)
                    return ['x%d' % i for i in indices[column]]
            if not hasattr(trans, 'get_feature_names'):
                # >>> Change: return the input column names if the transformer
                # has no get_feature_names method (the upstream version also
                # emits a warning here; it is intentionally silenced).
                if column is None:
                    return []
                else:
                    return list(column)

            # The "<step name>__" prefix used upstream is intentionally left out.
            return list(trans.get_feature_names())

        ### Start of processing
        feature_names = []

        # Allow transformers to be pipelines. Pipeline steps are named
        # differently, so preprocessing is needed.
        if isinstance(column_transformer, sklearn.pipeline.Pipeline):
            l_transformers = [(name, trans, None, None)
                              for _step, name, trans in column_transformer._iter()]
        else:
            # For column transformers, follow the original method
            l_transformers = list(column_transformer._iter(fitted=True))

        for name, trans, column, _ in l_transformers:
            if isinstance(trans, sklearn.pipeline.Pipeline):
                # Recursive call on pipeline. Go through the class so the
                # pipeline is passed as the single argument; calling the bound
                # method would pass two arguments, and a Pipeline has no such
                # method of its own.
                _names = ColumnTransformerWithNames.get_feature_names(trans)
                # if pipeline has no transformer that returns names
                if len(_names) == 0:
                    _names = list(column)
                feature_names.extend(_names)
            else:
                feature_names.extend(get_names(trans))

        return feature_names

    def transform(self, X):
        """Transform ``X`` and return the result as a DataFrame.

        The frame keeps the index of ``X`` and uses ``get_feature_names()``
        for its column labels. ``X`` must expose ``.index`` (e.g. a pandas
        DataFrame).
        """
        indices = X.index.values.tolist()
        X_mat = super().transform(X)
        new_cols = self.get_feature_names()
        # The parent returns either a sparse matrix or a dense array; only
        # the sparse result has (and needs) ``toarray``.
        if hasattr(X_mat, "toarray"):
            X_mat = X_mat.toarray()
        return pd.DataFrame(X_mat, index=indices, columns=new_cols)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` (and ``y``), then return ``self.transform(X)``."""
        # The parent's result is discarded; transform() is called afterwards
        # to build the labelled DataFrame from the fitted state.
        super().fit_transform(X, y)
        return self.transform(X)





transformers_ : list
   The collection of fitted transformers as tuples of
   (name, fitted_transformer, column). `fitted_transformer` can be an
   estimator, 'drop', or 'passthrough'. In case there were no columns
   selected, this will be the unfitted transformer.
   If there are remaining columns, the final element is a tuple of the
   ('remainder', transformer, remaining_columns) corresponding to the
   ``remainder`` parameter. If there are remaining columns, then
   ``len(transformers_)==len(transformers)+1``, otherwise
   ``len(transformers_)==len(transformers)``.


notes: The order of the columns in the transformed feature matrix follows the order of how the columns are specified in the transformers list.


import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer

# Toy frame: two string (categorical) columns and two numeric columns.
# Note that 'NaN' in `brand` is the literal string, not a missing value.
df = pd.DataFrame({'brand'      : ['aaaa', 'asdfasdf', 'sadfds', 'NaN'],
                   'category'   : ['asdf','asfa','asdfas','asd'],
                   'num1'       : [1, 1, 0, 0] ,
                   'target'     : [0.2,0.11,1.34,1.123]})

train_continuous_cols = df.select_dtypes(include=["int64","float64"]).columns.tolist()
train_categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
# get n_categories for categorical features
# (nunique() skips real missing values; this data has none)
n_categories = [df[x].nunique() for x in train_categorical_cols]

# Scaler first, encoder second: the output columns follow this order.
# The scaler entry and the closing parenthesis were missing, which made the
# call a syntax error; the index arithmetic below assumes the scaler columns
# come first.
preprocess = make_column_transformer(
    (StandardScaler(), train_continuous_cols),
    (OneHotEncoder(), train_categorical_cols),
)
preprocessed_df = preprocess.fit_transform(df)
# the scaler yields 1 column each
indexes_scaler = list(range(0,len(train_continuous_cols)))
# the encoder yields a number of columns equal to the number of categories in the data
cum_index_encoder = [0] + list(np.cumsum(n_categories))

# the encoder indexes come after the scaler indexes
# (len() rather than indexes_scaler[-1]+1, so an empty numeric group gives 0
# instead of raising IndexError)
start_index_encoder = len(indexes_scaler)
indexes_encoder = [x + start_index_encoder for x in cum_index_encoder]
# get both lower and upper bound of index; materialised as a list so the
# pairs can be iterated more than once
index_pairs = list(zip(indexes_encoder[:-1], indexes_encoder[1:]))


print ('Transformed {} continious cols resulting in a df with shape:'.format(len(train_continuous_cols)))
print (preprocessed_df[: , indexes_scaler].shape)

Transformed 2 continious cols resulting in a df with shape: (4, 2)

# For every categorical column, print the shape of the slice of the
# transformed matrix bounded by that column's (start, end) index pair.
for column, (start_id, end_id) in zip(train_categorical_cols, index_pairs):
    header = 'Transformed column {} resulted in a df with shape:'.format(column)
    print(header)
    column_slice = slice(start_id, end_id)
    print(preprocessed_df[:, column_slice].shape)

Transformed column brand resulted in a df with shape: (4, 4)

Transformed column category resulted in a df with shape: (4, 4)

