Pandas Sklearn Pipeline CV on DataFrameMapper transformations?

Posted 2024-04-25 12:40:34


I'm wondering how to define parameters for the DataFrameMapper transformations in a pipeline when using sklearn-pandas.

Here is a reproducible example notebook using the Titanic data.

I set it up as:

# use pandas sklearn to do some preprocessing
full_mapper = DataFrameMapper([
    ('Name', Pipeline([ ('name_vect', CountVectorizer()) , ('name_tfidf', TfidfTransformer()) ]) ),
    ('Ticket', Pipeline([ ('ticket_vect', CountVectorizer()) , ('ticket_tfidf', TfidfTransformer()) ]) ),
    ('Sex', LabelBinarizer()),
    (['Age', 'Fare'], None), # I tried to use Imputer() but got an error
    ])

I would also like to cross-validate over the parameters of the CountVectorizer() and TfidfTransformer() that I use on the 'Name' and 'Ticket' fields.

However, when I define my pipeline as:

# build full pipeline
full_pipeline  = Pipeline([
    ('mapper',full_mapper),
    ('clf', SGDClassifier(n_iter=15, warm_start=True))
])

my grid is then:

# determine full param search space (need to get the params for the mapper parts in here somehow)
full_params = {'clf__alpha': [1e-2,1e-3,1e-4],
                   'clf__loss':['modified_huber','hinge'],
                   'clf__penalty':['l2','l1']}

I don't know how to include options for the 'name_vect', 'name_tfidf', etc. steps in the grid above.

I can't really find an example of doing something like this in the sklearn-pandas docs.
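For context, with a plain sklearn Pipeline (no mapper involved) every tunable setting gets a step-prefixed double-underscore key, and those keys are what a grid dict must use. A minimal sketch, using only plain sklearn, of how to discover them:

```python
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

name_to_tfidf = Pipeline([('name_vect', CountVectorizer()),
                          ('name_tfidf', TfidfTransformer())])

# get_params() (deep by default) lists every grid-searchable key,
# e.g. 'name_vect__analyzer' or 'name_tfidf__use_idf'
keys = sorted(name_to_tfidf.get_params().keys())
print('name_vect__analyzer' in keys)  # True
```

The catch, as far as I can tell, is that DataFrameMapper holds the per-column pipelines inside its `features` list rather than as named steps, so those nested keys are not reachable under `mapper__...`; only `mapper__features` as a whole is exposed to the grid.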

Note: just using the Titanic data here for reproducibility. Really just trying to get the plumbing working.

Update (trying to adapt the approach from here.)

If I do:

# make pipeline for individual variables
name_to_tfidf = Pipeline([ ('name_vect', CountVectorizer()) , ('name_tfidf', TfidfTransformer()) ])
ticket_to_tfidf = Pipeline([ ('ticket_vect', CountVectorizer()) , ('ticket_tfidf', TfidfTransformer()) ])

# data frame mapper
full_mapper = DataFrameMapper([
    ('Name', name_to_tfidf ),
    ('Ticket', ticket_to_tfidf ),
    ('Sex', LabelBinarizer()),
    (['Age', 'Fare'], None), # I tried to use Imputer() but got an error
    ])

# build full pipeline
full_pipeline  = Pipeline([
    ('mapper',full_mapper),
    ('clf', SGDClassifier(n_iter=15, warm_start=True))
])

# determine full param search space
full_params = {'clf__alpha': [1e-2,1e-3,1e-4],
               'clf__loss':['modified_huber','hinge'],
               'clf__penalty':['l2','l1'],
               # now set the params for the datamapper part of the pipeline
               'mapper__features':[[
                   ('Name',deepcopy(name_to_tfidf).set_params(name_vect__analyzer = 'char_wb')), # how can I set up a list to search over here?
                   ('Ticket',deepcopy(ticket_to_tfidf).set_params(ticket_vect__analyzer = 'char')) # how can I set up a list to search over here?
               ]]
              }

# set up grid search
gs_clf = GridSearchCV(full_pipeline, full_params, n_jobs=-1)

# do the fit
gs_clf.fit(df,df['Survived'])

print("Best score: %0.3f" % gs_clf.best_score_)
print("Best parameters set:")
best_parameters = gs_clf.best_estimator_.get_params()
for param_name in sorted(full_params.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

then I get:

> Best score: 0.746
> Best parameters set:
>     clf__alpha: 0.01
>     clf__loss: 'modified_huber'
>     clf__penalty: 'l1'
>     mapper__features: [('Name', Pipeline(memory=None,
>      steps=[('name_vect', CountVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
>         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
>         lowercase=True, max_df=1.0, max_features=None, min_df=1,
>         ngram_range=(1, 1), preprocessor=None, stop_words=None,
>         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
>         tokenizer=None, vocabulary=None)), ('name_tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True))])),
>     ('Ticket', Pipeline(memory=None,
>      steps=[('ticket_vect', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
>         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
>         lowercase=True, max_df=1.0, max_features=None, min_df=1,
>         ngram_range=(1, 1), preprocessor=None, stop_words=None,
>         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
>         tokenizer=None, vocabulary=None)), ('ticket_tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True))]))]

So it seems I can set the parameters this way. But if I pass a list of options, e.g.:

# determine full param search space (need to get the params for the mapper parts in here somehow)
full_params = {'clf__alpha': [1e-2,1e-3,1e-4],
               'clf__loss':['modified_huber','hinge'],
               'clf__penalty':['l2','l1'],
               # now set the params for the datamapper part of the pipeline
               'mapper__features':[[
                   ('Name',deepcopy(name_to_tfidf).set_params(name_vect__analyzer = ['char', 'char_wb'])),
                   ('Ticket',deepcopy(ticket_to_tfidf).set_params(ticket_vect__analyzer = ['char', 'char_wb']))
               ]]
              }

then I get an error like:

C:\Users\Andrew\Miniconda3\lib\site-packages\sklearn\feature_extraction\text.py in build_analyzer(self=CountVectorizer(analyzer=['char', 'char_wb'], bi...)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None))
    265             return lambda doc: self._word_ngrams(
    266                 tokenize(preprocess(self.decode(doc))), stop_words)
    267 
    268         else:
    269             raise ValueError('%s is not a valid tokenization scheme/analyzer' %
--> 270                              self.analyzer)
        self.analyzer = ['char', 'char_wb']
    271 
    272     def _validate_vocabulary(self):
    273         vocabulary = self.vocabulary
    274         if vocabulary is not None:

ValueError: ['char', 'char_wb'] is not a valid tokenization scheme/analyzer

So I'm not sure how to set the parameters of the DataFrameMapper transformations as options for the CV to search over.
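The error above is consistent with how set_params works: it assigns a single value immediately, so the whole list ['char', 'char_wb'] lands in CountVectorizer.analyzer. Lists of candidates belong in the grid dict itself. One workaround, sketched under the assumption that each `mapper__features` candidate has to be a complete feature list, is to enumerate one deep-copied feature list per analyzer combination and let GridSearchCV pick among them:

```python
from copy import deepcopy
from itertools import product

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

name_to_tfidf = Pipeline([('name_vect', CountVectorizer()),
                          ('name_tfidf', TfidfTransformer())])
ticket_to_tfidf = Pipeline([('ticket_vect', CountVectorizer()),
                            ('ticket_tfidf', TfidfTransformer())])

# set_params takes exactly one value per parameter; the grid dict is where
# candidate lists belong.  So build one complete feature list per analyzer
# combination:
analyzers = ['char', 'char_wb']
feature_candidates = [
    [('Name', deepcopy(name_to_tfidf).set_params(name_vect__analyzer=na)),
     ('Ticket', deepcopy(ticket_to_tfidf).set_params(ticket_vect__analyzer=ta)),
     ('Sex', LabelBinarizer()),
     (['Age', 'Fare'], None)]
    for na, ta in product(analyzers, analyzers)
]

full_params = {'clf__alpha': [1e-2, 1e-3, 1e-4],
               'mapper__features': feature_candidates}  # 4 mapper variants
```

This multiplies the grid size quickly (here 4 mapper variants per classifier setting), but it stays within what GridSearchCV supports: each candidate value of `mapper__features` is itself a valid, fully specified feature list.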

Surely there must be a way. Agreed, but at this stage it might be easiest to just do the preprocessing in plain pandas...


1 Answer

Posted by a user on 2024-04-25 12:40:34

This is just a shortcoming I've run into with the sklearn-pandas package. However, I've found that writing your own transformer classes gives you full control over what happens in the pipeline, and even in feature unions.

With a bit of tweaking, each sklearn transformer can be customized to select only certain pandas columns, and even to output pandas DataFrames.
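As a sketch of that idea (the class name here is my own, not necessarily what the blog post uses): a minimal column-selecting transformer lets each branch of a FeatureUnion pull its own columns from the DataFrame, and because it subclasses BaseEstimator, its `columns` parameter is reachable by GridSearchCV through the usual double-underscore names.

```python
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pick a subset of DataFrame columns so downstream sklearn
    steps (and FeatureUnion branches) see only what they need."""
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn

    def transform(self, X):
        return X[self.columns]

df = pd.DataFrame({'Name': ['a', 'b'], 'Age': [1, 2], 'Fare': [3.0, 4.0]})
numeric = ColumnSelector(['Age', 'Fare']).fit_transform(df)
print(list(numeric.columns))  # ['Age', 'Fare']
```

A FeatureUnion of such branches (selector followed by vectorizer, tf-idf, etc.) can then stand in for the DataFrameMapper, with every step's parameters exposed to the grid search.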

Check out my blog for a comprehensive tour: https://wkirgsn.github.io/2018/02/15/pandas-pipelines/
