向CountVectorizer（sklearn）添加词干支持

from nltk.stem.snowball import FrenchStemmer

stop = stopwords.words('french')
stemmer = FrenchStemmer()


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer subclass that stems each token with a given stemmer."""

    def __init__(self, stemmer):
        # The parent is initialised with its defaults; only the stemmer is
        # stored on the instance.
        super(StemmedCountVectorizer, self).__init__()
        self.stemmer = stemmer

    def build_analyzer(self):
        """Return a callable mapping a document to a generator of stems."""
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (
            self.stemmer.stem(token) for token in base_analyzer(doc)
        )


stem_vectorizer = StemmedCountVectorizer(stemmer)

# Stemmed counts -> tf-idf weighting -> linear SVM.
text_clf = Pipeline([
    ('vect', stem_vectorizer),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC(kernel='linear', C=1)),
])

Process PoolWorker-12:
Traceback (most recent call last):
  File "C:\Anaconda2\lib\multiprocessing\process.py", line 258, in _bootstrap
    self.run()
  File "C:\Anaconda2\lib\multiprocessing\process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Anaconda2\lib\multiprocessing\pool.py", line 102, in worker
    task = get()
  File "C:\Anaconda2\lib\site-packages\sklearn\externals\joblib\pool.py", line 360, in get
    return recv()
AttributeError: 'module' object has no attribute 'StemmedCountVectorizer'

from sklearn.pipeline import Pipeline
from sklearn import grid_search
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem.snowball import FrenchStemmer

stemmer = FrenchStemmer()
analyzer = CountVectorizer().build_analyzer()


def stemming(doc):
    """Return a generator over the stemmed tokens of ``doc``."""
    tokens = analyzer(doc)
    return (stemmer.stem(token) for token in tokens)


# Six short French sentences with binary labels.
X = [
    'le chat est beau',
    'le ciel est nuageux',
    'les gens sont gentils',
    'Paris est magique',
    'Marseille est tragique',
    'JCVD est fou',
]
Y = [1, 0, 1, 1, 0, 0]

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC()),
])

# The grid compares the built-in 'word' analyzer with the stemming callable.
parameters = {'vect__analyzer': ['word', stemming]}

gs_clf = grid_search.GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf.fit(X, Y)

3条回答

网友

1楼 · 编辑于 2024-05-14 08:55:08

可以将一个可调用对象作为analyzer参数传递给CountVectorizer的构造函数，从而提供自定义分析器。这种做法对我有效。

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import FrenchStemmer

# Snowball stemmer for French, shared by every call to stemmed_words().
stemmer = FrenchStemmer()
# Analyzer callable taken from a default-configured CountVectorizer; it is
# what turns a document into the tokens that get stemmed.
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    """Return a generator over the stemmed tokens of ``doc``.

    The document is tokenised by the module-level ``analyzer`` and each
    token is passed through ``stemmer.stem``.
    """
    tokens = analyzer(doc)
    return (stemmer.stem(token) for token in tokens)

# Passing a callable as `analyzer` makes the vectorizer build its vocabulary
# from the stemmed tokens returned by stemmed_words().
stem_vectorizer = CountVectorizer(analyzer=stemmed_words)
# Fit on a one-sentence corpus and print the resulting count matrix.
print(stem_vectorizer.fit_transform(['Tu marches dans la rue']))
# NOTE: get_feature_names() was removed in scikit-learn 1.2; on newer
# releases call get_feature_names_out() instead.
print(stem_vectorizer.get_feature_names())

打印出来：

  (0, 4)    1
  (0, 2)    1
  (0, 0)    1
  (0, 1)    1
  (0, 3)    1
[u'dan', u'la', u'march', u'ru', u'tu']

网友

2楼 · 编辑于 2024-05-14 08:55:08

我知道我发布答案有点晚了。但在这里，以防有人还需要帮助。

下面是通过重写build_analyzer()方法，为CountVectorizer添加特定语言词干提取器的最简洁做法：

from sklearn.feature_extraction.text import CountVectorizer
import nltk.stem

# Module-level Snowball stemmer for French; StemmedCountVectorizer reads it
# as a global.
french_stemmer = nltk.stem.SnowballStemmer('french')
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose tokens are stemmed with ``french_stemmer``."""

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens."""
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            # The inherited analyzer runs first; stemming is applied to the
            # tokens it returns.
            return [french_stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

# scikit-learn only accepts the string 'english' as a built-in stop list; any
# other language must be supplied as an explicit list, otherwise fitting
# raises ValueError. The French list comes from NLTK's stopwords corpus,
# which has to be fetched once with nltk.download('stopwords').
from nltk.corpus import stopwords

vectorizer_s = StemmedCountVectorizer(
    min_df=3, analyzer="word", stop_words=stopwords.words('french'))

您可以在vectorizer_s对象上照常调用CountVectorizer类的fit和transform方法。

网友

3楼 · 编辑于 2024-05-14 08:55:08

您可以尝试：

def build_analyzer(self):
    """Return an analyzer that stems every token from the inherited analyzer.

    Meant to be defined inside the ``StemmedCountVectorizer`` subclass. It
    reads the module-level ``stemmer`` rather than an instance attribute, so
    no custom ``__init__`` is needed.
    """
    # super() must name the subclass itself: super(CountVectorizer, self)
    # starts the lookup after CountVectorizer in the MRO and bypasses it.
    analyzer = super(StemmedCountVectorizer, self).build_analyzer()
    return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

并删除__init__方法。

相关问题更多 >

编程相关推荐

热门问题

热门文章