"Phrase" stop words ignored in sklearn

Posted 2024-05-16 14:24:25


I have a pipeline that goes roughly like this:

  • Build a word token pattern: naively, 2 or more alphanumeric characters surrounded by word boundaries.
  • Tokenize a document, then lemmatize those tokens with nltk.
  • Add a few "custom" stop words to sklearn's built-in English stop words. (Here, the reproducible example uses just a single company name.)
  • Get term frequencies using unigrams through 4-grams.

The problem (presumably because tokenization happens first?) is that multi-word stop words (phrases) are not removed.

Full example:

import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as ESW, CountVectorizer

# Make sure we have the corpora used by nltk's lemmatizer
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# "Naive" token similar to that used by sklearn
TOKEN = re.compile(r'\b\w{2,}\b')

# Tokenize, then lemmatize these tokens
# Modified from:
# http://scikit-learn.org/stable/modules/feature_extraction.html#customizing-the-vectorizer-classes
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return (self.wnl.lemmatize(t) for t in TOKEN.findall(doc))

# Add 1 more phrase to sklearn's stop word list
sw = ESW.union(frozenset(['sinclair broadcast group']))

vect = CountVectorizer(stop_words=sw, ngram_range=(1, 4),
                       tokenizer=LemmaTokenizer())

# Nonsense example documents
docs = ["""And you ask Why You Are Sinclair Broadcast Group is Asking It""",
        """Why are you asking what Sinclair Broadcast Group and you"""]

tf = vect.fit_transform(docs)

To reiterate: the single-word stop words are removed correctly, but the multi-word phrase still shows up among the extracted features.

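For example, inspecting the fitted vocabulary (assuming a reasonably recent sklearn; older versions call this get_feature_names() instead of get_feature_names_out()):

phrase_features = [f for f in vect.get_feature_names_out() if 'sinclair' in f]
# 'sinclair broadcast group' is still among the features, along with the
# unigrams 'sinclair', 'broadcast', 'group' and other n-grams built from them
print(phrase_features)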

How can I fix this?


2 Answers

From the documentation of CountVectorizer:

stop_words : string {‘english’}, list, or None (default)

If ‘english’, a built-in stop word list for English is used.

If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if analyzer == 'word'.

If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically detect and filter stop words based on intra corpus document frequency of terms.

And further down, for the token_pattern parameter:

token_pattern : string

Regular expression denoting what constitutes a “token”, only used if analyzer == 'word'. The default regexp select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).

So a stop word is only removed when the output of the analyzer (a token) is exactly equal to 'sinclair broadcast group'. But the default analyzer is 'word', which means stop-word detection applies only to individual words, because, as the docs say, tokens are defined by the default token_pattern.

Tokens are not n-grams (rather, n-grams are built from tokens, and stop-word removal appears to happen at the token level, before the n-grams are constructed).

As a quick check, you can change your custom stop word to 'sinclair': that word is then removed correctly, because it is treated as a standalone token.
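Here is a minimal sketch of that quick check (reusing ESW and CountVectorizer from your imports; build_analyzer() just exposes what the vectorizer will do to each document):

check = CountVectorizer(stop_words=ESW.union({'sinclair'}), ngram_range=(1, 4))
# The default 'word' analyzer drops the stop word at the token level,
# before any n-grams are built, so 'sinclair' never reaches an n-gram.
print(check.build_analyzer()("Why You Are Sinclair Broadcast Group"))
# -> ['broadcast', 'group', 'broadcast group']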

In other words, you would need to pass your own callable as analyzer so that the analyzer logic is also applied to the n-grams, and you have to do that check yourself. The default behavior assumes that stop-word detection does not apply to n-grams, only to individual words.

Here is an example of a custom analyzer function for your case. It is based on this answer... Note that I have not tested it, so there may be bugs.

def trigram_match(i, trigram, words):
    # True if words[i] is the first, middle, or last word of `trigram`
    if i < len(words) - 2 and words[i:i + 3] == trigram:
        return True
    if 0 < i < len(words) - 1 and words[i - 1:i + 2] == trigram:
        return True
    if i > 1 and words[i - 2:i + 1] == trigram:
        return True
    return False


def custom_analyzer(text):
    # The stop phrase from the question, as a list of lowercased tokens
    bad_trigram = ['sinclair', 'broadcast', 'group']
    # Same naive tokenization as the question: 2+ word characters, lowercased
    words = [w.lower() for w in re.findall(r'\w{2,}', text)]
    for i, w in enumerate(words):
        # Skip single-word stop words and any word sitting inside the phrase
        if w in sw or trigram_match(i, bad_trigram, words):
            continue
        yield w
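If you go that route, a possible (untested) way to wire it in might look like the following, reusing docs from the question. Note that get_feature_names_out() requires sklearn >= 1.0:

# With a callable analyzer, CountVectorizer skips its own preprocessing,
# tokenization, stop words and ngram_range; the callable does everything.
vect = CountVectorizer(analyzer=custom_analyzer)
tf = vect.fit_transform(docs)
print(vect.get_feature_names_out())  # no 'sinclair', 'broadcast' or 'group'

As written, this analyzer yields only unigrams, so you would still need to extend it (and add lemmatization) if you want the 1- to 4-gram features from your original setup.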

Here is a custom analyzer that works for me. It is a bit hacky, but it does all of the text processing in a single step and is quite fast:

from functools import partial
from itertools import islice
import re

import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS


def window(seq, n=3):
    """Yield a sliding window of width n over seq."""
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result


class LemmaTokenizer(object):
    """Tokenize (3+ word characters), lowercase, drop stop words, lemmatize."""
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc, stop_words):
        return tuple(self.wnl.lemmatize(i.lower()) for i in
                     re.findall(r'\b\w{3,}\b', doc)
                     if i.lower() not in stop_words)


def analyzer(doc, stop_words=None, stop_phr=None, ngram_range=(1, 4)):
    if not stop_words:
        stop_words = set()
    if not stop_phr:
        stop_phr = set()
    start, stop = ngram_range
    lt = LemmaTokenizer()
    words = lt(doc, stop_words=stop_words)
    # Emit n-grams of every size in the range; n == 1 already covers the
    # unigrams, so no separate pass over single words is needed.
    for n in range(start, stop + 1):
        for ngram in window(words, n=n):
            res = ' '.join(ngram)
            if res not in stop_phr:
                yield res


analyzer_ = partial(analyzer, stop_words=ENGLISH_STOP_WORDS,
                    stop_phr={'sinclair broadcast group'})
vect = CountVectorizer(analyzer=analyzer_)
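A quick sanity check against the docs from the question (again assuming sklearn >= 1.0 for get_feature_names_out()):

tf = vect.fit_transform(docs)
feats = set(vect.get_feature_names_out())
# The exact stop phrase is filtered out; longer n-grams that merely
# contain it (e.g. 'ask sinclair broadcast group') are still produced.
assert 'sinclair broadcast group' not in feats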
