python：基于字典的分词

def getallpossiblewords(string):
    """Return every substring of ``string`` found in the corpus, longest first.

    The vocabulary is whatever ``preprocessingcorpus("corpus.txt")`` returns;
    each candidate substring is tested against it with ``in``.
    """
    vocabulary = preprocessingcorpus("corpus.txt")
    length = len(string)
    # Every (start, stop) pair is tried; pairs with stop <= start slice to ""
    # and are still tested against the vocabulary.
    found = [
        string[start:stop]
        for start in range(length)
        for stop in range(1, length + 1)
        if string[start:stop] in vocabulary
    ]
    # Stable sort: equally long candidates keep their discovery order.
    found.sort(key=len, reverse=True)
    return found


def wordseg(string):
    """Collect the splits of ``string`` that contain at least one known word.

    NOTE(review): the snippet as published stops after the de-duplication
    step and returns nothing; the tail of the function was presumably lost.
    """
    known_words = getallpossiblewords(string)
    size = len(string)

    # Every way of choosing 0 .. size-1 cut positions between characters.
    cut_sets = []
    for how_many in range(size):
        cut_sets.extend(combinations(range(1, size), how_many))

    candidates = []
    for cut_set in cut_sets:
        # Consecutive boundaries delimit the pieces of this split.
        bounds = (0, *cut_set, size)
        pieces = [string[lo:hi] for lo, hi in zip(bounds, bounds[1:])]
        # Keep the split when any piece is a known word; repeated hits would
        # only create duplicates that the next step removes anyway.
        if any(piece in known_words for piece in pieces):
            candidates.append(pieces)

    # Order-preserving de-duplication (lists are unhashable, so no set).
    unique_candidates = []
    for candidate in candidates:
        if candidate not in unique_candidates:
            unique_candidates.append(candidate)

1条回答

网友

1楼 · 发布于 2024-04-28 22:25:53

这看起来非常适合用 str.partition() 做递归处理。下面是我的示例实现；我不敢说它解决了所有问题（毕竟题目并没有给出测试用例），只是想借此说明这种思路的可行性：

def segmented(string, vocabulary=None):
    """Return the set of segmentations of ``string`` around known words.

    For each word that occurs in ``string``, the text is split at the word's
    first occurrence (``str.partition``) and both sides are segmented
    recursively. Each segmentation is a tuple of pieces. A side that contains
    no known word is broken into its individual characters, because the
    leftover string itself is unpacked with ``*``.

    Args:
        string: The text to segment.
        vocabulary: Iterable of known words. When omitted, the module-level
            ``words`` list is used.

    Returns:
        A set of tuples; empty when no known word occurs in ``string``.
    """
    if vocabulary is None:
        # Backward compatible default: the module-level word list.
        vocabulary = words

    segmentations = set()

    for word in vocabulary:
        before, match, after = string.partition(word)

        if not match:
            continue

        # ``or [text]`` makes both collections non-empty in every case, so a
        # single nested loop covers all situations. An element is either a
        # tuple from the recursive call or the raw leftover string.
        prefixes = segmented(before, vocabulary) or [before]
        suffixes = segmented(after, vocabulary) or [after]

        for prefix in prefixes:
            for suffix in suffixes:
                segmentations.add((*prefix, word, *suffix))

    return segmentations

# Known words that segmented() looks for inside the input text.
words = ["as", "ass", "share", "rest"]

# Demo: print every segmentation found for a sample string.
print(segmented(string="xassharest"))

输出

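{('x', 'as', 'share', 's', 't'), ('x', 'as', 's', 'h', 'a', 'rest'), ('x', 'ass', 'h', 'a', 'rest')}
（结果是一个集合，元素的打印顺序可能不同。）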

相关问题更多 >

编程相关推荐

热门问题

热门文章