根据字符串列表构建二叉解析树的表示

import collections

import nltk


def tree_to_spans(tree):
    """Collect the labelled multi-leaf spans of a parse tree.

    Args:
        tree: An ``nltk.Tree``, or a bracketed string that is parsed with
            ``nltk.Tree.fromstring``.

    Returns:
        A list of ``(label, (start, end))`` tuples, one for every subtree
        whose span ``end - start`` is greater than 1. ``start`` and ``end``
        are leaf offsets (``end`` exclusive). A span is appended when its
        subtree is popped from the stack, i.e. after all its descendants.
    """
    if isinstance(tree, str):
        tree = nltk.Tree.fromstring(tree)

    # NOTE(review): the walk below relies on treepositions() yielding each
    # node before its children (NLTK's default "preorder") - confirm if a
    # different NLTK version or order is ever used.
    queue = collections.deque(tree.treepositions())
    # Each stack entry is (tree position, leaf offset where that node starts).
    stack = [(queue.popleft(), 0)]
    j = 0  # leaf offset just past the most recently finished node
    spans = []
    while stack:
        p, i = stack[-1]
        if not queue or queue[0][:-1] != p:
            # The next queued position is not a child of p, so p is finished.
            if isinstance(tree[p], nltk.tree.Tree):
                if j - i > 1:
                    spans.append((tree[p].label(), (i, j)))
            else:
                # A leaf occupies exactly one slot.
                j = i + 1
            stack.pop()
        else:
            # Descend into the next child of p; it starts at the current end.
            q = queue.popleft()
            stack.append((q, j))
    return spans

def get_constituents(sample_string):
    """Return the text of each constituent of a bracketed parse string.

    Args:
        sample_string: Bracketed tree string parsed with
            ``nltk.Tree.fromstring``.

    Returns:
        A list of space-joined word sequences, one per span returned by
        ``evaluate.tree_to_spans``, followed by the complete sentence.
    """
    parsed = nltk.Tree.fromstring(sample_string)
    # NOTE(review): `evaluate` is not imported in the visible text, and each
    # span is sliced below as span[0]:span[1], so this presumably expects
    # (start, end) pairs - confirm against the evaluate module.
    span_list = evaluate.tree_to_spans(parsed)
    tokens = " ".join(pair[0] for pair in parsed.pos()).split()

    pieces = []
    for span in span_list:
        pieces.append(" ".join(tokens[span[0]: span[1]]))
    # Add original sentence
    pieces.append(" ".join(tokens))
    return pieces

# Sample constituent strings; the last entry is the complete sentence.
parts = [
    "Our intent",
    "the best",
    "the best alternative",
    "promote the best alternative",
    "to promote the best alternative",
    "Our intent is to promote the best alternative",
    "Our intent is to promote the best alternative he says",
]

1条回答

网友

1楼 · 发布于 2024-06-13 05:54:38

我会以相反的顺序遍历这些成分，从而得到树的前序遍历（先遍历右侧，再遍历左侧）。因此，这个解决方案假设成分的顺序与您的代码生成它们的顺序相同。

利用递归，可以逐一重建每棵子树：

def to_tree(parts):
    """Rebuild a bracketed binary tree string from constituent strings.

    The list is consumed from its last element backwards: the last element
    is treated as the whole sentence, and each earlier element is expected
    to be a prefix, suffix or inner chunk of the part currently being split.

    Args:
        parts: List of space-separated constituent strings, with the full
            sentence last.

    Returns:
        A string of the form ``"(ROOT (S ...))"`` in which every word and
        every reconstructed constituent is wrapped in an ``(S ...)`` node.

    Raises:
        AssertionError: If a part of three or more words cannot be split
            using the remaining entries ("Could not parse input").
        IndexError: If ``parts`` is empty.
    """
    i = len(parts) - 1  # cursor into parts, shared with recur() via nonlocal

    def recur(part, expect=False):
        """Return the tree string for ``part``.

        ``expect`` is True when ``part`` was produced by cutting a listed
        constituent off a larger part; such a remainder may itself be listed
        immediately before the cursor, in which case that entry is consumed.
        """
        nonlocal i
        words = part.split()
        if len(words) == 1:  # leaf
            return "(S {})".format(words[0])
        if expect and i > 0 and parts[i-1] == part:
            i -= 1
        if len(words) == 2:  # 2 leaves
            return "(S (S {}) (S {}))".format(*words)
        i -= 1
        if i < 0:
            # Nothing left to split with. Without this guard parts[i] would
            # wrap around to the end of the list instead of failing here.
            raise AssertionError("Could not parse input")
        nextpart = parts[i]
        # In every branch the right side is resolved before the left side,
        # because the cursor walks the list from the end towards the start.
        if part.endswith(" " + nextpart):
            right = recur(nextpart)
            left = recur(part[0:-len(nextpart)-1], True)
        elif part.startswith(nextpart + " "):
            right = recur(part[len(nextpart)+1:], True)
            left = recur(nextpart)
        else:
            sides = part.split(" " + nextpart + " ")
            if len(sides) != 2:
                # Raised explicitly so the check survives `python -O`; the
                # exception type and message match the former assert.
                raise AssertionError("Could not parse input")
            right = recur(sides[1], True)
            left = recur(sides[0], True)
        return "(S {} {})".format(left, right)

    return "(ROOT {})".format(recur(parts[i]))

该示例可以按以下方式运行：

# Example input: constituent strings with the full sentence as the last item.
parts = [
    "Our intent",
    "Our intent is",
    "the best",
    "the best alternative",
    "promote the best alternative",
    "to promote the best alternative",
    "Our intent is to promote the best alternative",
    "he says",
    "Our intent is to promote the best alternative he says",
]

# Print the bracketed tree string rebuilt from the list.
print(to_tree(parts))

…将输出原始字符串

根据您的编辑，即使从输入中删除一些内容，上述代码仍然可以正常工作。例如，可以从输入中删除 'Our intent is' 和 'he says'，而输出仍然相同。但这也有局限性：如果删除的内容过多，就无法再重建这棵树。

相关问题更多 >

编程相关推荐

热门问题

热门文章