在Python中快速实现括号的嵌套拆分有什么方法?
我有一个文件,格式如下:
ID1 { some text }
ID2 { some text }
这些内容不一定是逐行排列的,所以我们可能会看到:
ID1 { some [crlf]
text [crlf]
}
ID2 [crlf] { some t [crlf]
ex [crlf]
t}
也就是说,some text 可能会占用多行,并且在 ID 后面可能会紧跟着一个 CRLF。最重要的是,每个 ID 后面的内容都是用 { 和 } 包起来的。
问题是,some text 本身可能也会包含 { 和 }。
有没有什么快速的方法可以把这样的文件分割成一个字符串列表,每个字符串的格式是 ID { text },同时还要考虑到嵌套的括号?
如果能考虑到一些错误分析,比如括号不匹配的情况,那就更好了。
4 个回答
2
这个问题被打上了“regex”(正则表达式)标签,但众所周知,正则表达式无法匹配任意深度的嵌套括号,所以在这里并不适用。你有没有看过“pyparsing”这个库呢?
[编辑]
另外,这个方法可能有效:
from functools import wraps
def transition(method):
    """Decorate a state method so that its result switches the state's class.

    The wrapped method must return a *command*: a callable that takes the
    state object and returns the class the state should become.  The wrapper
    calls the method, applies the command, and rebinds ``state.__class__`` to
    the result.  The wrapper itself returns ``None``.
    """
    @wraps(method)
    def trans(state, *args, **kwargs):
        command = method(state, *args, **kwargs)
        # Re-classing the instance in place is what performs the transition.
        state.__class__ = command(state)
    return trans
class State(object):
    """Base class for state objects whose class is swapped at runtime.

    Every instance carries ``_identities``, a list used as a stack of saved
    state classes.  It is created in ``__new__`` so that it exists even when
    a subclass never calls a base ``__init__``.
    """

    def __new__(cls):
        state = object.__new__(cls)
        state._identities = []
        return state
def unchanged(state):
    """Command that keeps the state in its current class."""
    return state.__class__
def shifting(identity):
    """Return a command that moves the state to the class *identity*."""
    def command(state):
        return identity
    return command
def pushing(identity, afterwards=None):
    """Return a command that saves a class on the stack and enters *identity*.

    The saved class is *afterwards* when it is truthy, otherwise the state's
    current class.  It is appended to ``state._identities``.
    """
    def command(state):
        state._identities.append(afterwards or state.__class__)
        return identity
    return command
def popped(state):
    """Command that returns to the most recently saved class.

    Removes and returns the last entry of ``state._identities``; raises
    ``IndexError`` if nothing was saved.
    """
    return state._identities.pop()
##############################################################################
import re
# Tokenizer: calling ``tokenize(text)`` yields one match per token, and
# ``match.lastgroup`` names the token type.  White space between tokens is
# skipped because no alternative matches it.
# ``\Z`` is used for end-of-input on purpose: ``$`` combined with
# re.MULTILINE also matches before every newline, which would report "eoi"
# at the end of each line of a multi-line input.
tokenize = re.compile(flags=re.VERBOSE, pattern=r"""
    (?P<word>       \w+ ) |
    (?P<braceleft>  \{  ) |
    (?P<braceright> \}  ) |
    (?P<eoi>        \Z  ) |   # end of the whole input only
    (?P<error>      \S  )     # catch all (except white space)
""").finditer
def parse(parser, source, builder):
    """Feed every token of *source* to *parser*.

    For each token, the method of *parser* named after the token type is
    looked up and called with the token text and *builder*.  The lookup is
    repeated per token so that a parser which changes its own class between
    tokens is dispatched on its current class.
    """
    for each in tokenize(source):
        dispatch = getattr(parser, each.lastgroup)
        dispatch(each.group(), builder)
class ParsingState(State):
    """Common behaviour of all parser states: reject anything unexpected.

    Concrete states override the handlers for the token types they accept;
    every other token type ends in a ``ValueError``.
    """

    def eoi(self, token, *args):
        """Reject end of input; states where it is legal override this."""
        raise ValueError('premature end of input in parsing state %s' %
            self.__class__.__name__
        )

    def error(self, token, *args):
        """Reject a character the tokenizer could not classify."""
        raise ValueError('parsing state %s does not understand token %s' % (
            self.__class__.__name__, token
        ))

    def __getattr__(self, name):
        """Return a handler that rejects tokens of the unhandled type *name*.

        Only reached when normal attribute lookup fails.  Private and dunder
        names are not token types, so they raise ``AttributeError`` as usual
        instead of being answered with a token handler.
        """
        if name.startswith('_'):
            raise AttributeError(name)

        def raiser(token, *args):
            raise ValueError(
                'parsing state %s does not understand token "%s" of type %s' %
                (self.__class__.__name__, token, name)
            )
        return raiser
class Id(ParsingState):
    """State that expects the identifier of the next entry, or end of input."""

    @transition
    def word(self, token, builder):
        """Start a new entry named *token*, then expect its opening brace."""
        builder.add_id(token)
        return shifting(BeginContent)

    @transition
    def eoi(self, token, builder):
        """End of input between entries is legal: finish parsing."""
        return shifting(DoneParsing)
class BeginContent(ParsingState):
    """State right after an identifier: only an opening brace is accepted."""

    @transition
    def braceleft(self, token, builder):
        """Enter the entry body; the outer brace is not stored as text."""
        return shifting(Content)
class Content(ParsingState):
    """State inside the outermost braces of an entry."""

    @transition
    def word(self, token, builder):
        """Add the word to the current entry and stay in this state."""
        builder.add_text(token)
        return unchanged

    @transition
    def braceleft(self, token, builder):
        """Record a nested opening brace and descend one level.

        The current class is saved so the matching closing brace can return
        to it.
        """
        builder.add_text(token)
        return pushing(PushedContent)

    @transition
    def braceright(self, token, builder):
        """Close the entry; the next token must be an identifier or the end."""
        return shifting(Id)
class PushedContent(Content):
    """State inside a nested brace pair within an entry body.

    Words and further opening braces are handled as in ``Content``; only the
    closing brace differs.
    """

    @transition
    def braceright(self, token, builder):
        """Record the closing brace and return to the saved enclosing state."""
        builder.add_text(token)
        return popped
class DoneParsing(ParsingState):
    """Final state after end of input; it defines no handlers of its own."""
    pass
##############################################################################
class Entry(object):
    """One parsed ``ID { text }`` item: an identifier and its text tokens."""

    def __init__(self, idname):
        self.idname = idname
        # Tokens of the body, in input order (nested braces included).
        self.text = []

    def __str__(self):
        """Render as ``ID { tok tok ... }`` with single spaces between tokens."""
        return '%s { %s }' % (self.idname, ' '.join(self.text))
class Builder(object):
    """Collects the entries produced while parsing."""

    def __init__(self):
        # Entries in the order their identifiers were seen.
        self.entries = []

    def add_id(self, id_token):
        """Start a new entry named *id_token*."""
        self.entries.append(Entry(id_token))

    def add_text(self, text_token):
        """Append *text_token* to the most recently started entry.

        Raises ``IndexError`` if no entry has been started yet.
        """
        self.entries[-1].text.append(text_token)
##############################################################################
if __name__ == '__main__':
    # Demo: two entries, the second spanning several lines with nested braces.
    file_content = """
id1 { some text } id2 {
some { text }
}
"""
    builder = Builder()
    parse(Id(), file_content, builder)
    for entry in builder.entries:
        # Parenthesised single-argument print is valid on Python 2 and 3.
        print(entry)
3
这是一个简单的问题,关于“我该如何写一个递归下降解析器来匹配括号”。
给定这个语法:
STMT_LIST := STMT+
STMT := ID '{' DATA '}'
DATA := TEXT | STMT
ID := [a-z0-9]+
TEXT := [^}]*
一个解析器可能看起来像这样:
import sys
import re
def parse(data):
    """
    STMT_LIST

    Parse every statement in *data* and print each one as the ``repr`` of an
    ``(id, clause)`` tuple.  Leading white space is skipped first, because a
    statement must start directly with its ID.
    """
    data = data.lstrip()
    while data:
        data, statement_id, clause = parse_statement(data)
        print(repr((statement_id, clause)))
def consume_whitespace(data):
    """Return *data* without its leading white space."""
    return data.lstrip()
def parse_statement(data):
    """
    STMT := ID '{' DATA '}'

    Parse one statement from the start of *data*.

    Returns a tuple ``(rest, statement_id, clause)`` where *rest* is the
    unparsed remainder with leading white space removed.

    Raises ``ValueError`` if *data* does not start with an ID.
    """
    m = re.match(r'[a-zA-Z0-9]+', data)
    if not m:
        raise ValueError("No ID found")
    statement_id = m.group(0)
    # White space (including line breaks) may separate the ID from its '{'.
    data = consume_whitespace(data[m.end():])
    data, clause = parse_clause(data)
    return consume_whitespace(data), statement_id, clause
def parse_clause(data):
    """
    Parse one brace-delimited clause from the start of *data*.

    *data* must begin with '{'.  Returns ``(rest, clause)``: *rest* is the
    input after the matching '}', and *clause* is a list of text pieces in
    which each nested ``{...}`` appears as a nested list.

    Raises ``ValueError`` if *data* does not start with '{' or if a closing
    '}' is missing.
    """
    if not data.startswith('{'):
        raise ValueError("No { found")
    clause = []
    data = data[1:]
    while True:
        closebrace = data.find('}')
        if closebrace == -1:
            raise ValueError("No } found")
        openbrace = data.find('{')
        # No '{' before the next '}' means that '}' closes this clause.
        if openbrace == -1 or openbrace > closebrace:
            break
        clause.append(data[:openbrace])
        data, subclause = parse_clause(data[openbrace:])
        clause.append(subclause)
    clause.append(data[:closebrace])
    return data[closebrace + 1:], clause
# Sample runs: a single statement, then two statements in one input.
for sample in (
    "ID { foo { bar } }",
    "ID { foo { bar } } baz { tee fdsa { fdsa } }",
):
    parse(sample)
老实说,这个解析器写得有点复杂。如果你想让它结构更好,你应该先用一个词法分析器(lexer)生成一个合适的令牌流,然后再把这个流传给真正的解析器。现在的“令牌流”其实只是一个字符串,我们从中逐步剥离信息。
如果你想做得更复杂一点,我建议你看看pyparsing这个库。
4
使用pyparsing这个工具,你可以用大约6行代码就完成这个任务,然后继续做其他的事情。下面有两种不同的解决方案,具体取决于你希望解析结果的结构是什么样的:
from pyparsing import Word, alphas, alphanums, dictOf, nestedExpr, originalTextFor

data = """ID1 { some text } ID2 { some {with some more text nested in braces} text }"""

# identifier starts with any alpha, followed by any alpha, num, or '_'
ident = Word(alphas,alphanums+"_")

# Solution 1
# list of items is a dict of pairs of idents and nested {}'s
# - returns {}'s expressions as nested structures
itemlist = dictOf(ident, nestedExpr("{","}"))
items = itemlist.parseString(data)
# Parenthesised single-argument print is valid on Python 2 and 3.
print(items.dump())
"""
prints:
[['ID1', ['some', 'text']], ['ID2', ['some', ['with', 'some', 'more', ...
- ID1: ['some', 'text']
- ID2: ['some', ['with', 'some', 'more', 'text', 'nested', 'in', 'braces'], 'text']
"""

# Solution 2
# list of items is a dict of pairs of idents and nested {}'s
# - returns {}'s expressions as strings of text extract from the
# original input string
itemlist = dictOf(ident, originalTextFor(nestedExpr("{","}")))
items = itemlist.parseString(data)
print(items.dump())
"""
prints:
[['ID1', '{ some text }'], ['ID2', '{ some {with some more text nested in ...
- ID1: { some text }
- ID2: { some {with some more text nested in braces} text }
"""