在Python中有效地匹配多个regex

import re import sys class Token(object): """ A simple Token structure. Contains the token type, value and position. """ def __init__(self, type, val, pos): self.type = type self.val = val self.pos = pos def __str__(self): return '%s(%s) at %s' % (self.type, self.val, self.pos) class LexerError(Exception): """ Lexer error exception. pos: Position in the input line where the error occurred. """ def __init__(self, pos): self.pos = pos class Lexer(object): """ A simple regex-based lexer/tokenizer. See below for an example of usage. """ def __init__(self, rules, skip_whitespace=True): """ Create a lexer. rules: A list of rules. Each rule is a `regex, type` pair, where `regex` is the regular expression used to recognize the token and `type` is the type of the token to return when it's recognized. skip_whitespace: If True, whitespace (\s+) will be skipped and not reported by the lexer. Otherwise, you have to specify your rules for whitespace, or it will be flagged as an error. """ self.rules = [] for regex, type in rules: self.rules.append((re.compile(regex), type)) self.skip_whitespace = skip_whitespace self.re_ws_skip = re.compile('\S') def input(self, buf): """ Initialize the lexer with a buffer as input. """ self.buf = buf self.pos = 0 def token(self): """ Return the next token (a Token object) found in the input buffer. None is returned if the end of the buffer was reached. In case of a lexing error (the current chunk of the buffer matches no rule), a LexerError is raised with the position of the error. """ if self.pos >= len(self.buf): return None else: if self.skip_whitespace: m = self.re_ws_skip.search(self.buf[self.pos:]) if m: self.pos += m.start() else: return None for token_regex, token_type in self.rules: m = token_regex.match(self.buf[self.pos:]) if m: value = self.buf[self.pos + m.start():self.pos + m.end()] tok = Token(token_type, value, self.pos) self.pos += m.end() return tok # if we're here, no rule matched raise LexerError(self.pos) def tokens(self): """ Returns an iterator to the tokens found in the buffer. """ while 1: tok = self.token() if tok is None: break yield tok if __name__ == '__main__': rules = [ ('\d+', 'NUMBER'), ('[a-zA-Z_]\w+', 'IDENTIFIER'), ('\+', 'PLUS'), ('\-', 'MINUS'), ('\*', 'MULTIPLY'), ('\/', 'DIVIDE'), ('\(', 'LP'), ('\)', 'RP'), ('=', 'EQUALS'), ] lx = Lexer(rules, skip_whitespace=True) lx.input('erw = _abc + 12*(R4-623902) ') try: for tok in lx.tokens(): print tok except LexerError, err: print 'LexerError at position', err.pos

3条回答

网友

1楼 · 编辑于 2024-05-23 13:52:14

我建议使用re.Scanner类，它没有在标准库中记录，但很值得使用。下面是一个例子：

import re

scanner = re.Scanner([
    (r"-?[0-9]+\.[0-9]+([eE]-?[0-9]+)?", lambda scanner, token: float(token)),
    (r"-?[0-9]+", lambda scanner, token: int(token)),
    (r" +", lambda scanner, token: None),
])

>>> scanner.scan("0 -1 4.5 7.8e3")[0]
[0, -1, 4.5, 7800.0]

网友

2楼 · 编辑于 2024-05-23 13:52:14

您可以使用“|”运算符将所有regex合并为一个regex，并让regex库完成区分标记的工作。应注意确保标记的首选项（例如，避免将关键字匹配为标识符）。

网友

3楼 · 编辑于 2024-05-23 13:52:14

我在python文档中找到了this。只是简单而优雅。

import collections
import re

Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])

def tokenize(s):
    keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
    token_specification = [
        ('NUMBER',  r'\d+(\.\d*)?'), # Integer or decimal number
        ('ASSIGN',  r':='),          # Assignment operator
        ('END',     r';'),           # Statement terminator
        ('ID',      r'[A-Za-z]+'),   # Identifiers
        ('OP',      r'[+*\/\-]'),    # Arithmetic operators
        ('NEWLINE', r'\n'),          # Line endings
        ('SKIP',    r'[ \t]'),       # Skip over spaces and tabs
    ]
    tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
    get_token = re.compile(tok_regex).match
    line = 1
    pos = line_start = 0
    mo = get_token(s)
    while mo is not None:
        typ = mo.lastgroup
        if typ == 'NEWLINE':
            line_start = pos
            line += 1
        elif typ != 'SKIP':
            val = mo.group(typ)
            if typ == 'ID' and val in keywords:
                typ = val
            yield Token(typ, val, line, mo.start()-line_start)
        pos = mo.end()
        mo = get_token(s, pos)
    if pos != len(s):
        raise RuntimeError('Unexpected character %r on line %d' %(s[pos], line))

statements = '''
    IF quantity THEN
        total := total + price * quantity;
        tax := price * 0.05;
    ENDIF;
'''

for token in tokenize(statements):
    print(token)

这里的诀窍是：

tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)

这里(?P<ID>PATTERN)将用ID指定的名称标记匹配的结果。

相关问题更多 >

编程相关推荐

热门问题

热门文章