抽象正则表达式

2024-05-14 08:12:43 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在用俄语处理查询(将它们翻译成要执行的SQL代码)。我将查询标记化,进行形态分析,从引理中获得指向数据库对象的链接。所以现在我想使用正则表达式之类的模式来获取条件、ORDER BY表达式等等。但问题是,正则表达式只能与字符(字符串)列表一起使用

是否有一个Python库/解决方案(最好)可以像正则表达式一样工作,但可以用于任何类型的对象(具有语法属性的标记列表或数据库对象),而不仅仅是字符串

因此,作为一个例子,我想写一些模式,看起来像这样:

[db-column]((','|'and')[db-column])*

此模式将匹配以下数据库对象列表:columnA、columnB和columnC


Tags: 对象字符串代码标记数据库列表dbsql
1条回答
网友
1楼 · 发布于 2024-05-14 08:12:43

如果有人对这个主题感兴趣,有一个GitHub repository with my work,我为抽象正则表达式编写了以下代码:

"""
  AbstractRegularExpressions.py
  
  Abstract Regular Expressions.
  They are like regular expressions, but can work with any kinds of objects,
  not just characters as in regex.
  This code is based on Mark-Jason Dominus's article «How Regexes Work»,
  you can find it here: https://perl.plover.com/Regex/article.html
"""

# NFA - Nondeterministic Finite Automata (machine).

# Used for defining custom operators.
# http://code.activestate.com/recipes/384122/
class Infix:
  def __init__(self, function):
    self.function = function
  def __ror__(self, other):
    return Infix(lambda x, self=self, other=other: self.function(other, x))
  def __or__(self, other):
    return self.function(other)
  def __rlshift__(self, other):
    return Infix(lambda x, self=self, other=other: self.function(other, x))
  def __rshift__(self, other):
    return self.function(other)
  def __call__(self, value1, value2):
    return self.function(value1, value2)

# Alternatives («|» in regex): a|b|c (in regex) <==> a |OR| b |OR| c (here). 
class Cases:
  def __init__(self, cases):
    self.cases = cases
  def add(self, case):
    self.cases.append(case)
    return self
# Operator for joining cases, used like this: a |OR| b |OR| c.
OR = Infix(lambda x, y: x.add(y) if isinstance(x, Cases) else Cases([x, y]))

# Primitives are used to test one token for some predicate.
# In regex there's only one primitive — whether a token is equal to some character or not.
# Examples of primitives: «is token a table column?», «is token's type — string» etc.
class Primitive:
  def __init__(self, name, predicate):
    self.predicate = predicate
    self.name = name
  def test(self, *args):
    return self.predicate(*args)
  def __str__(self):
    return self.name

# Transitions are the arrows in NFAs,
# one transition leads from one state to another if the pattern's condition is met for a token.
# If there is no pattern (pattern = None), then you can always go to the next state (epsilon-transition).
class Transition:
  def __init__(self, pattern, nextState=None):
    self.pattern = pattern
    self.nextState = nextState

stateID = 0 # Only used for pretty-printing Patterns (machines).
# State is just a bunch of transitions leading to other states or the final state (None).
class State:
  def __init__(self, transitions):
    global stateID
    stateID += 1
    self.ID = stateID
    self.transitions = transitions
  def __str__(self):
    return str(self.ID)

# Recursively creates an NFA
# (makes simple NFAs from primitives and then connects them with connectMachines).
def makeMachine(pattern):
  # Subpatterns are treated the same as primitives.
  if (isinstance(pattern, Pattern) or isinstance(pattern, Primitive)):
    return State({ Transition(pattern) })
  # Make a machine out of each member of a list and then connect them with connectMachines.
  elif (isinstance(pattern, list)):
    if (len(pattern) == 0): return None
    accMachine = makeMachine(pattern[-1])
    for i in range(1, len(pattern)):
      accMachine = connectMachines(makeMachine(pattern[-1 - i]), accMachine)
    return accMachine
  # Quantifiers (+, ?, *).
  elif (isinstance(pattern, tuple)):
    (p, quantifier) = pattern
    machine = makeMachine(p)
    if (quantifier == '+' or quantifier == '*'):
      endTransitions = []
      for state in statesIterator(machine, set()):
        for transition in state.transitions:
          if transition.nextState == None:
            endTransitions.append((state, transition))
      for (state, transition) in endTransitions:
        transition.nextState = machine
        state.transitions.add(Transition(transition.pattern, None))
    if (quantifier == '?' or quantifier == '*'):
      if (len([m for m in machine.transitions if m.pattern == None and m.nextState == None]) == 0):
        machine.transitions.add(Transition(None))
    return machine
  # For cases (a |OR| b |OR| c).
  elif (isinstance(pattern, Cases)):
    return combineMachines([makeMachine(p) for p in pattern.cases])

# Goes through each state of a machine.
def statesIterator(state, passedStates = set()):
  if (state != None): yield state
  else: return
  passedStates.add(state)
  for transition in state.transitions:
    if (not transition.nextState in passedStates):
      for s in statesIterator(transition.nextState, passedStates):
        yield s

# Connects two machines into one (replaces finish states of machineA with start states of machineB).
def connectMachines(machineA, machineB):
  endTransitions = []
  for state in statesIterator(machineA, set()):
    for transition in state.transitions:
      if transition.nextState == None:
        endTransitions.append(transition)
  for transition in endTransitions:
    transition.nextState = machineB
  return machineA

# Connects cases of machines (machineA |OR| machineB |OR| ... |OR| machineN) into one machine.
def combineMachines(machines):
  return State({ t for machine in machines for t in machine.transitions })

# Pattern, as in regular expressions.
# Example: p = Pattern('name', [a, b, c, (d, '+') |OR| e])
# which gives you a regex «abc(d+|e)»,
# where a, b, c, d and e are another patterns or primitives.
class Pattern:
  def __init__(self, name, pattern):
    global stateID
    stateID = 0
    self.name = name
    self.machine = makeMachine(pattern)
  def __str__(self):
    return self.name

# Structure stores tokens of a found pattern.
class Structure:
  def __init__(self, name, elements=None):
    self.name = name
    self.elements = elements if elements != None else []
  def __str__(self):
    return f'  - {self.name}  - {[str(el) for el in self.elements]}'

# Class for storing a token and pattern which found this token.
class PatternToken:
  def __init__(self, pattern, token):
    self.pattern = pattern
    self.token = token
  def __str__(self):
    return f'{self.pattern}: {self.token.text}'

# Used for storing linked matched tokens.
class CurrentState:
  def __init__(self, token, transition=None, previousState=None, patternsStack=[]):
    self.transition = transition
    self.token = token
    self.previousState = previousState
    self.patternsStack = patternsStack
  def __str__(self):
    t = self.token.text if self.token != None else ''
    return f'== [{self.transition.pattern.name} ({len(self.patternsStack)}): {t}]\n{self.previousState}'
  # Connects linked CurrentStates into one Structure.
  def connect(self, name):
    state = self
    structuresStack = []
    result = []
    indexes = []
    while True:
      token = state.token
      transition = state.transition
      # Pattern
      if (isinstance(transition.pattern, Pattern)):
        if (len(structuresStack) == 0 or structuresStack[-1].name != transition.pattern.name):
          structuresStack.append(Structure(transition.pattern.name))
        else:
          structure = structuresStack.pop()
          structure.elements = structure.elements[::-1] # reverse the elements.
          if (len(structuresStack) == 0):
            result.append(structure)
          else:
            structuresStack[-1].elements.append(structure)
      # Primitive
      elif (isinstance(transition.pattern, Primitive)):
        indexes.append(token.index)
        if (len(structuresStack) == 0):
          result.append(PatternToken(transition.pattern, token))
        else:
          structuresStack[-1].elements.append(PatternToken(transition.pattern, token))
      state = state.previousState
      if (state == None): break
    return ((min(indexes), max(indexes)), Structure(name, result[::-1]))

# Used for running a pattern on a list of tokens.
class Automata:
  def __init__(self, pattern):
    self.pattern = pattern
    self.finalStates = set()
    self.currentStates = set()
  def feedToken(self, token):
    currentStates = set(self.currentStates)
    self.currentStates = set()
    for state in currentStates:
      for transition in state.transition.nextState.transitions:
        self.processTransition(transition, token, state)
    for transition in self.pattern.machine.transitions:
      self.processTransition(transition, token)
  def __str__(self):
    return "\n\n".join([str(state) for state in self.finalStates])
  def processTransition(self, transition, token, previousState=None):
    # Epsilon
    if (transition.pattern == None):
      if (transition.nextState != None):
        for t in transition.nextState.transitions:
          self.processTransition(t, token, previousState)
      elif (len(previousState.patternsStack) == 0):
        self.finalStates.add(previousState)
      else:
        newState = previousState
        while True:
          patternsStack = list(newState.patternsStack)
          patternState = patternsStack.pop()
          newState = CurrentState(None, patternState.transition, newState, patternsStack)
          if (newState.transition.nextState != None):
            for t in newState.transition.nextState.transitions:
              self.processTransition(t, token, newState)
            break
          if (len(newState.patternsStack) == 0):
            self.finalStates.add(newState)
            break
    # Primitive
    elif (isinstance(transition.pattern, Primitive)):
      if (transition.pattern.test(token)):
        patternsStack = previousState.patternsStack if previousState != None else []
        newState = CurrentState(token, transition, previousState, list(patternsStack))
        if (newState.transition.nextState != None):
          self.currentStates.add(newState)
        elif (len(newState.patternsStack) == 0):
          self.finalStates.add(newState)
        else:
          while True:
            patternsStack = list(newState.patternsStack)
            patternState = patternsStack.pop()
            newState = CurrentState(None, patternState.transition, newState, patternsStack)
            if (newState.transition.nextState != None):
              self.currentStates.add(newState)
              break
            if (len(newState.patternsStack) == 0):
              self.finalStates.add(newState)
              break
    # Pattern
    elif (isinstance(transition.pattern, Pattern)):
      patternsStack = previousState.patternsStack if previousState != None else []
      newState = CurrentState(None, transition, previousState, list(patternsStack))
      newState.patternsStack.append(newState)
      for t in transition.pattern.machine.transitions:
        self.processTransition(t, token, newState)

# Pretty-print a machine.
def printMachine(machine):
  padding = 0
  for state in statesIterator(machine, set()):
    for t in state.transitions:
      print(padding*' ' + f'{state}:  {t.pattern}->{t.nextState}')
    padding += 1

# Pretty-print a pattern.
def printPattern(pattern):
  print(f'-=-=-=-=-= {pattern.name} =-=-=-=-=-')
  printMachine(pattern.machine)

这允许您编写如下模板:

# Selecting
table = Primitive('table', lambda token: token.type == 'table')
column = Primitive('column', lambda token: token.type == 'column')
columnExpr = Pattern('columnExpr', [(operator, '*'), column])
columnLiteralExpr = Pattern('columnExpr', [(operator, '*'), column |OR| literal])
listOfTables = Pattern('listOfTables', [table])
listOfColumns = Pattern('listOfColumns', [columnExpr, ([connector, columnExpr], '*'), (table, '?')])
selectExpr = Pattern('selectExpr', [listOfColumns |OR| listOfTables])

相关问题 更多 >

    热门问题