拆分可以包含引号字符串的括号分隔文本

# Tests for split_text: splitting a comma-separated sequence of bracketed
# groups whose values may be quoted strings containing commas, brackets and
# backslash-escaped apostrophes.
from splitter import split_text


def test_normal():
    """Groups holding a single quoted or unquoted value."""
    assert split_text("('1'),('2')") == ["('1')", "('2')"]
    assert split_text("(1),(2),(3)") == ["(1)", "(2)", "(3)"]


def test_complex():
    """Groups holding several values, including the bare NULL literal."""
    assert split_text("('1','a'),('2','b')") == ["('1','a')", "('2','b')"]
    assert split_text("('1','a',NULL),(2,'b')") == ["('1','a',NULL)", "(2,'b')"]


def test_apostrophe():
    """A backslash-escaped apostrophe must not terminate the string."""
    assert split_text("('\\'1','a'),('2','b')") == ["('\\'1','a')", "('2','b')"]


def test_coma_in_string():
    """A comma inside a quoted string is not a separator."""
    assert split_text("('1','a,c'),('2','b')") == ["('1','a,c')", "('2','b')"]


def test_bracket_in_string():
    """A closing bracket inside a quoted string does not end the group."""
    assert split_text("('1','a)c'),('2','b')") == ["('1','a)c')", "('2','b')"]


def test_bracket_and_coma_in_string():
    """The full separator sequence inside a string is left untouched."""
    assert split_text("('1','a),(c'),('2','b')") == ["('1','a),(c')", "('2','b')"]


def test_bracket_and_coma_in_string_apostrophe():
    """An escaped apostrophe followed by a separator sequence stays in the string."""
    assert split_text("('1','a\\'),(c'),('2','b')") == ["('1','a\\'),(c')", "('2','b')"]

# Parser states: between groups, inside a (...) group, inside a quoted
# string, and immediately after a backslash inside a quoted string.
OUTSIDE, IN_BRACKETS, IN_STRING, AFTER_BACKSLASH = range(4)


def split_text(text):
    """Split *text* into its top-level, comma-separated bracket groups.

    The text is scanned once with a small state machine so that commas,
    brackets and backslash-escaped characters inside single-quoted strings
    are never mistaken for structure.

    Args:
        text: String such as "('1','a'),('2','b')".

    Returns:
        List of the pieces found between top-level commas. Whatever was read
        after the last separator is always appended, so an empty *text*
        yields [''].
    """
    state = OUTSIDE
    read = []
    result = []
    for character in text:
        if state == OUTSIDE:
            if character == ',':
                # Top-level comma: finish the current piece, start a new one.
                result.append(''.join(read))
                read = []
            elif character == '(':
                read.append(character)
                state = IN_BRACKETS
            else:
                read.append(character)
        elif state == IN_BRACKETS:
            read.append(character)
            if character == ')':
                state = OUTSIDE
            elif character == "'":
                state = IN_STRING
        elif state == IN_STRING:
            read.append(character)
            if character == "'":
                state = IN_BRACKETS
            elif character == '\\':
                state = AFTER_BACKSLASH
        elif state == AFTER_BACKSLASH:
            # The escaped character is kept verbatim and never interpreted.
            read.append(character)
            state = IN_STRING
    result.append(''.join(read))  # The rest of string
    return result

from pyparsing import QuotedString, ZeroOrMore, Literal, Group, Suppress, Word, nums

# Grammar for one value inside a bracket group: the bare NULL keyword, a run
# of digits, or a single-quoted string with backslash as the escape character.
null_value = Literal('NULL')
number_value = Word(nums)
# unquoteResults=False is passed so the token text can be re-joined verbatim
# below (presumably keeping the surrounding quotes - see pyparsing docs).
string_value = QuotedString("'", escChar='\\', unquoteResults=False)
value = null_value | number_value | string_value

# One "(value,value,...)" group; Group gathers its tokens into one sub-result.
one_bracket = Group(
    Literal('(') + value + ZeroOrMore(Literal(',') + value) + Literal(')')
)
# The whole input: groups separated by commas, with the separators suppressed.
all_brackets = one_bracket + ZeroOrMore(Suppress(',') + one_bracket)


def split_text(text):
    """Parse *text* with the grammar above and return one string per group.

    Args:
        text: String such as "('1','a'),('2','b')".

    Returns:
        List with the concatenated tokens of each parsed bracket group.

    NOTE(review): parseString is called without parseAll=True, so trailing
    text the grammar cannot match is presumably ignored rather than
    reported - confirm this is the intended behaviour.
    """
    parse_result = all_brackets.parseString(text)
    return [''.join(a) for a in parse_result]

3条回答

网友

1楼 · 编辑于 2024-06-12 02:28:17

一种方法是使用较新的 regex 模块，该模块支持 (*SKIP)(*FAIL) 功能：

import regex as re

def split_text(text):
    """Split *text* on the commas that separate bracket groups.

    The pattern has two branches. The first consumes a complete
    single-quoted string and then discards it with (*SKIP)(*FAIL), so
    nothing inside a string can ever be used as a split point. The second
    matches a comma that directly follows a closing bracket and directly
    precedes an opening bracket.

    Backslash escapes inside a string are consumed as pairs. This keeps a
    closing quote that follows an escaped backslash from being mistaken for
    an escaped quote, which the earlier look-behind based pattern did.

    Args:
        text: String such as "('1','a'),('2','b')".

    Returns:
        List of the substrings between the separating commas.
    """
    rx = r"""'(?:\\.|[^'\\])*'(*SKIP)(*FAIL)|(?<=\)),(?=\()"""
    return re.split(rx, text)

细细地说：

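该模式由两个分支组成：

- 第一个分支匹配一个完整的单引号字符串（其中允许出现转义的引号），随后用 (*SKIP)(*FAIL) 丢弃这次匹配，因此字符串内部的逗号和括号不会被当作分隔符；
- 第二个分支 (?<=\)),(?=\() 匹配紧跟在 ")" 之后、并且后面紧接 "(" 的逗号，也就是真正的分隔符。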

这个方法在你给出的所有示例上都能成功。

网友

2楼 · 编辑于 2024-06-12 02:28:17

我写了下面这段代码，它能通过给定的测试。

# Sample inputs: comma-separated bracket groups whose quoted values may
# contain commas, brackets and backslash-escaped apostrophes.
tests = [
    "('1'),('2')",
    "(1),(2),(3)",
    "('1','a'),('2','b')",
    "('1','a',NULL),(2,'b')",
    "('\\'1','a'),('2','b')",
    "('1','a,c'),('2','b')",
    "('1','a)c'),('2','b')",
    "('1','a),(c'),('2','b')",
    "('1','a\\'),(c'),('2','b')",
]


def split_text(text):
    """Return the bracket groups found in *text*, in order.

    Args:
        text: String such as "('1','a'),('2','b')".

    Returns:
        List of group strings. A piece is emitted each time the bracket
        depth returns to zero; commas at depth zero are dropped. Text left
        over at the end without a closing bracket is not returned.
    """
    groups = []
    current = []      # characters of the group being assembled
    depth = 0         # current bracket nesting depth
    in_string = False
    escaped = False   # previous character was an unescaped backslash

    for char in text:
        if in_string:
            current.append(char)
            # Track escapes explicitly instead of peeking at the previous
            # character, so an escaped backslash before the closing quote
            # is handled and index 0 never wraps around to the last char.
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == "'":
                in_string = False
        elif char == "'":
            in_string = True
            current.append(char)
        elif char == ',':
            # Commas inside a group are kept; separators between groups
            # are dropped.
            if depth:
                current.append(char)
        else:
            if char == '(':
                depth += 1
            elif char == ')':
                depth -= 1
            current.append(char)
            if not depth:
                groups.append(''.join(current))
                current = []

    return groups


for text in tests:
    # Parenthesised single-argument form works on both Python 2 and 3.
    print(split_text(text))

输出：

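["('1')", "('2')"]
['(1)', '(2)', '(3)']
["('1','a')", "('2','b')"]
["('1','a',NULL)", "(2,'b')"]
["('\\'1','a')", "('2','b')"]
["('1','a,c')", "('2','b')"]
["('1','a)c')", "('2','b')"]
["('1','a),(c')", "('2','b')"]
["('1','a\\'),(c')", "('2','b')"]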

代码还有改进的余地，欢迎编辑。：）

网友

3楼 · 编辑于 2024-06-12 02:28:17

这是一个正则表达式，它似乎可以工作并通过所有测试。在实际数据上运行它比用Python实现的有限状态机快6倍。

# Matches one complete "(...)" group. Quoted strings are consumed as a unit
# so that commas and brackets inside them cannot end the group early.
# Backslash escapes are consumed pairwise and the plain-character class
# excludes the backslash. This makes the string sub-pattern unambiguous, so
# a closing quote that follows an escaped backslash is recognised correctly.
PATTERN = re.compile(
    r"""
        \(  # Opening bracket

            (?:
                # First item: a quoted string made of backslash escapes
                # and characters other than apostrophe and backslash
                '(?:\\.|[^'\\])*'
            |
                # or one character of a literal that is not a string
                [^')]
            )

            (?:
                # Then zero or more of: a comma followed by a quoted string
                ,'(?:\\.|[^'\\])*'
            |
                # or one more character outside any string
                [^')]
            )*

        \)  # Closing bracket
    """,
    # DOTALL lets the escape pair cover a backslash followed by a newline.
    re.VERBOSE | re.DOTALL)


def split_text(text):
    """Return every bracket group found in *text*, in order of appearance.

    Args:
        text: String such as "('1','a'),('2','b')".

    Returns:
        List of the matched group strings; text between matches (such as
        the separating commas) is not included.
    """
    return PATTERN.findall(text)

相关问题更多 >

编程相关推荐

热门问题

热门文章