如何用正则表达式匹配可能存在也可能不存在的字符串，并在匹配不存在时用占位符代替

3条回答

网友

1楼 · 编辑于 2024-06-09 18:54:14

您可以解析文件数据，对结果进行分组，然后传入 pandas 的 DataFrame：

import re
import pandas as pd
def group_results(d):
    """Split a flat sequence of ``(key, value)`` pairs into per-record groups.

    A new group is started when the current group already contains a
    ``'Name'`` entry and the incoming key is ``'Name'`` or ``'Surname'``;
    every other pair is appended to the current group.

    Args:
        d: Sliceable sequence of two-item pairs ``(key, value)``.

    Yields:
        Lists of pairs, one list per record. The first pair of the first
        group is the original ``d[0]`` object; all other pairs are new
        ``[key, value]`` lists.
    """
    # An empty input has no records; the unguarded d[0] used to raise IndexError.
    if not d:
        return
    first_key, _ = d[0]
    _group = [d[0]]
    # Track whether the current group holds a 'Name' entry instead of
    # rescanning the whole group for every incoming pair.
    has_name = first_key == 'Name'
    for a, b in d[1:]:
        if has_name and a in ('Name', 'Surname'):
            # The current record is complete: emit it and start the next one.
            yield _group
            _group = [[a, b]]
            has_name = a == 'Name'
        else:
            _group.append([a, b])
            has_name = has_name or a == 'Name'
    yield _group

headers = ["Surname", "Name", "Age", "Weight", "Height", "School", "Siblings", "Quote"]

# Read inside a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector. Newline characters are
# stripped from both ends of each line and lines that end up empty are dropped.
with open('filename.txt') as source:
    data = list(filter(None, [i.strip('\n') for i in source]))

# Raw string: '\[' and '\w' are invalid escape sequences in a normal literal.
# The first alternative captures the key right after the opening '[', the
# second captures the quoted value right before the closing ']'.
field_re = re.compile(r'(?<=^\[)\w+|".*?"(?=\]$)')

# For every line keep [key, value]; [1:-1] removes the surrounding double quotes.
parsed = [(lambda x: [x[0], x[-1][1:-1]])(field_re.findall(i)) for i in data]
_grouped = list(map(dict, group_results(parsed)))

# Fields that a record does not provide are filled with an empty string.
result = pd.DataFrame([[c.get(i, "") for i in headers] for c in _grouped], columns=headers)

输出：

  Surname    Name                ...                 Siblings                              Quote
0  Gordon   James                ...                                        I want to be a pilot
1          Monica                ...                           I am looking forward to christmas

[2 rows x 8 columns]

网友

2楼 · 编辑于 2024-06-09 18:54:14

你可以重写你的数据文件。下面的代码先将原始文件解析为类 D 的实例，然后使用 csv.DictWriter 将其写成 pandas 可以读取的常规 CSV 文件：

创建演示文件：

fn = "t.txt"

# Sample input: one bracketed `[Key: "value"]` entry per line, with a blank
# line between the two records; the second record omits several fields.
demo_text = """
[Surname: "Gordon"]
[Name: "James"]
[Age: "13"]
[Weight: "46"]
[Height: "12"]
[Quote: "I want to be a pilot"]

[Name: "Monica"]
[Weight: "33"]
[Quote: "I am looking forward to christmas"]
"""

with open(fn, "w") as handle:
    handle.write(demo_text)

中间（Intermediate）类：

class D:
    """One record built from ``key: value`` text lines.

    Every name listed in ``fields`` is present in ``data`` and defaults to
    the empty string; keys found in the input overwrite or extend it.
    Values are stored exactly as they appear after the first colon (only
    surrounding whitespace is removed), so any quote characters are kept.
    """

    fields = ["Surname","Name","Age","Weight","Height","Quote"]

    def __init__(self, textlines):
        """Parse *textlines*, each of the form ``key: value``.

        A line without a colon raises ``ValueError`` from the unpacking.
        """
        record = dict.fromkeys(D.fields, "")
        for raw in textlines:
            # Split on the first colon only, so values may contain colons.
            key, value = raw.strip().split(":", 1)
            record[key.strip()] = value.strip()
        self.data = record

    def surname(self):
        """Return the stored ``Surname`` value."""
        return self.data["Surname"]

    def name(self):
        """Return the stored ``Name`` value."""
        return self.data["Name"]

    def age(self):
        """Return the stored ``Age`` value."""
        return self.data["Age"]

    def weight(self):
        """Return the stored ``Weight`` value."""
        return self.data["Weight"]

    def height(self):
        """Return the stored ``Height`` value."""
        return self.data["Height"]

    def quote(self):
        """Return the stored ``Quote`` value."""
        return self.data["Quote"]

    def get_data(self):
        """Return the underlying field dictionary (not a copy)."""
        return self.data

解析和重写：

fn = "t.txt"

# list of all collected D-Instances
data = []
with open(fn) as f:
    # each dataset contains all lines belonging to one "person"
    dataset = []
    # Iterate the file lazily; readlines() would load every line up front.
    for line in f:
        # Drop surrounding whitespace, then the enclosing square brackets.
        clean = line.strip().strip("[]")
        if not clean:
            # Blank lines carry no data and do not delimit records here.
            continue
        # A "Surname"/"Name" line opens a new person only when the current
        # dataset already has its "Name" line; otherwise it still belongs
        # to the person being collected.
        if clean.startswith(("Surname", "Name")) and any(
            e.startswith("Name") for e in dataset
        ):
            data.append(D(dataset))
            dataset = []
        dataset.append(clean)
    # Flush the last person, which has no following line to trigger it.
    if dataset:
        data.append(D(dataset))

import csv
# Write one CSV row per collected record, using D.fields as the header.
# newline="" lets the csv module control line endings itself.
with open("other.txt", "w", newline="") as f:
    dw = csv.DictWriter(f, fieldnames=D.fields)
    dw.writeheader()
    dw.writerows(entry.get_data() for entry in data)

检查所写内容：

# Read the rewritten file back and echo it to stdout for inspection.
with open("other.txt","r") as f:
    contents = f.read()
print(contents)

输出：

Surname,Name,Age,Weight,Height,Quote
"""Gordon""","""James""","""13""","""46""","""12""","""I want to be a pilot"""
,"""Monica""",,"""33""",,"""I am looking forward to christmas"""

网友

3楼 · 编辑于 2024-06-09 18:54:14

根据 @WiktorStribiżew 的评论，您可以使用 itertools 中的 groupby 将各行分成空行组和数据行组，例如：

import re
from itertools import groupby

# Sample input: one bracketed `[Key: "value"]` entry per line. Records are
# separated by blank lines, and a record may omit some of the fields.
text = '''[Surname: "Gordon"]
[Name: "James"]
[Age: "13"]
[Weight: "46"]
[Height: "12"]
[Quote: "I want to be a pilot"]

[Name: "Monica"]
[Weight: "33"]
[Quote: "I am looking forward to christmas"]

[Name: "John"]
[Height: "33"]
[Quote: "I am looking forward to christmas"]

[Surname: "Gordon"]
[Name: "James"]
[Height: "44"]
[Quote: "I am looking forward to christmas"]'''

# One compiled pattern per supported field. Each exposes the field value
# through a named group, so match.groupdict() yields {field_name: value}.
# Raw strings are required: '\[', '\w' and '\d' are invalid escape sequences
# in ordinary string literals and trigger warnings on current Python versions.
patterns = [re.compile(r'(\[Surname: "(?P<surname>\w+?)"\])'),
            re.compile(r'(\[Name: "(?P<name>\w+?)"\])'),
            re.compile(r'(\[Age: "(?P<age>\d+?)"\])'),
            re.compile(r'\[Weight: "(?P<weight>\d+?)"\]'),
            re.compile(r'\[Height: "(?P<height>\d+?)"\]'),
            re.compile(r'\[Quote: "(?P<quote>.+?)"\]')]

# Build one dict per block of consecutive non-blank lines.
records = []
for has_content, chunk in groupby(text.splitlines(), key=lambda l: bool(l.strip())):
    # groupby alternates between runs of blank and non-blank lines;
    # only the non-blank runs describe a record.
    if not has_content:
        continue
    record = {}
    for line in chunk:
        # Take the first pattern that matches this line, if any.
        hit = next((m for m in (p.search(line) for p in patterns) if m), None)
        if hit is not None:
            record.update(hit.groupdict())
    records.append(record)

for entry in records:
    print(entry)

输出

{'weight': '46', 'quote': 'I want to be a pilot', 'age': '13', 'name': 'James', 'height': '12', 'surname': 'Gordon'}
{'weight': '33', 'quote': 'I am looking forward to christmas', 'name': 'Monica'}
{'height': '33', 'quote': 'I am looking forward to christmas', 'name': 'John'}
{'height': '44', 'surname': 'Gordon', 'quote': 'I am looking forward to christmas', 'name': 'James'}

注意：这将为每条记录创建一个字典，其中键是字段名，值是对应字段的值。此格式与您的预期输出不完全一致，但我认为它比您要求的更完整。无论如何，您都可以轻松地将此格式转换为所需的元组格式。

解释

itertools 的 groupby 函数将输入数据分成连续的空行组和记录行组，然后只需处理非空的组。对于每一行，处理很简单：依次尝试各个模式，一旦某个模式匹配成功（假设每一行只会匹配其中一个模式），就利用命名组把该字段的值更新到 record 字典中。

相关问题更多 >

编程相关推荐

热门问题

热门文章