从大型结构化文件中提取信息

import pprint
import re


class Despacho(object):
    """
    One record parsed line by line.

    Every line handed to ``read`` is tried against each pattern in
    ``regexp``; when a pattern matches, its handler stores the captured
    groups as attributes on the instance.
    """

    # Maps a compiled pattern to a callable that, given the instance,
    # returns the bound handler method for that pattern.
    regexp = {
        re.compile(r'No.([\d]{9}) ([\d]{2}/[\d]{2}/[\d]{4}) (.*)'): lambda self: self._processo,
        re.compile(r'Tit.(.*)'): lambda self: self._titular,
        re.compile(r'Procurador: (.*)'): lambda self: self._procurador,
        re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'): lambda self: self._documento,
        re.compile(r'Apres.: (.*) ; Nat.: (.*)'): lambda self: self._apresentacao,
        re.compile(r'Marca: (.*)'): lambda self: self._marca,
        re.compile(r'Clas.Prod/Serv: (.*)'): lambda self: self._classe,
        re.compile(r'\*(.*)'): lambda self: self._complemento,
    }

    def __init__(self):
        """
        'complemento' is the only field that can be multiple in a single
        registry, so it starts out as an empty list.
        """
        self.complemento = []

    def _processo(self, matches):
        self.processo, self.data, self.despacho = matches.groups()

    def _titular(self, matches):
        self.titular = matches.group(1)

    def _procurador(self, matches):
        self.procurador = matches.group(1)

    def _documento(self, matches):
        self.documento = matches.group(1)

    def _apresentacao(self, matches):
        self.apresentacao, self.natureza = matches.groups()

    def _marca(self, matches):
        self.marca = matches.group(1)

    def _classe(self, matches):
        self.classe = matches.group(1)

    def _complemento(self, matches):
        self.complemento.append(matches.group(1))

    def read(self, line):
        """Apply every known pattern to *line* and store whatever matches."""
        for pattern, handler_for in Despacho.regexp.items():
            m = pattern.match(line)
            if m:
                handler_for(self)(m)


def process(rpi):
    """
    Read data and yield one ``Despacho`` per group of lines.

    A group starts at a line beginning with 'No.' and ends at the next
    blank line. Lines outside any group are ignored. A group still open
    when the input is exhausted is yielded as well.

    rpi -- any iterable of text lines (an open file works).
    """
    group = False
    d = None

    for line in rpi:
        if line.startswith('No.'):
            group = True
            d = Despacho()

        if group and not line.strip():  # empty line - end of block
            yield d
            group = False

        # Only feed lines that belong to an open group: this avoids using
        # ``d`` before the first 'No.' line and stops stray lines from
        # changing a record that has already been yielded.
        if group:
            d.read(line)

    # Input without a trailing blank line: do not lose the last record.
    if group:
        yield d


if __name__ == '__main__':
    # ``with`` guarantees the file is closed once processing is done.
    with open('rm1972.txt') as arquivo:  # file to process
        for desp in process(arquivo):
            pprint.pprint(desp.__dict__)
            print('--------------')

3条回答

网友

1楼 · 编辑于 2024-04-25 16:37:42

如果你有具体的疑问，别人会更容易帮到你。性能在很大程度上取决于所使用的特定正则表达式引擎的效率。一个文件10万行听起来并不算大，但这还是完全取决于你的环境。

我在 .NET 开发中使用 Expresso 来测试表达式的准确性和性能。用谷歌搜索了一下，找到了 Kodos，它是一个带图形界面的 Python 正则表达式编写工具。

网友

2楼 · 编辑于 2024-04-25 16:37:42

整体看起来不错，但你为什么要写：

rpi = (line for line in rpi)

不需要这个中间步骤，你就已经可以直接迭代文件对象了。

网友

3楼 · 编辑于 2024-04-25 16:37:42

很不错。下面是一些建议，如果你喜欢，请告诉我：

import re
import pprint
import sys

class Despacho(object):
    """
    Class to parse each line, applying the regexp and storing the results
    for future use.

    Each line given to ``read`` is matched against every pattern in
    ``regexp``; the captured groups are stored on the instance under the
    attribute names that key the matching pattern.
    """
    # Maps a tuple of attribute names to the pattern whose capture groups
    # fill them, in order (first name <- group 1, and so on).
    regexp = {
        ('processo', 
         'data', 
         'despacho'): re.compile(r'No.([\d]{9})  ([\d]{2}/[\d]{2}/[\d]{4})  (.*)'),
        ('titular',): re.compile(r'Tit.(.*)'),
        ('procurador',): re.compile(r'Procurador: (.*)'),
        ('documento',): re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'),
        ('apresentacao',
         'natureza'): re.compile(r'Apres.: (.*) ; Nat.: (.*)'),
        ('marca',): re.compile(r'Marca: (.*)'),
        ('classe',): re.compile(r'Clas.Prod/Serv: (.*)'),
        ('complemento',): re.compile(r'\*(.*)'),
    }

    def __init__(self):
        """
        'complemento' is the only field that can be multiple in a single
        registry, so it starts out as an empty list.
        """
        self.complemento = []

    def read(self, line):
        """
        Match *line* against every pattern and store the captured groups.

        'complemento' values are appended to the list; every other
        attribute is overwritten with the newly captured text.
        """
        # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
        for attrs, pattern in Despacho.regexp.items():
            m = pattern.match(line)
            if not m:
                continue
            # Each pattern has exactly one capture group per attribute name.
            for attr, value in zip(attrs, m.groups()):
                if attr == 'complemento':
                    # special case: may occur several times per record
                    self.complemento.append(value)
                else:
                    setattr(self, attr, value)

    def __repr__(self):
        """
        Return a pretty-printed dict of every known attribute.

        Attributes that were never set by ``read`` are shown as None.
        """
        d = {}
        for attrs in self.regexp:
            for attr in attrs:
                d[attr] = getattr(self, attr, None)
        return pprint.pformat(d)

def process(rpi):
    """
    Read data and yield one record object per group of lines.

    A group starts at a line beginning with 'No.' and ends at the next
    blank line. Lines outside any group are ignored. A group still open
    when the input is exhausted is yielded as well.

    rpi -- any iterable of text lines (an open file works).
    """
    group = False
    d = None

    for line in rpi:
        if line.startswith('No.'):
            group = True
            d = Despacho()

        if group and not line.strip():  # empty line - end of block
            yield d
            group = False

        # Only feed lines that belong to an open group: this avoids using
        # ``d`` before the first 'No.' line and stops stray lines from
        # changing a record that has already been yielded.
        if group:
            d.read(line)

    # Input without a trailing blank line: do not lose the last record.
    if group:
        yield d

def main():
    """
    Parse 'rm1972.txt' and print every record followed by a separator.

    Returns 0, which the entry guard below uses as the process exit status.
    """
    # ``with`` guarantees the file is closed once processing is done.
    with open('rm1972.txt') as arquivo:  # file to process
        for desp in process(arquivo):
            # Function-call form prints a single argument identically on
            # Python 2 and Python 3.
            print(desp)
            print('-' * 20)
    return 0

if __name__ == '__main__':
    sys.exit(main())

相关问题更多 >

编程相关推荐

热门问题

热门文章