用特殊分隔符分割文本文件为多个部分 - python
我有一个输入文件,内容如下:
This is a text block start
This is the end

And this is another
with more than one line
and another line.
我想按某个特殊的行把文件分段读取。在这个例子中,这个特殊的行是空行,期望得到的输出([out])是:
[['This is a text block start', 'This is the end'],
['And this is another','with more than one line', 'and another line.']]
我通过这样的方法得到了想要的输出:
def per_section(it):
    """Yield the blank-line-separated sections of *it* as joined strings.

    *it* is any iterable of lines (for example an open text file).  A line
    is a separator when nothing is left after removing its line-ending
    characters.  Every other line is collected, line ending included, and
    each run of collected lines is yielded as one concatenated string.

    Leading, trailing and consecutive blank lines never produce empty
    sections.
    """
    section = []
    for line in it:
        # Strip '\r' as well so a CRLF-terminated blank line still counts.
        if line.strip('\r\n'):
            section.append(line)
        elif section:
            # Only emit when something was collected; otherwise repeated
            # blank lines would each yield an empty string.
            yield ''.join(section)
            section = []
    # Emit whatever is left when the input does not end with a blank line.
    if section:
        yield ''.join(section)
但是如果这个特殊的行是以 # 开头的,比如:
# Some comments, maybe the title of the following section
This is a text block start
This is the end
# Some other comments and also the title
And this is another
with more than one line
and another line.
我就得这样做:
def per_section(it):
    """Yield the sections of *it* that are separated by ``#`` lines.

    *it* is any iterable of lines (for example an open text file).  A line
    whose first character is ``#`` is a separator and is discarded.  Every
    other line is collected unchanged, and each run of collected lines is
    yielded as one concatenated string.

    A separator with nothing collected before it (such as a header on the
    very first line, or two headers in a row) does not produce an empty
    section.
    """
    section = []
    for line in it:
        # startswith() is safe on an empty string, unlike indexing line[0].
        if not line.startswith('#'):
            section.append(line)
        elif section:
            yield ''.join(section)
            section = []
    # Emit whatever is left after the last separator.
    if section:
        yield ''.join(section)
如果我想让 per_section() 这个函数可以接受一个分隔符参数,我可以尝试这样:
def per_section(it, delimiter='\n'):
    """Yield the sections of *it* as joined strings, split on *delimiter*.

    *it* is any iterable of lines (for example an open text file).

    *delimiter* selects which lines separate sections:

    * ``'\\n'`` (the default): a line is a separator when nothing is left
      after removing its line-ending characters, i.e. it is blank.
    * any other string, or a tuple of strings: a line is a separator when
      it starts with that prefix (or with any prefix in the tuple), e.g.
      ``'#'`` or ``('!', '$')``.

    Separator lines are discarded.  Every other line is collected
    unchanged, and each run of collected lines is yielded as one
    concatenated string.  Empty sections are never yielded.
    """
    if delimiter == '\n':
        def is_delimiter(line):
            # Strip '\r' too so a CRLF-terminated blank line still counts.
            return not line.strip('\r\n')
    else:
        def is_delimiter(line):
            # str.startswith accepts a single prefix or a tuple of them.
            return line.startswith(delimiter)

    section = []
    for line in it:
        if not is_delimiter(line):
            section.append(line)
        elif section:
            yield ''.join(section)
            section = []
    # Emit whatever is left after the last separator.
    if section:
        yield ''.join(section)
但是有没有办法让我不用把所有可能的分隔符都写死在代码里呢?
3 个回答
1
这样做怎么样呢?
from itertools import groupby
def per_section(s, delimiters=()):
    """Split the text *s* into sections, each a list of its content lines.

    The text is broken into lines with ``str.splitlines``.  A line is a
    separator when it is empty, consists only of whitespace, or starts
    with one of the prefixes in *delimiters*.  Each maximal run of
    non-separator lines is yielded as a list; separator lines are dropped.
    """
    def is_separator(line):
        # Blank and whitespace-only lines always separate sections.
        if not line or line.isspace():
            return True
        for prefix in delimiters:
            if line.startswith(prefix):
                return True
        return False

    for separator_run, lines in groupby(s.splitlines(), key=is_separator):
        if separator_run:
            continue
        yield list(lines)
if __name__ == '__main__':
    # Demo 1: no explicit delimiters -- only the blank line separates the
    # two sections.
    print(list(per_section('''This is a text block start
This is the end

And this is another
with more than one line
and another line.''')))

    # Demo 2: lines starting with '#' separate sections.  The trailing comma
    # makes this a one-element tuple; ('#') would be just the string '#'.
    print(list(per_section('''# Some comments, maybe the title of the following section
This is a text block start
This is the end
# Some other comments and also the title
And this is another
with more than one line
and another line.''', ('#',))))

    # Demo 3: several delimiter prefixes at once.
    print(list(per_section('''!! Some comments, maybe the title of the following section
This is a text block start
This is the end
$$ Some other comments and also the title
And this is another
with more than one line
and another line.''', ('!', '$'))))
输出结果:
[['This is a text block start', 'This is the end'], ['And this is another', 'with more than one line', 'and another line.']]
[['This is a text block start', 'This is the end'], ['And this is another', 'with more than one line', 'and another line.']]
[['This is a text block start', 'This is the end'], ['And this is another', 'with more than one line', 'and another line.']]
2
简单地这样做:
# Load the whole file into memory, then break it apart on the separator row.
with open('yorfileaname.txt') as source:
    data = source.read()
    chunks = data.split('==========')
    # split() returns a list; star-unpacking passes each chunk to print() as
    # a separate argument, so the pieces are printed as text rather than as
    # a list representation.
    print(*chunks)
输出结果:
content content
more content
content conclusion
content again
more of it
content conclusion
content
content
contend done
2
不如传入一个判断函数,由它来决定哪些行是分隔符?
def per_section(it, is_delimiter=lambda x: x.isspace()):
    """Group the lines of *it* into sections, yielding each as a list.

    *is_delimiter* is called with every line; a true result marks the line
    as a separator, which is dropped.  By default a line is a separator
    when ``str.isspace`` is true for it.  All other lines are stored with
    trailing whitespace removed.  A section is yielded only if it holds at
    least one line.
    """
    pending = []
    for raw in it:
        if not is_delimiter(raw):
            pending.append(raw.rstrip())
            continue
        if not pending:
            # Nothing collected since the previous separator: skip it.
            continue
        yield pending
        pending = []
    # Flush the final section when the input does not end on a separator.
    if pending:
        yield pending
用法:
# Default predicate: whitespace-only lines mark the section boundaries.
with open('/path/to/file.txt') as handle:
    sections = [*per_section(handle)]

# Custom predicate: lines starting with '#' mark the section boundaries.
with open('/path/to/file.txt.txt') as handle:
    sections = [*per_section(handle, lambda line: line.startswith('#'))]