import tokenize
from collections import namedtuple
# Token record: like tokenize.TokenInfo, but carrying the *exact* token type
# and absolute stream offsets (startpos/endpos) alongside the usual
# (line, column) pairs.
MyToken = namedtuple('MyToken', 'type string startpos endpos start end')
def my_tokenize(infile):
    '''Yield MyToken records for the Python source read from *infile*.

    *infile* is typically an io.IOBase object opened in binary mode; it
    must provide working `tell` and `readline` methods.
    '''
    # line_starts[n] is the stream position where line n begins.
    # tokenize numbers lines from 1 and columns from 0, so index 0 is a
    # placeholder.
    line_starts = [0]
    def record_and_read():
        # Capture the stream position of the line about to be read, then
        # hand the line itself to the tokenizer.
        line_starts.append(infile.tell())
        return infile.readline()
    # Re-emit each token with exact_type substituted for type, and with its
    # (line, column) boundaries converted to absolute stream positions.
    for tok in tokenize.tokenize(record_and_read):
        (srow, scol), (erow, ecol) = tok.start, tok.end
        yield MyToken(tok.exact_type, tok.string,
                      line_starts[srow] + scol,
                      line_starts[erow] + ecol,
                      tok.start, tok.end)
# Adapted from tokenize.tokenize.main(). Errors are mine.
def main():
    """Tokenize each file named on the command line (stdin if none) and
    print one line per token: stream-offset range, line/column range,
    token name, and the token text.
    """
    import sys
    from token import tok_name
    def print_tokens(gen):
        # Render each token's byte-offset span and (line, col) span in
        # fixed-width columns so the output lines up.
        for t in gen:
            rangepos = f'{t.startpos}-{t.endpos}'
            # Named 'linecol' (not 'range') to avoid shadowing the builtin.
            linecol = f'{t.start[0]},{t.start[1]}-{t.end[0]},{t.end[1]}'
            print(f'{rangepos:<10} {linecol:<20} '
                  f'{tok_name[t.type]:<15}{t.string!r}')
    if len(sys.argv) <= 1:
        # No filenames: read standard input. Use the binary buffer because
        # tokenize.tokenize expects a bytes-returning readline.
        print_tokens(my_tokenize(sys.stdin.buffer))
    else:
        for filename in sys.argv[1:]:
            with open(filename, 'rb') as infile:
                print_tokens(my_tokenize(infile))
if __name__ == '__main__':
    main()
原则上,要将行号/列号对转换为文档中的字节偏移量,只需列出每行的起始字节偏移量。因此,一种简单的方法是在读取文件时积累这些信息。这相当简单,因为您可以给 tokenize 提供您自己的、返回输入行的函数。这样,您就可以收集从行号到文件位置的映射,然后把 tokenize 包装到一个函数中,由它使用该映射添加开始和结束索引。在下面的示例中,我使用 file.tell 获取当前文件位置。但是,如果输入不是可查找(seekable)的文件,这将不起作用;在这种情况下,您需要想出替代方案,例如跟踪已返回的字节数[注1]。根据您需要这些索引做什么,这可能重要,也可能不重要:例如,如果您只需要唯一的数字,那么保持每行字符串长度的累计总和就足够了。

注释:
[注1] readline 返回的是字符串而不是 bytes 对象,因此它的长度以字符而非字节度量;此外,在行尾不是单个字符的平台(如 Windows)上,行尾被替换为 \n 意味着读取的字符数与文件中的字节数不对应。