在Python中使用mmap和re.findall搜索大文件时出现MemoryError

import re
import mmap


def file_len(fname):
    """Return the number of lines in ``fname`` by iterating over it once.

    The loop variable is only bound after the first line has been read, so
    an empty file makes the final ``return`` raise ``UnboundLocalError``.
    """
    with open(fname) as handle:
        for last_index, _line in enumerate(handle):
            pass
    return last_index + 1


def searchFile(list_txt, raw_str):
    """Find every match of the wildcard ``raw_str`` in ``list_txt``.

    ``raw_str`` is turned into a regex: it is anchored with ``^`` and each
    run of ``*`` becomes a lazy "match anything, newlines included" group.
    All matches are written to ``results.txt`` separated by newlines, and
    the number of lines in that file is printed.
    """
    # '^' matches at the start of every line because re.MULTILINE is used below.
    search = "^" + raw_str
    # Collapse each run of '*' into the lazy any-character expression.
    search_rgx = re.sub(r'\*+', r'[\\s\\S]*?', search)
    # The mmap exposes bytes, so the pattern has to be bytes as well.
    pattern = bytes(search_rgx, encoding="utf-8")

    with open(list_txt, 'r+') as source:
        data = mmap.mmap(source.fileno(), 0)
        # findall returns all matches at once as a single list of bytes objects.
        results = re.findall(pattern, data, re.MULTILINE)

    f1 = open('results.txt', 'w+b')
    # Joining builds one more bytes object holding every match again.
    results_bin = b'\n'.join(results)
    f1.write(results_bin)
    f1.close()

    match_count = file_len("results.txt")
    print("Found " + str(match_count) + " results")


searchFile("largelist.txt","ab**cd")

Traceback (most recent call last):
  File "c:\Programming\test.py", line 27, in <module>
    searchFile("largelist.txt","ab**cd")
  File "c:\Programming\test.py", line 21, in searchFile
    results_bin = b'\n'.join(results)
MemoryError

3条回答

网友

1楼 · 编辑于 2024-04-19 16:23:11

我不确定您认为用mmap打开输入文件会带来什么好处。不过，既然（根据您的注释）每个待匹配的字符串都以换行符分隔，我会采用下面的方法（注意：这是Python，但有意保留为伪代码的形式）：

# Stream the input one line at a time so that only the current line is held
# in memory; matches are written out as soon as they are found.
# Mode "x" creates the output file and fails if it already exists.
with open(input_file_path, "r") as input_file, open(output_file_path, "x") as output_file:
    for line in input_file:
        if is_match(line):
            # A line read from the file already ends with its own newline,
            # so print must not append a second one.
            print(line, end="", file=output_file)

您可以根据需要调整print函数的end参数（从文件中读取的每一行末尾通常已经带有换行符）。

这样，结果一经生成就会被写入文件，避免了在写入之前把庞大的results全部保存在内存中。此外，您不必再关心换行符，只需判断每一行是否匹配即可。

网友

2楼 · 编辑于 2024-04-19 16:23:11

这个怎么样？在这种情况下，您需要的是以字符串表示的所有行的列表。以下内容模拟了该操作，生成了字符串列表：

import io

# Sample input: one candidate string per line, standing in for the real file.
longstring = """ab12345cd
abbbcd
ab_fghfghfghcd
1abcd
agcd
bb111cd
"""

# Wrap the text in an in-memory file object to mimic reading from disk,
# then split everything that was read into one string per line.
fake_file = io.StringIO(longstring)
file_text = fake_file.read()
list_of_strings = file_text.splitlines()
list_of_strings

输出

['ab12345cd', 'abbbcd', 'ab_fghfghfghcd', '1abcd', 'agcd', 'bb111cd']

这是最重要的部分

import pandas as pd

# Put every line into a Series so the regex is applied to all of them in one call.
s = pd.Series(list_of_strings)
# str.match tests the pattern at the start of each string and yields a boolean
# mask; indexing with it keeps only the matching lines. The pattern is a raw
# string so that "\s" and "\S" reach the regex engine without escape warnings.
s[s.str.match(r'^ab[\s\S]*?cd')]

输出

0         ab12345cd
1            abbbcd
2    ab_fghfghfghcd
dtype: object

Edit2：试试这个：（我看不出你有什么理由想把它作为一个函数，但我已经这样做了，因为你在评论中就是这么做的。）

def newsearch(filename, pattern=r'^ab[\s\S]*?cd', outpath='output.txt'):
    """Write the lines of ``filename`` that match ``pattern`` to ``outpath``.

    The whole file is read into memory and split into lines, so this variant
    is only suitable for inputs that fit in RAM.

    Args:
        filename: Path of the UTF-8 text file to search, one candidate per line.
        pattern: Regular expression tested at the start of every line.
        outpath: File that receives the matching lines, one per row.
    """
    # Imported here so the function works on its own.
    import pandas as pd

    with open(filename, 'r', encoding="utf-8") as f:
        list_of_strings = f.read().splitlines()
    # Explicit object dtype keeps the .str accessor usable even for an empty file.
    s = pd.Series(list_of_strings, dtype=object)
    s = s[s.str.match(pattern)]
    # NOTE: to_csv applies CSV quoting, so a line that contains a comma or a
    # double quote is written wrapped in quotes.
    s.to_csv(outpath, header=False, index=False)

# Example call: run the search on 'list.txt'.
newsearch('list.txt')

基于块的方法

import os

def newsearch(filename, pattern=r'^ab[\s\S]*?cd', outpath='output.txt', chunksize=10**6):
    """Filter ``filename`` chunk by chunk and append the matches to ``outpath``.

    Only ``chunksize`` rows are held in memory at a time. The file is parsed
    with ``sep='|'`` so that a whole line lands in column 0; this relies on
    '|' not occurring in the data.

    Args:
        filename: Path of the text file to search, one candidate per line.
        pattern: Regular expression tested at the start of every line.
        outpath: File that receives the matching lines; replaced on each run.
        chunksize: Number of rows read per chunk.
    """
    # Imported here so the function works on its own.
    import pandas as pd

    # Every chunk is appended below, so a file left over from an earlier run
    # has to go first.
    try:
        os.remove(outpath)
    except FileNotFoundError:
        pass

    # dtype=str keeps lines made only of digits as text; otherwise pandas
    # infers a numeric column and the .str accessor raises AttributeError.
    reader = pd.read_csv(filename, sep='|', header=None, dtype=str, chunksize=chunksize)
    for chunk in reader:
        # na=False turns missing values into "no match" instead of NaN, which
        # could not be used as a boolean mask.
        matched = chunk[chunk[0].str.match(pattern, na=False)]
        matched[0].to_csv(outpath, index=False, header=False, mode='a')

# Example call: run the chunk-based search on 'list.txt'.
newsearch('list.txt')

dask方法

import dask.dataframe as dd

def newsearch(filename):
    """Filter ``filename`` with dask and write the matching lines to output.txt.

    The file is read in blocks of 25e6 bytes. It is parsed with ``sep='|'``
    so that a whole line lands in column 0; this relies on '|' not occurring
    in the data.

    Args:
        filename: Path of the text file to search, one candidate per line.
    """
    # dtype=str keeps every block's column as text, so the .str accessor is
    # available regardless of what the lines look like.
    frame = dd.read_csv(filename, sep='|', header=None, dtype=str, blocksize=25e6)
    # Raw string so "\s" and "\S" reach the regex engine unchanged;
    # na=False makes missing values count as "no match".
    matched = frame[frame[0].str.match(r'^ab[\s\S]*?cd', na=False)]
    # single_file=True asks dask for one output file rather than one per partition.
    matched[0].to_csv('output.txt', index=False, header=False, single_file=True)

# Example call: run the dask-based search on 'list.txt'.
newsearch('list.txt')

网友

3楼 · 编辑于 2024-04-19 16:23:11

您是在逐行处理，因此应当避免在内存中积累数据。在这里，常规的文件读写就可以很好地工作。mmap由虚拟内存支持，但数据在被读取时仍必须载入真实内存。findall累积的全部结果同样会占用大量内存。请尝试以下替代方法：

import re

# Buffer size for both files: 1 MiB. Any effect of the buffering would be modest.
MEG = 2**20


def searchFile(filename, raw_str):
    """Copy the lines of ``filename`` that fit the wildcard ``raw_str`` to results.txt.

    ``raw_str`` has the form ``<prefix>*<suffix>`` (one or more ``*`` in the
    middle, e.g. ``"ab**cd"``). A line is kept when, after stripping
    surrounding whitespace, it starts with the prefix and ends with the
    suffix without the two overlapping. Only the first run of ``*`` is
    honoured; anything after the second literal part is ignored.

    Args:
        filename: Path of the text file to search, one candidate per line.
        raw_str: Wildcard pattern with a literal prefix and a literal suffix.

    Raises:
        ValueError: If ``raw_str`` does not contain ``<prefix>*<suffix>``.
    """
    # Split "ab***cd" into the literal text before and after the run of '*'.
    parts = re.match(r"([^\*]+)\*+?([^\*]+)", raw_str)
    if parts is None:
        raise ValueError(
            "pattern must look like '<prefix>*<suffix>', got %r" % (raw_str,)
        )
    startswith, endswith = parts.groups()
    # Prefix and suffix must not share characters: "ab*bd" must not accept "abd".
    min_len = len(startswith) + len(endswith)

    with open(filename, buffering=MEG) as in_f, open("results.txt", "w", buffering=MEG) as out_f:
        for line in in_f:
            stripped = line.strip()
            if (
                len(stripped) >= min_len
                and stripped.startswith(startswith)
                and stripped.endswith(endswith)
            ):
                # Write the original line so its newline is preserved.
                out_f.write(line)

# Self-check: write a small sample file, run the search on it and compare the
# produced results.txt with the expected content.

test_txt = """ab12345cd
abbbcd
ab_fghfghfghcd
1abcd
agcd
bb111cd
"""

# The lines that start with "ab" and end with "cd".
want = """ab12345cd
abbbcd
ab_fghfghfghcd
"""

# Close the file before searching so the sample is fully flushed to disk.
with open("test.txt", "w") as sample:
    sample.write(test_txt)

searchFile("test.txt", "ab**cd")

with open("results.txt") as found:
    result = found.read()
print(result == want)

相关问题更多 >

编程相关推荐

热门问题

热门文章