将大型文本文件（约50GB）拆分为多个文件

13 投票

6 回答

32512 浏览

提问于 2025-04-18 00:36

我想把一个大约50GB的文本文件分成多个小文件。文件里的数据是这样的-[x= 0到9之间的任意整数]

xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
...............
...............

这个文件里可能有几十亿行，我想每个小文件里写大约3000万到4000万行。我想的步骤是-

首先，我得打开这个文件
然后用readline()方法一行一行地读取文件，同时写入到一个新文件里
一旦达到最大行数，就创建另一个文件，继续写入。

我在想，怎么把这些步骤组合起来，既节省内存又能更快。我在StackOverflow上看到一些例子，但没有一个完全符合我的需求。如果有人能帮帮我，我会非常感激。

大数据处理 性能提升 行读取 文本文件处理 内存优化 文件写入 数据分割 文件拆分

6 个回答

我写了一段 Python 3 代码，平时用它来分割大小在 MB（兆字节）级别的文件。

不过，我还没有尝试过分割大小在GB（千兆字节）级别的文件。

TextFileSplitter.py

import traceback

# Ask for the name of the file to split.
fileToRead = input("Enter file name : ")

# Maximum number of lines written to a single output file.
fileLineCount = 2000
# Lines written so far to the current output file.
lineCount = 0
# Number used for the next output file name ("1.txt", "2.txt", ...).
fileCount = 1

try:
    print('Start splitting...')
    # `with` closes the reader even when something fails, and avoids touching
    # an unbound name when open() itself raises.
    with open(fileToRead) as fileReader:
        fileWriter = None
        try:
            # Iterating the file object streams it line by line through the
            # buffered reader, so memory use stays flat for very large files.
            for line in fileReader:
                if lineCount == 0:
                    # NOTE: append mode is kept from the original script, so
                    # re-running adds to existing N.txt files instead of
                    # replacing them.
                    fileWriter = open(str(fileCount) + ".txt", "a")
                    # Increment file count, used for the next file name.
                    fileCount += 1
                # `line` already carries its line terminator; writing it
                # unchanged keeps the output identical to the input.
                fileWriter.write(line)
                lineCount += 1
                if lineCount == fileLineCount:
                    lineCount = 0
                    fileWriter.close()
                    fileWriter = None
        finally:
            # Close a partially filled last file (or one left open by an error).
            if fileWriter is not None:
                fileWriter.close()

except Exception:
    # Top-level boundary of the script: report the full traceback.
    traceback.print_exc()

输出的文件看起来会像这样，每个文件包含2000行（也就是fileLineCount）内容，都会在同一个目录下创建：

1.txt
2.txt
3.txt
.
.
.
.
n.txt

回答于 2025-04-18 由 Python大师

分享举报

这个类可能会解决你的问题。我在Linux和Windows操作系统上都测试过，效果都很好。还有，我也测试了不同大小的二进制文件和文本文件，每次结果都很不错。希望你喜欢 :)

import os
import math

class FileSpliter:
    """Split a file into numbered fixed-size parts and rebuild it from them.

    Parts are written next to each other as ``OutFile + "0"``,
    ``OutFile + "1"``, ... and ``Rebuild`` concatenates them into ``OutFile``,
    deleting each part once it has been copied.

    Typical use: ``Prepare()`` -> ``Split()`` -> ``Rebuild()``; every step
    returns ``True`` on success and ``False`` (after printing an error) on
    failure.
    """

    # If file type is text then CHUNK_SIZE is count of chars
    # If file type is binary then CHUNK_SIZE is count of bytes
    def __init__(self, InputFile, FileType="b", CHUNK_SIZE=524288, OutFile="outFile"):
        """Store the configuration; no file is touched until ``Prepare``.

        Args:
            InputFile: Path of the file to split.
            FileType: ``"b"`` for binary or ``"t"`` for text.
            CHUNK_SIZE: Size of each part, in bytes (binary) or chars (text).
            OutFile: Prefix of the part files and name of the rebuilt file.
        """
        self.CHUNK_SIZE = CHUNK_SIZE    # byte or char
        self.InputFile = InputFile
        self.FileType = FileType        # b: binary,  t: text
        self.OutFile = OutFile
        self.FileSize = 0               # size of InputFile in bytes, set by Prepare
        self.Parts = None               # number of parts; None until Prepare succeeds
        self.CurrentPartNo = 0          # index of the part being processed
        self.Progress = 0.0             # fraction done, between 0 and 1

    def Prepare(self):
        """Validate the input and estimate how many parts will be produced.

        Returns:
            ``True`` when the splitter is ready, ``False`` when the input file
            is missing/empty or CHUNK_SIZE is not positive.
        """
        if not (os.path.isfile(self.InputFile) and os.path.getsize(self.InputFile) > 0):
            print("ERROR: The file is not exists or empty!")
            return False
        # A zero chunk size would divide by zero below and a non-positive one
        # cannot describe a part size, so reject it up front.
        if self.CHUNK_SIZE < 1:
            print("ERROR: CHUNK_SIZE must be a positive number!")
            return False
        self.FileSize = os.path.getsize(self.InputFile)
        if self.CHUNK_SIZE >= self.FileSize:
            self.Parts = 1
        else:
            # Based on the size in bytes; for text files with multi-byte
            # characters this is an upper bound that Split() corrects.
            self.Parts = math.ceil(self.FileSize / self.CHUNK_SIZE)
        return True

    def Split(self):
        """Write the input file out as consecutive numbered part files.

        Returns:
            ``True`` on success, ``False`` when ``Prepare`` has not succeeded
            or ``FileType`` is not ``"b"``/``"t"``.
        """
        if self.FileSize == 0 or self.Parts is None:
            print("ERROR: File is not prepared for split!")
            return False
        # Validate before opening: an unknown type would otherwise fail inside
        # open() or leave the read buffer undefined.
        if self.FileType not in ("b", "t"):
            print("ERROR: File type error!")
            return False
        # Always number parts from 0 so Rebuild() finds them, even when
        # Split() is called again on the same instance.
        self.CurrentPartNo = 0
        with open(self.InputFile, "r" + self.FileType) as f:
            while True:
                buf = f.read(self.CHUNK_SIZE)
                if not buf:
                    # we've read the entire file in, so we're done.
                    break
                of = self.OutFile + str(self.CurrentPartNo)
                with open(of, "w" + self.FileType) as outFile:
                    outFile.write(buf)
                self.CurrentPartNo += 1
                self.ProgressBar()
        if self.CurrentPartNo != self.Parts:
            # The estimate was off (text chunks are counted in chars, the
            # estimate in bytes): record the real count so Rebuild() does not
            # look for parts that were never written, and finish the bar.
            self.Parts = self.CurrentPartNo
            self.ProgressBar()
        return True

    def Rebuild(self):
        """Concatenate the part files into ``OutFile`` and delete them.

        Returns:
            ``True`` on success, ``False`` when the part count is unknown,
            ``FileType`` is invalid, or a part file is missing or empty.
        """
        self.CurrentPartNo = 0
        if self.Parts is None:
            return False
        if self.FileType not in ("b", "t"):
            print("ERROR: File type error!")
            return False
        with open(self.OutFile, "w" + self.FileType) as f:
            while self.CurrentPartNo < self.Parts:
                If = self.OutFile + str(self.CurrentPartNo)
                if not (os.path.isfile(If) and os.path.getsize(If) > 0):
                    print("ERROR: The file [" + If + "] is not exists or empty!")
                    return False
                # `with` releases the part's handle on every exit path, which
                # is also required before os.remove() can succeed on Windows.
                with open(If, "r" + self.FileType) as InputFile:
                    buf = InputFile.read()
                if not buf:
                    # we've read the entire file in, so we're done.
                    break
                f.write(buf)
                os.remove(If)
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def ProgressBar(self, BarLength=20, ProgressIcon="#", BarIcon="-"):
        """Redraw a one-line text progress bar for ``CurrentPartNo / Parts``.

        Args:
            BarLength: Width of the bar in characters (values < 1 fall back to 20).
            ProgressIcon: Character used for the completed portion.
            BarIcon: Character used for the remaining portion.
        """
        try:
            # You can't have a progress bar with zero or negative length.
            if BarLength < 1:
                BarLength = 20
            # Use status variable for going to the next line after progress completion.
            Status = ""
            # Calculating progress between 0 and 1 for percentage.
            self.Progress = float(self.CurrentPartNo) / float(self.Parts)
            # Clamp and finish the line once the work is complete.
            if self.Progress >= 1.:
                self.Progress = 1
                Status = "\r\n"    # Going to the next line
            # Calculating how many places should be filled
            Block = int(round(BarLength * self.Progress))
            # Show this
            Bar = "\r[{}] {:.0f}% {}".format(ProgressIcon * Block + BarIcon * (BarLength - Block), round(self.Progress * 100, 0), Status)
            print(Bar, end="")
        except Exception:
            # Best effort: a display problem (e.g. Parts still None or 0) must
            # not abort the split/rebuild itself.
            print("\rERROR")

def main():
    """Demo driver: split the file "inFile" in binary mode, then rebuild it."""
    splitter = FileSpliter(InputFile="inFile", FileType="b")  # CHUNK_SIZE left at its default
    if not splitter.Prepare():
        # Prepare() has already printed the reason.
        return

    # Phase 1: split into numbered parts.
    print("Spliting ...")
    if splitter.Split() is True:
        print("The file splited successfully.")
    print()

    # Phase 2: reassemble the parts.
    print("Rebuilding ...")
    if splitter.Rebuild() is True:
        print("The file rebuilded successfully.")


if __name__ == "__main__":
    main()

回答于 2025-04-18 由 Python大师

分享举报

如果你有Unix工具的split可以用，而且你的任务就是把文件分开，那就可以用这个工具。不过，这里有一个纯Python的解决方案：

import contextlib

file_large = 'large_file.txt'
l = 30*10**6  # lines per split file
# The ExitStack holds only the output file currently being written; it is
# emptied before each new part so at most one output handle is open at a time.
with open(file_large) as fd_in, contextlib.ExitStack() as out_stack:
    for i, line in enumerate(fd_in):
        if i % l == 0:
            # Start of a new part: close the previous one (no-op the first
            # time) and open '<file_large>.<part index>'.
            out_stack.close()
            file_split = '{}.{}'.format(file_large, i//l)
            fd_out = out_stack.enter_context(open(file_split, 'w'))
        # Lines read from a file keep their trailing newline, so write them
        # unchanged; appending another '\n' would insert blank lines.
        fd_out.write(line)

如果每一行的长度都是固定的（都是4个三位数），并且有多个处理器可用，那么每一行的字节偏移量都可以直接算出来：可以用文件的 seek 定位功能，让多个进程各自从不同的位置开始，并行地处理文件的不同区段。

回答于 2025-04-18 由 Python大师

分享举报

在编程中，有时候我们会遇到一些问题，特别是在使用某些工具或库的时候。比如，有人可能会在使用某个库时，发现它的某些功能没有按照预期工作。这种情况可能会让人感到困惑，不知道该如何解决。

通常，解决这类问题的第一步是查看相关的文档或说明。文档里会详细介绍这个库的功能、用法以及可能出现的问题。如果文档没有提供足够的信息，接下来可以尝试在网上搜索一下，看看其他人是否遇到过类似的问题，并找到了解决方案。

如果还是找不到答案，可以考虑在一些技术论坛或者社区发帖求助，比如StackOverflow。在发帖时，最好能详细描述你遇到的问题，包括你使用的代码、错误信息以及你尝试过的解决方法。这样，其他人才能更好地理解你的问题，并提供帮助。

总之，遇到问题时，不要着急，先查文档、搜索一下，最后再寻求帮助。这样可以提高解决问题的效率。

from itertools import chain, islice

def chunks(iterable, n):
    """Lazily split *iterable* into consecutive chunks of at most *n* items.

    chunks('ABCDE', 2) => AB CD E

    Each yielded chunk is itself a lazy iterator over the shared underlying
    iterator, so only one item is held in memory at a time. The caller must
    fully consume a chunk before requesting the next one; otherwise the
    leftover items spill into the following chunk.

    Args:
        iterable: Any iterable (e.g. an open text file yielding lines).
        n: Maximum number of items per chunk.

    Yields:
        An iterator over the next (up to) *n* items.
    """
    iterator = iter(iterable)
    # Driving the generator with a for loop ends it cleanly on exhaustion.
    # A bare next() here would leak StopIteration, which Python 3.7+
    # (PEP 479) turns into RuntimeError.
    for first in iterator:
        # Hold one item in memory and chain it to a lazy slice of the rest
        # of the chunk.
        yield chain([first], islice(iterator, n - 1))

# Number of lines written to each output file.
l = 30*10**6
file_large = 'large_file.txt'
with open(file_large) as source:
    part_no = 0
    # Each batch is an iterator over up to `l` lines of the source file.
    for batch in chunks(source, l):
        # Parts are named '<file_large>.<part number>', counting from 0.
        with open(file_large + '.' + str(part_no), 'w') as sink:
            sink.writelines(batch)
        part_no += 1

回答于 2025-04-18 由 Python大师

分享举报

这个有效的解决方案使用了在命令行中可以找到的 split 命令。因为作者已经接受了非Python解决方案的可能性，所以请不要给这个答案差评。

首先，我创建了一个包含1000M条目的测试文件（大小为15 GB），使用了：

awk 'BEGIN{for (i = 0; i < 1000000000; i++) {print "123.123.123.123"} }' > t.txt

然后我使用了 split 命令：

split --lines=30000000 --numeric-suffixes --suffix-length=2 t.txt t

这个过程花了5分钟，生成了一组34个小文件，文件名为 t00 到 t33。其中33个文件的大小都是458 MB，最后一个 t33 的大小是153 MB。

回答于 2025-04-18 由 Python大师

分享举报

将大型文本文件（约50GB）拆分为多个文件

6 个回答

撰写回答