Pandas 使用 chunksize 调用 read_csv 时的内存问题

# NOTE(review): `pd`, `filename`, `colnames` and `users_df` are not defined in
# this snippet; they are assumed to be set up earlier by the asker.

# Rows per chunk; also the step by which the skip counter advances.
chunksize = 500000


def enrich_df(c, users_df):
    """Left-join chunk ``c`` with ``users_df`` (``user_id`` against ``userId``)."""
    return pd.merge(c, users_df, how='left', left_on='user_id', right_on='userId')


i = 0
for chunk in pd.read_csv(filename, chunksize=chunksize, names=colnames):
    if i <= 349500000:
        # Previously processed chunks: advance the counter and move on
        # without enriching or writing anything.
        i += chunksize
        print('Skipping chunk ', i)
        continue
    enriched_chunk = enrich_df(chunk, users_df)
    # Append without header/index so successive chunks form one CSV.
    enriched_chunk.to_csv('enriched.csv', mode='a', header=False, index=False)

1条回答

网友

1楼 · 发布于 2024-05-16 20:52:59

我写了一个简化的测试脚本，但无法重现你的错误。我使用的是 Python 3.6.1 和 Pandas 0.20.3。

测试脚本的内存占用恒定为 306 MB（至少在前 1 亿行是这样，脚本目前仍在运行）。

你可以试试这个脚本，看看它是否仍然会导致内存泄漏。它可以作为调试的起点。

import os
import time

import numpy as np
import pandas as pd
import matplotlib.pylab as plt

# CSV file that main() creates if it is missing and then reads repeatedly.
SOURCE = "test.csv"
# Value passed to pd.read_csv(chunksize=...) and the per-pass line increment.
CHUNCKSIZE = 500000

def main(total_lines=int(7e8)):
    """Repeatedly read SOURCE in chunks, append it to 'enriched.csv', and time it.

    If SOURCE does not exist, a CSV of 1,000,000 rows of random floats in
    columns A-D is generated first. The line counter then advances by
    CHUNCKSIZE per pass until it reaches ``total_lines``; the duration of
    every pass is printed and finally plotted against the counter.

    Args:
        total_lines: Upper bound (exclusive) for the line counter. Must be
            an int. Defaults to 700,000,000, the value that was previously
            hard-coded.
    """
    if not os.path.exists(SOURCE):
        df = pd.DataFrame(np.random.rand(1000000, 4), columns=list('ABCD'))
        df.to_csv(SOURCE)
        # Drop the generated frame so it does not count towards the memory
        # footprint observed during the read/write loop.
        del df
    print("df %s ready" %(SOURCE))

    lines = []
    timings = []
    for start in range(0, total_lines, CHUNCKSIZE):
        t = time.time()
        print(start, end=' ')
        # pd.concat drains the chunk iterator, so every pass reads all of
        # SOURCE, CHUNCKSIZE rows at a time.
        reader = pd.read_csv(SOURCE, chunksize=CHUNCKSIZE)
        chunk = pd.concat(reader, ignore_index=True)
        chunk.to_csv('enriched.csv', mode='a', header=False, index=False)
        # Measure once so the printed and the plotted duration are the same.
        elapsed = time.time() - t
        print(" in %.3f seconds" % elapsed)
        lines.append(start + CHUNCKSIZE)
        timings.append(elapsed)

    plt.figure(figsize=(10, 5))
    plt.plot(lines, timings, 'o-')
    plt.xlabel("Nr of lines")
    plt.ylabel("Time for last chunck")
    plt.show()




if __name__ == "__main__":
    # Wall-clock the complete run, including the final plot window.
    started = time.time()
    main()
    elapsed = time.time() - started
    print(" - total runtime %.3f seconds  -" % elapsed)

相关问题更多 >

编程相关推荐

热门问题

热门文章