I'm using multiprocessing to speed up processing roughly 1,000 CSV files of about 500 MB each with Pandas. I'm trying to apply a simple string regex replacement to a single column. The program works, but it doesn't seem to release memory properly: each process eventually eats up 40-80 GB of RAM, even though none of the files is larger than 10 GB. Do you know why this is happening? I've tried many ways to free the memory, but nothing has worked.
import pandas as pd
import numpy as np
import os
import multiprocessing
import gc
from ctypes import cdll, CDLL
from random import shuffle
oldc = ""
newc = ""
NUMPROC = 8
rep = None
cdll.LoadLibrary("libc.so.6")
libc = CDLL("libc.so.6")  # gives access to malloc_trim(), to push freed pages back to the OS
def main(filename, oldcol, newcol):
    global oldc
    global newc
    global rep
    oldc = oldcol
    newc = newcol
    # Build the regex -> replacement mapping from the lookup file.
    df = pd.read_csv(filename)
    names = df.to_numpy()  # was df.as_matrix(), which was removed in pandas 1.0
    del df
    rep = {}
    rep[newc] = {}
    for row in names[1:]:
        oldname = r"^" + str(row[0]) + r"( .*|$)"
        newname = str(row[1]) + r"\1"
        rep[newc][oldname] = newname
    if not os.path.exists("./standardized/"):
        print("Making dir!")
        os.makedirs("./standardized/")
    # Skip the lookup file, household files, and anything already standardized.
    files = [f for f in os.listdir('.')
             if (os.path.isfile(f) and ".csv" in f
                 and not (f == filename or "household" in str(f)
                          or os.path.exists("./standardized/" + f[:-4] + "_stnd.csv")))]
    shuffle(files)
    # Re-queue files whose standardized output is smaller than the input (likely truncated).
    allfiles = [f for f in os.listdir('.') if ".csv" in f]
    for f in allfiles:
        if os.path.exists("./standardized/" + f[:-4] + "_stnd.csv"):
            if os.path.getsize(f) > os.path.getsize("./standardized/" + f[:-4] + "_stnd.csv"):
                files.append(f)
    print(len(files))
    bundle = [(idx, f) for idx, f in enumerate(files)]
    # maxtasksperchild=1 should replace each worker after a single file.
    pool = multiprocessing.Pool(processes=NUMPROC, maxtasksperchild=1)
    r = pool.map_async(process, bundle)
    pool.close()
    pool.join()
def process(bundle):
    global oldc
    global rep
    global newc
    idx, fname = bundle
    try:
        print(idx)
        libc.malloc_trim(0)
        curfile = pd.read_csv(fname, dtype="str")
        curfile[newc] = curfile[oldc].str.lower()
        curfile.replace(to_replace=rep, regex=True, inplace=True)
        curfile.to_csv("./standardized/" + fname[:-4] + "_stnd.csv")
        del curfile
    except Exception as e:
        print("error on: " + str(fname) + " - " + str(e))
    finally:
        # Try to hand freed memory back to the OS.
        gc.collect()
        libc.malloc_trim(0)
main("lookup.csv","namefrst","stndfrst")