Python defaultdict(list) 的序列化/反序列化性能
我正在写一个脚本,需要在启动时处理一个相当大的词典,里面有620,000个单词。这个词典会一个一个单词地处理,结果存储在一个叫做defaultdict(list)
的结构里,键是字母的二元组和三元组,值是包含这些字母组合的单词列表。
for word in lexicon_file:
    word = word.lower()
    for letter n-gram in word:
        lexicon[n-gram].append(word)
比如说,
> lexicon["ab"]
["abracadabra", "abbey", "abnormal"]
处理后得到的结构有25,000个键,每个键对应的列表里有1到133,000个字符串(平均500个,中位数20个)。所有的字符串都是用windows-1250
编码的。
这个处理过程花费了很多时间(虽然考虑到脚本的实际运行时间,这个时间可以忽略不计,但在测试时还是挺耗时的)。因为词典本身是不会改变的,所以我想把处理后的defaultdict(list)
序列化,这样在后续的启动中可以更快地反序列化。
我发现即使使用cPickle
,反序列化所花的时间也大约是直接处理词典的两倍,平均耗时大致如下:
> normal lexicon creation
45 seconds
> cPickle deserialization
80 seconds
我对序列化没有经验,但我本来以为反序列化应该比正常处理快,至少对于cPickle
模块来说。
我的问题是,这个结果是正常的吗?为什么会这样?有没有更快的方式来存储和加载我的结构?
1 个回答
2
想要搞清楚这种事情,最好的办法就是写很多测试,然后用timeit
来看看哪个更快。我下面做了一些测试,但你最好用自己的词典来试试,因为你的结果可能会不一样。
如果你想让时间的测量更稳定(更准确),可以把timeit
里的number
参数调高,这样测试会花更多时间。另外,要注意timeit
返回的值是总的执行时间,而不是每次运行的时间。
testing with 10 keys...
serialize flat: 2.97198390961
serialize eval: 4.60271120071
serialize defaultdict: 20.3057091236
serialize dict: 20.2011070251
serialize defaultdict new pickle: 14.5152060986
serialize dict new pickle: 14.7755970955
serialize json: 13.5039670467
serialize cjson: 4.0456969738
unserialize flat: 1.29577493668
unserialize eval: 25.6548647881
unserialize defaultdict: 10.2215960026
unserialize dict: 10.208122015
unserialize defaultdict new pickle: 5.70747089386
unserialize dict new pickle: 5.69750404358
unserialize json: 5.34811091423
unserialize cjson: 1.50241613388
testing with 100 keys...
serialize flat: 2.91076397896
serialize eval: 4.72978711128
serialize defaultdict: 21.331786871
serialize dict: 21.3218340874
serialize defaultdict new pickle: 15.7140991688
serialize dict new pickle: 15.6440980434
serialize json: 14.3557379246
serialize cjson: 5.00576901436
unserialize flat: 1.6677339077
unserialize eval: 22.9142649174
unserialize defaultdict: 10.7773029804
unserialize dict: 10.7524499893
unserialize defaultdict new pickle: 6.13370203972
unserialize dict new pickle: 6.18057107925
unserialize json: 5.92281794548
unserialize cjson: 1.91151690483
代码:
import cPickle
import json
try:
import cjson # not Python standard library
except ImportError:
cjson = False
from collections import defaultdict
# Synthetic stand-ins for the lexicon: one million numeric strings spread
# over 10 buckets (dd1) and over 100 buckets (dd2), keyed by the stringified
# remainder of the counter.
dd1 = defaultdict(list)
dd2 = defaultdict(list)
for i in xrange(1000000):
    digits = str(i)
    dd1[str(i % 10)].append(digits)
    dd2[str(i % 100)].append(digits)

# Plain-dict copies of the same data, used to compare dict against
# defaultdict in the benchmarks.
dt1 = dict(dd1)
dt2 = dict(dd2)
from timeit import timeit
def testdict(dd, dt):
    """Time several ways of writing a mapping to disk and reading it back.

    Every format is written to, and read from, a fixed file name relative to
    the current working directory. Each reader asserts that what it loaded
    equals the input, so a broken round trip fails instead of being timed.
    Each benchmark is run 10 times via ``timeit`` and the total for those 10
    runs is printed on its own line.

    Args:
        dd: the mapping used for the "defaultdict" pickle benchmarks.
        dt: the mapping used for all other formats. The flat and eval
            formats concatenate keys and values as strings, and the flat
            reader splits on whitespace, so keys and list items are
            assumed to be whitespace-free strings.
    """
    def serialize_defaultdict():
        # Default protocol 0 is ASCII; text mode is fine as long as the
        # reader uses the same mode.
        with open('defaultdict.pickle', 'w') as f:
            cPickle.dump(dd, f)

    def serialize_p2_defaultdict():
        # Protocol -1 selects the highest (binary) protocol, so the file has
        # to be opened in binary mode or newline translation corrupts it.
        with open('defaultdict.pickle2', 'wb') as f:
            cPickle.dump(dd, f, -1)

    def serialize_dict():
        with open('dict.pickle', 'w') as f:
            cPickle.dump(dt, f)

    def serialize_p2_dict():
        # Binary protocol: binary mode, see serialize_p2_defaultdict.
        with open('dict.pickle2', 'wb') as f:
            cPickle.dump(dt, f, -1)

    def serialize_json():
        with open('dict.json', 'w') as f:
            json.dump(dt, f)

    if cjson:
        def serialize_cjson():
            with open('dict.cjson', 'w') as f:
                f.write(cjson.encode(dt))

    def serialize_flat():
        # One line per key: the key followed by its values, space separated.
        with open('dict.flat', 'w') as f:
            f.write('\n'.join([' '.join([k] + v) for k, v in dt.iteritems()]))

    def serialize_eval():
        # One line per key: key, a tab, then the repr() of the value list.
        with open('dict.eval', 'w') as f:
            f.write('\n'.join([k + '\t' + repr(v) for k, v in dt.iteritems()]))

    def unserialize_defaultdict():
        with open('defaultdict.pickle') as f:
            assert cPickle.load(f) == dd

    def unserialize_p2_defaultdict():
        # Must match the binary mode used by the writer.
        with open('defaultdict.pickle2', 'rb') as f:
            assert cPickle.load(f) == dd

    def unserialize_dict():
        with open('dict.pickle') as f:
            assert cPickle.load(f) == dt

    def unserialize_p2_dict():
        # Must match the binary mode used by the writer.
        with open('dict.pickle2', 'rb') as f:
            assert cPickle.load(f) == dt

    def unserialize_json():
        with open('dict.json') as f:
            assert json.load(f) == dt

    if cjson:
        def unserialize_cjson():
            with open('dict.cjson') as f:
                assert cjson.decode(f.read()) == dt

    def unserialize_flat():
        with open('dict.flat') as f:
            dtx = {}
            for line in f:
                vals = line.split()
                dtx[vals[0]] = vals[1:]
            assert dtx == dt

    def unserialize_eval():
        with open('dict.eval') as f:
            dtx = {}
            for line in f:
                vals = line.split('\t')
                # WARNING: eval() executes whatever expression the file
                # holds. It is tolerable here only because the file was just
                # produced by serialize_eval; never point this at a file
                # from an untrusted source.
                dtx[vals[0]] = eval(vals[1])
            assert dtx == dt

    # Writers are listed (and therefore run) before readers, so every input
    # file exists by the time its reader is timed.
    writers = [
        ('serialize flat:', serialize_flat),
        ('serialize eval:', serialize_eval),
        ('serialize defaultdict:', serialize_defaultdict),
        ('serialize dict:', serialize_dict),
        ('serialize defaultdict new pickle:', serialize_p2_defaultdict),
        ('serialize dict new pickle:', serialize_p2_dict),
        ('serialize json:', serialize_json),
    ]
    readers = [
        ('unserialize flat:', unserialize_flat),
        ('unserialize eval:', unserialize_eval),
        ('unserialize defaultdict:', unserialize_defaultdict),
        ('unserialize dict:', unserialize_dict),
        ('unserialize defaultdict new pickle:', unserialize_p2_defaultdict),
        ('unserialize dict new pickle:', unserialize_p2_dict),
        ('unserialize json:', unserialize_json),
    ]
    if cjson:
        # cjson is optional; its benchmarks exist only when the import worked.
        writers.append(('serialize cjson:', serialize_cjson))
        readers.append(('unserialize cjson:', unserialize_cjson))

    for label, func in writers + readers:
        # A single pre-formatted string prints the same "label value" text
        # as the two-argument print statement did.
        print('%s %s' % (label, timeit(func, number=10)))
# Run the whole benchmark suite once per synthetic dataset, announcing each
# run with a banner line.
for banner, sample_dd, sample_dt in (
        ('testing with 10 keys...', dd1, dt1),
        ('testing with 100 keys...', dd2, dt2)):
    print(banner)
    testdict(sample_dd, sample_dt)