import random
import string
import timeit
letters = string.ascii_lowercase
def _get_word():
return ''.join(random.choice(letters) for i in range(10))
N = 10000
CHUNK = 400
ITERATIONS = 30
# dummy dict with N entries
data = {_get_word(): _get_word() for _ in range(0, N)}
for _ in range(0,ITERATIONS):
starttime = timeit.default_timer()
sub_dicts = []
temp = {}
for idx,(k,v) in enumerate(data.items()):
temp[k] = v
if idx % CHUNK == 0:
sub_dicts.append(temp)
temp = {}
print("Option 2: The time difference is :", timeit.default_timer() - starttime)
输出(以秒为单位的增量T)
Option 2: The time difference is : 0.003928793012164533
Option 2: The time difference is : 0.006472171982750297
Option 2: The time difference is : 0.006867899966891855
Option 2: The time difference is : 0.007105670985765755
Option 2: The time difference is : 0.004680399026256055
Option 2: The time difference is : 0.0034195330226793885
Option 2: The time difference is : 0.0034254349884577096
Option 2: The time difference is : 0.003625455021392554
Option 2: The time difference is : 0.003444572037551552
Option 2: The time difference is : 0.003441892040427774
Option 2: The time difference is : 0.0038558689993806183
Option 2: The time difference is : 0.00340640899958089
Option 2: The time difference is : 0.004021176020614803
Option 2: The time difference is : 0.004761706979479641
Option 2: The time difference is : 0.0039043189608491957
Option 2: The time difference is : 0.0035160399856977165
Option 2: The time difference is : 0.003446961985900998
Option 2: The time difference is : 0.0038189300103113055
Option 2: The time difference is : 0.003589348983950913
Option 2: The time difference is : 0.0033721639774739742
Option 2: The time difference is : 0.0033731560106389225
Option 2: The time difference is : 0.0033843390410766006
Option 2: The time difference is : 0.003322889970149845
Option 2: The time difference is : 0.0034315589582547545
Option 2: The time difference is : 0.003400936024263501
Option 2: The time difference is : 0.0032910890295170248
Option 2: The time difference is : 0.003319227951578796
Option 2: The time difference is : 0.003392132988665253
Option 2: The time difference is : 0.0032848399714566767
Option 2: The time difference is : 0.003325468976981938
import random
import timeit
m = 3_000
n0 = 300
n1 = 600
trials = 10_000
def remove(parent_dict, kept_keys):
result = parent_dict.copy()
for key in parent_dict.keys():
if key not in kept_keys:
del result[key]
return result
def add_dict_comprehension(parent_dict, kept_keys):
return {k:v for k,v in parent_dict.items() if k in kept_keys}
def add_key_comprehension(parent_dict, kept_keys):
return {k:parent_dict[k] for k in kept_keys}
def add_dict_comprehension_set(parent_dict, kept_keys):
return {k:v for k,v in parent_dict.items() if k in kept_keys}
def generate_test_data(m, n0, n1):
parent_dict = {}
for key in range(m):
parent_dict[key] = 0
target_keys = random.sample(range(m), random.randint(n0, n1))
return parent_dict, target_keys
test_functions = {
'remove': (remove, lambda x: x),
'add_dict_comprehension': (add_dict_comprehension, lambda x: x),
'add_key_comprehension': (add_key_comprehension, lambda x: x),
'add_dict_comprehension_set': (add_dict_comprehension, lambda keys: set(keys)),
'remove_set': (remove, lambda keys: set(keys)),
}
results = {k:[] for k in test_functions.keys()}
for _ in range(trials):
parent_dict, target_keys = generate_test_data(m, n0, n1)
for k,(func, preprocessor) in test_functions.items():
# Not timing the preprocessing, assuming this is done on construction
processed_keys = preprocessor(target_keys)
starttime = timeit.default_timer()
func(parent_dict, processed_keys)
results[k].append(timeit.default_timer() - starttime)
def avg(lst):
return sum(lst)/len(lst)
def stddev(lst):
mean = avg(lst)
r2 = sum([(i-mean)**2 for i in lst])
bias = len(lst) - 1
return (r2/bias)**.5
result_data = {k: {'avg': avg(v), 'stddev': stddev(v)} for k,v in results.items()}
title_len = max(len(i) for i in result_data.keys())
for (name, data) in sorted(result_data.items(), key = lambda item: item[1]['avg']):
print(f"{name}: {''.join([' ']*(title_len-len(name)))} Avg. {data['avg']} [Stddev: {data['stddev']}]")
选项2更快:
输出:
选项2看起来足够快-见下文
输出(以秒为单位的增量T)
对两个答案都有一点反馈;这两种方法都不准确,因为他们选择关键点的方式
他们有效地将字典视为一个列表,如果您的数据是以这种方式设置的,您可能不需要字典(而是一个排序列表),则保留n个项目
关于选项2
我敢打赌您正在执行这样的查找:
new_dict = {k:v for k,v in parent_dict.items() if k in selections}
在这种情况下,性能随着
len(selections)
的增加而降低;某种程度上缓解这种情况的一种方法是使用集合而不是列表进行选择;有效地O(m*n)
(其中m是父目录的大小,n是选择的数量)因为n上的循环并不总是需要在完整的n上循环,所以它可能更接近
O((n**2-n)/2)
,其中(n**2-2)/2
是1和n之间所有数字的总和另一方面,选项1中删除的循环是
O(2m-n)
O(m)
用于复制dict和O(m-n)
用于删除键有一些非普通的(我想到的是lodash)解决方案是高度优化的,可能会更快,但严格地看时间复杂度,似乎在现实的关键点选择中删除应该更快
哪个选项最好可能取决于
m
&n
我刚刚经历了一个金发碧眼的时刻,但我想把我的无知留给未来的任何路人
您应该这样做:
new_dict = {k: parent_dict[k] for k in selections}
您不必为复制dict支付
O(m)
罚款,向dict添加项目是O(1)
,在选择上循环是O(n)
,因此您在O(2n)
上坐得很好,即O(n)
我认为没有比这更快的了好的,下面是代码:
正如预期的那样,关键理解是最快的,使用集合而不是列表对于选项1和;2) 集合的平均查找时间为
O(1)
,因此可以避免数组的大部分搜索开销相关问题 更多 >
编程相关推荐