将 .apply() 改写为列表推导式：把一个 DataFrame 中的列表列与另一个 DataFrame 中的值进行比较

# Lookup table: each row of column 'B' holds a list of candidate prefix strings.
d1 = pd.DataFrame({
    'A': ['A', 'B', 'C', 'D'],
    'B': [
        ['84'],
        ['8420', '8421', '8422', '8423', '8424', '8425', '8426'],
        ['847', '8475'],
        ['8470', '8471'],
    ],
})
# Printed d1:
#    A                                           B
# 0  A                                        [84]
# 1  B  [8420, 8421, 8422, 8423, 8424, 8425, 8426]
# 2  C                                 [847, 8475]
# 3  D                                [8470, 8471]

# Values to be matched against the prefixes listed in d1['B'].
d2 = pd.DataFrame({
    'C': [8420513, 8421513, 8426513, 8427513, 8470513, 8470000, 8475000],
})
# Printed d2:
#          C
# 0  8420513
# 1  8421513
# 2  8426513
# 3  8427513
# 4  8470513
# 5  8470000
# 6  8475000

from tqdm import tqdm, tqdm_notebook

# Called before progress_apply is used below; presumably this is what adds
# the progress_* methods to pandas objects (confirm against the tqdm docs).
tqdm_notebook().pandas()

d1 = pd.DataFrame({
    'A': ['A', 'B', 'C', 'D'],
    'B': [
        ['84'],
        ['8420', '8421', '8422', '8423', '8424', '8425', '8426'],
        ['847', '8475'],
        ['8470', '8471'],
    ],
})
d2 = pd.DataFrame({
    'C': [8420513, 8421513, 8426513, 8427513, 8470513, 8470000, 8475000],
})
d2['C'] = d2['C'].astype(str)

# Flatten the nested prefix lists once instead of re-walking d1['B'] for
# every row of d2.
prefixes = [z for y in d1['B'] for z in y]

# All prefixes that match a given value are prefixes of one another, so the
# lexicographic maximum is also the longest match.  default=None keeps a
# value with no matching prefix from raising ValueError in max().
d2['B'] = d2['C'].progress_apply(
    lambda x: max((z for z in prefixes if x.startswith(z)), default=None)
)
d2

3条回答

网友
1楼 · 编辑于 2024-04-19 22:22:57

让我们尝试使用explode和带有extract的正则表达式：
import re

# Flatten the per-row prefix lists of d1['B'] into a single Series.
d1e = d1['B'].explode()

# Reverse lexicographic order places a longer prefix ahead of any shorter
# prefix it extends, so the alternation tries the longest candidate first.
# The leading '^' is required: Series.str.extract takes the first match found
# anywhere in the string, whereas the intent is a starts-with comparison.
# re.escape keeps prefixes containing regex metacharacters literal.
regstr = '^(' + '|'.join(re.escape(p) for p in sorted(d1e, reverse=True)) + ')'

# expand=False returns a Series for the single capture group; values with no
# matching prefix become NaN.
d2['B'] = d2['C'].astype('str').str.extract(regstr, expand=False)
输出：
C B 0 8420513 8420 1 8421513 8421 2 8426513 8426 3 8427513 84 4 8470513 8470 5 8470000 8470 6 8475000 8475
由于 .str 访问器比列表推导式慢，也可以改用列表推导式：
import re

# d1e must already hold the exploded prefixes, i.e. d1['B'].explode().
# Reverse lexicographic order makes the alternation try longer prefixes first.
regstr = '|'.join(sorted(d1e)[::-1])
pat = re.compile(regstr)  # compile once, outside the per-value loop

# re.match anchors at the start of the string.  A value with no matching
# prefix yields None instead of raising AttributeError on `.group()`.
d2['B'] = [
    m.group() if m else None
    for m in map(pat.match, d2['C'].astype('str'))
]
计时：
import re
from timeit import timeit

import pandas as pd

# Lookup table: each row of column 'B' holds a list of candidate prefixes.
d1 = pd.DataFrame({
    'A': ['A', 'B', 'C', 'D'],
    'B': [
        ['84'],
        ['8420', '8421', '8422', '8423', '8424', '8425', '8426'],
        ['847', '8475'],
        ['8470', '8471'],
    ],
})
# Values to match, stored as strings so prefix tests work.
d2 = pd.DataFrame({
    'C': [8420513, 8421513, 8426513, 8427513, 8470513, 8470000, 8475000],
})
d2['C'] = d2['C'].astype(str)


def orig(d):
    """Baseline: two ``apply`` passes (collect matching prefixes, then max).

    Writes the result to ``d['B']`` in place and returns ``d``.
    Raises ValueError if some value in ``d['C']`` matches no prefix.
    """
    d['B'] = d['C'].apply(
        lambda x: [z for y in d1['B'] for z in y if x.startswith(z)]
    )
    d['B'] = d['B'].apply(max)
    return d


def comtorecords(d):
    """Baseline: list comprehension over ``d.to_records()``.

    ``row[1]`` is the first data column (``row[0]`` is the index), so 'C'
    must be the first column of ``d``.  Writes ``d['B']`` and returns ``d``.
    """
    d['B'] = [
        max([z for y in d1.B for z in y if str(row[1]).startswith(z)])
        for row in d.to_records()
    ]
    return d


def regxstracc(d):
    """Regex alternation applied through the ``.str.extract`` accessor.

    Writes ``d['B']`` in place (NaN where nothing matches) and returns ``d``.
    """
    d1e = d1['B'].explode()
    # Reverse lexicographic order tries longer prefixes first.  '^' anchors
    # the pattern: str.extract would otherwise accept a match anywhere in the
    # string, which is not a starts-with test.
    regstr = '^(' + '|'.join(sorted(d1e)[::-1]) + ')'
    d['B'] = d['C'].astype('str').str.extract(regstr, expand=False)
    return d


def regxcompre(d):
    """Regex alternation applied in a list comprehension with ``re.match``.

    Writes ``d['B']`` in place (None where nothing matches) and returns ``d``.
    """
    # Build the prefix list locally; the name ``d1e`` is otherwise only a
    # local of regxstracc and would be undefined here.
    d1e = d1['B'].explode()
    pat = re.compile('|'.join(sorted(d1e)[::-1]))
    d['B'] = [
        m.group() if m else None
        for m in map(pat.match, d['C'].astype('str'))
    ]
    return d


if __name__ == '__main__':
    # Timing table: one row per replication factor, one column per function.
    res = pd.DataFrame(
        index=[10, 30, 100, 300, 1000, 3000, 10000, 30000],
        columns='orig comtorecords regxstracc regxcompre'.split(),
        dtype=float,
    )

    for i in res.index:
        d = pd.concat([d2] * i)
        for j in res.columns:
            stmt = '{}(d)'.format(j)
            setp = 'from __main__ import d, {}'.format(j)
            print(stmt, d.shape)
            res.at[i, j] = timeit(stmt, setp, number=100)

    # res.groupby(res.columns.str[4:-1], axis=1).plot(loglog=True);
    res.plot(loglog=True)
输出：

网友
2楼 · 编辑于 2024-04-19 22:22:57

与 Scott 提出的两种方案相比，我找到了一个更快的解决方案：
def vect(d):
    """Assign to ``d['B']`` the prefix from ``d1.B`` that starts each ``d.C``.

    The prefixes are joined into one pre-compiled regex alternation, which is
    applied to every element of ``d.C`` through ``np.vectorize``.  Elements
    with no matching prefix get an empty string.

    Parameters
    ----------
    d : DataFrame with a string column ``C``; modified in place.

    Returns
    -------
    The same DataFrame ``d``, with column ``B`` added or overwritten.
    """
    d1e = d1.B.explode()
    # Reverse lexicographic order makes the alternation try a longer prefix
    # before any shorter prefix it extends.
    pat = re.compile('|'.join(sorted(d1e)[::-1]))

    def extr(txt):
        mtch = pat.match(txt)  # match() anchors at the start of txt
        return mtch.group() if mtch else ''

    # otypes is given explicitly: without it np.vectorize has to call extr on
    # the first element to infer the output type, which raises ValueError
    # for an empty input.
    d['B'] = np.vectorize(extr, otypes=[str])(d.C)
    return d
第一个速度提升来自预先编译正则表达式。
第二个提升来自使用 NumPy 的 vectorize，而不是列表推导式。
运行一个与 Scott 所用类似的测试循环，我得到的结果如下：
所以“我的”执行时间（红线），特别是对于较大的数据量，约占regxstracc和regxcompre的60%

网友
3楼 · 编辑于 2024-04-19 22:22:57

试试先对 d1 使用 explode，这样可以少写一层循环：

# Flatten d1's prefix lists once; exploding inside the comprehension would
# repeat the same work for every value of d2.C.
prefixes = d1.B.explode().tolist()

# Matching prefixes of one value are prefixes of each other, so max() picks
# the longest.  default=None avoids ValueError when nothing matches.
[max((z for z in prefixes if x.startswith(z)), default=None) for x in d2.C.astype(str)]

['8420', '8421', '8426', '84', '8470', '8470', '8475']

计时：

相关问题更多 >

编程相关推荐

热门问题

热门文章