如何提高Python循环的性能？

from tqdm import tqdm import pandas as pd # Treasury maturities, in years treasury_maturities = [1/12, 2/12, 3/12, 6/12, 1, 2, 3, 5, 7, 10, 20, 30] # Useful lists treasury_maturities1 = [3/12, 6/12, 1, 2, 3, 5, 7, 10, 20, 30] treasury_maturities2 = [1/12] treasury_maturities3 = [6/12, 1, 2, 3, 5, 7, 10, 20, 30] treasury_maturities4 = [1, 2, 3, 5, 7, 10, 20, 30] treasury_maturities5 = [1/12, 2/12, 3/12, 6/12, 1, 2, 3, 5, 7, 10, 20] # Dataframe that will contain the difference between the time to maturity of option and the different maturities differences = pd.DataFrame(columns = treasury_maturities) # Options Dataframe sample options_list = [[pd.to_datetime("2004-01-02"), pd.to_datetime("2004-01-17"), 800.0, "c", 309.1, 311.1, 1108.49, 1108.49, 0.0410958904109589, 310.1], [pd.to_datetime("2004-01-02"), pd.to_datetime("2004-01-17"), 800.0, "p", 0.0, 0.05, 1108.49, 1108.49, 0.0410958904109589, 0.025]] options = pd.DataFrame(options_list, columns = ['QuoteDate', 'expiration', 'strike', 'OptionType', 'bid_eod', 'ask_eod', 'underlying_bid_eod', 'underlying_ask_eod', 'Time_to_Maturity', 'Option_Average_Price']) # Loop for index, row in tqdm(options.iterrows()): if pd.to_datetime("2004-01-02") <= row.QuoteDate <= pd.to_datetime("2018-10-15"): if pd.to_datetime("2004-01-02") <= row.QuoteDate <= pd.to_datetime("2006-02-08") and row.Time_to_Maturity > 25: list_s = ([abs(maturity - row.Time_to_Maturity) for maturity in treasury_maturities5]) list_s = [list_s + [40]] # 40 is an arbitrary number bigger than 30 differences = differences.append(pd.DataFrame(list_s, columns = treasury_maturities), ignore_index = True) elif (pd.to_datetime("2008-12-10") or pd.to_datetime("2008-12-18") or pd.to_datetime("2008-12-24")) == row.QuoteDate and 1.5/12 <= row.Time_to_Maturity <= 3.5/12: list_s = [0, 40, 40] list_s = [list_s + [abs(maturity - row.Time_to_Maturity) for maturity in treasury_maturities3]] differences = differences.append(pd.DataFrame(list_s, columns = treasury_maturities), ignore_index = True) elif (pd.to_datetime("2008-12-10") or pd.to_datetime("2008-12-18") or pd.to_datetime("2008-12-24")) == row.QuoteDate and 3.5/12 < row.Time_to_Maturity <= 4.5/12: list_s = ([abs(maturity - row.Time_to_Maturity) for maturity in treasury_maturities2]) list_s = list_s + [40, 40, 0] list_s = [list_s + [abs(maturity - row.Time_to_Maturity) for maturity in treasury_maturities4]] differences = differences.append(pd.DataFrame(list_s, columns = treasury_maturities), ignore_index = True) else: if 1.5/12 <= row.Time_to_Maturity <= 2/12: list_s = [0, 40] list_s = [list_s + [abs(maturity - row.Time_to_Maturity) for maturity in treasury_maturities1]] differences = differences.append(pd.DataFrame(list_s, columns = treasury_maturities), ignore_index = True) elif 2/12 < row.Time_to_Maturity <= 2.5/12: list_s = ([abs(maturity - row.Time_to_Maturity) for maturity in treasury_maturities2]) list_s = list_s + [40, 0] list_s = [list_s + [abs(maturity - row.Time_to_Maturity) for maturity in treasury_maturities3]] differences = differences.append(pd.DataFrame(list_s, columns = treasury_maturities), ignore_index = True) else: list_s = [[abs(maturity - row.Time_to_Maturity) for maturity in treasury_maturities]] differences = differences.append(pd.DataFrame(list_s, columns = treasury_maturities), ignore_index = True) else: list_s = [[abs(maturity - row.Time_to_Maturity) for maturity in treasury_maturities]] differences = differences.append(pd.DataFrame(list_s, columns = treasury_maturities), ignore_index = True)

3条回答

网友

1楼 · 编辑于 2024-06-10 11:32:56

对于您的问题“divide and conquer”可以指导您找到解决方案。我建议把你的代码分成几块，然后分析每一部分，因为，我明白了像这样的一些冗余：

(pd.to_datetime("2008-12-10") or pd.to_datetime("2008-12-18") or pd.to_datetime("2008-12-24"))

从字符串到日期时间的转换似乎是在每一行完成的。必须使用profile或更具体的工具（如perf_tool[*]）来评测代码。它通过在代码中加入一些哨兵并报告所有中间时间、呼叫数和平均值来帮助您

[*]我是主要的开发者

网友

2楼 · 编辑于 2024-06-10 11:32:56

简短回答

循环和if语句都是计算开销较大的操作，因此请寻找减少使用次数的方法

Loop Optimization：- 加速编程循环的最佳方法是将尽可能多的计算移出循环

DRY：- 不要重复你自己。您有几个冗余的if条件，查看嵌套的if条件并遵循DRY原则

使用熊猫和numpy

pandas和numpy等库的主要优点之一是，它们的设计目的是提高对数组的数学运算效率（请参见Why are numpy arrays so fast?）。这意味着您通常根本不必使用循环。不要在循环中创建新的数据帧，而是为正在计算的每个值创建一个新列

要解决不同日期等的不同逻辑问题，请筛选行并应用逻辑，使用掩码/筛选器仅选择需要操作的行，而不是使用if语句（请参见pandas filtering tutorial）

代码示例

此代码不是逻辑的复制，而是如何实现它的示例。它并不完美，但应该提供一些重大的效率改进

import pandas as pd
import numpy as np

# Maturity periods, months and years
month_periods = np.array([1, 2, 3, 6, ], dtype=np.float64)
year_periods = np.array([1, 2, 3, 4, 5, 7, 10, 20, 30, ], dtype=np.float64)

# Create column names for maturities
maturity_cols = [f"month_{m:02.0f}" for m in month_periods] + [f"year_{y:02.0f}" for y in year_periods]

# Normalise months  & concatenate into single array
month_periods = month_periods / 12
maturities = np.concatenate((month_periods, year_periods))

# Create some dummy data
np.random.seed(seed=42)  # Seed PRN generator
date_range = pd.date_range(start="2004-01-01", end="2021-01-30", freq='D')  # Dates to sample from
dates = np.random.choice(date_range, size=n_records, replace=True)
maturity_times = np.random.random(size=n_records)
options = pd.DataFrame(list(zip(dates, maturity_times)), columns=['QuoteDate', 'Time_to_Maturity', ])

# Create date masks
after = options['QuoteDate'] >= pd.to_datetime("2008-01-01")
before = options['QuoteDate'] <= pd.to_datetime("2015-01-01")

# Combine date masks / create flipped version
between = after & before
outside = np.logical_not(between)

# Select data with masks
df_outside = options[outside].copy()
df_between = options[between].copy()

# Smaller dataframes
df_a = df_between[df_between['Time_to_Maturity'] > 25].copy()
df_b = df_between[df_between['Time_to_Maturity'] <= 3.5 / 12].copy()
df_c = df_between[df_between['Time_to_Maturity'] <= 4.5 / 12].copy()
df_d = df_between[
    (df_between['Time_to_Maturity'] >= 2 / 12) & (df_between['Time_to_Maturity'] <= 4.5 / 12)].copy()

# For each maturity period, add difference column using different formula
for i, col in enumerate(maturity_cols):
    # Add a line here for each subset / chunk of data which requires a different formula
    df_a[col] = ((maturities[i] - df_outside['Time_to_Maturity']) + 40).abs()
    df_b[col] = ((maturities[i] - df_outside['Time_to_Maturity']) / 2) .abs()
    df_c[col] = (maturities[i] - df_outside['Time_to_Maturity'] + 1).abs()
    df_d[col] = (maturities[i] - df_outside['Time_to_Maturity'] * 0.8).abs()
    df_outside[col] = (maturities[i] - df_outside['Time_to_Maturity']).abs()

# Concatenate dataframes back to one dataset
frames = [df_outside, df_a, df_b, df_c, df_d, ]
output = pd.concat(frames).dropna(how='any')

output.head()

记录数的平均执行时间
即使是数百万条记录也能快速处理（内存允许） |记录|旧时间（秒）|新时间（秒）|改进| |-|-|-|-| | 10 | 0.0105 | 0.0244 | -132.38% | | 100 | 0.1078 | 0.0249 | 76.90% | |1000（1k）| 1.03 | 0.0249 | 97.58%| |10000（10k）| 15.629 | 0.0322 | 99.79%| |100000（100k）| 182.014 | 0.065 | 99.96%| |一百万（百万）|？|0.4014 | ? | |10000000（10米）|？|4.7488 | ? | |14000000（14米）|？|6.0172 | ? | |100000000（100m）|？|83.286 | ? |

进一步优化

优化和分析基本代码后，还可以研究多线程、并行化代码或使用不同的语言。另外，1400万条记录将占用大量RAM，远远超过大多数工作站的处理能力。要绕过此限制，您可以分块读取文件本身，并一次对一个块执行计算：

result_frames = []
for chunk in pd.read_csv("voters.csv", chunksize=10000):
    # Do things here
    result = chunk
    result_frames.append(result)

谷歌搜索词：多处理/线程化/Dask/PySpark

网友

3楼 · 编辑于 2024-06-10 11:32:56

正如其他人已经指出的，请分析您的代码以找到最慢的部分

一些可能的加速：

只要可能，考虑使用生成器而不是列表。此外，使用list.extend可能比列表串联更快

list_s = ([abs(maturity - row.Time_to_Maturity) for maturity in 
                           treasury_maturities2)

可能是

list_s = (abs(maturity - row.Time_to_Maturity) for maturity in 
                           treasury_maturities2)

及

list_s = list_s + [foo, bar, baz]

可能是

list_s = list_s.extend([foo, bar, baz])

简短回答

使用熊猫和numpy

代码示例

进一步优化

相关问题更多 >

编程相关推荐

热门问题

热门文章