按组反转pandas数据框
我有以下代码
df = pd.read_csv("some_data.csv")

# Integer-dividing the row index by `ticks` buckets the rows into consecutive
# runs; each bucket's "close" column is summarised as
# (first value, last value, maximum, minimum) and handed to Candle.
candles = [
    Candle(closes.iloc[0], closes.iloc[-1], max(closes), min(closes))
    for closes in (chunk["close"] for _, chunk in df.groupby(df.index // ticks))
]
# Flip in place so the candle built from the last bucket comes first.
candles.reverse()
这段代码用来处理一个包含大量交易数据的表格。虽然它能正常工作，但我觉得有点笨重。所以我想问：能不能先把这个表格反转，然后再进行分组？
这是实际数据的一部分:
timestamp,close,security_code,volume,bid_volume,ask_volume
2024-02-28 01:00:00.358537+00:00,18002.5,NQ,1,0,1
2024-02-28 01:00:00.890809+00:00,18002.75,NQ,1,1,0
2024-02-28 01:00:00.890809+00:00,18002.75,NQ,1,1,0
2024-02-28 01:00:01.696411+00:00,18002.5,NQ,1,0,1
2024-02-28 01:00:02.268716+00:00,18002.25,NQ,1,0,1
2024-02-28 01:00:02.513397+00:00,18002.5,NQ,1,1,0
2024-02-28 01:00:03.716795+00:00,18002.5,NQ,1,0,1
2024-02-28 01:00:03.892441+00:00,18002.75,NQ,1,1,0
2024-02-28 01:00:03.893664+00:00,18002.25,NQ,1,0,1
2024-02-28 01:00:06.956017+00:00,18002.25,NQ,1,0,1
2024-02-28 01:00:08.144158+00:00,18002.25,NQ,1,1,0
2024-02-28 01:00:08.144158+00:00,18002.25,NQ,1,1,0
2024-02-28 01:00:08.772717+00:00,18002.0,NQ,1,0,1
2024-02-28 01:00:08.772717+00:00,18002.0,NQ,3,0,3
2024-02-28 01:00:09.966515+00:00,18002.25,NQ,1,1,0
2024-02-28 01:00:10.051715+00:00,18002.0,NQ,1,0,1
2024-02-28 01:00:11.053980+00:00,18001.75,NQ,1,0,1
2024-02-28 01:00:11.053980+00:00,18001.75,NQ,1,0,1
2024-02-28 01:00:11.296008+00:00,18002.0,NQ,1,1,0
2024-02-28 01:00:12.050765+00:00,18001.75,NQ,1,0,1
2024-02-28 01:00:12.050765+00:00,18001.5,NQ,1,0,1
2024-02-28 01:00:12.050765+00:00,18001.5,NQ,1,0,1
2024-02-28 01:00:12.050765+00:00,18001.5,NQ,1,0,1
2024-02-28 01:00:12.050765+00:00,18001.5,NQ,1,0,1
2024-02-28 01:00:12.050765+00:00,18001.5,NQ,1,0,1
2024-02-28 01:00:12.050765+00:00,18001.25,NQ,1,0,1
2024-02-28 01:00:12.050765+00:00,18001.25,NQ,1,0,1
2024-02-28 01:00:12.050765+00:00,18001.25,NQ,1,0,1
2024-02-28 01:00:12.050765+00:00,18001.25,NQ,2,0,2
1 个回答
1
直观上看，在分组操作中直接进行聚合计算似乎会更高效。下面是一个简单的例子：
class Candle:
    """Plain record holding the open, close, high and low values of one candle."""

    def __init__(self, open, close, high, low):
        """Store the four values unchanged on the instance.

        The parameter name ``open`` shadows the builtin inside this method
        only; it is kept so existing keyword calls keep working.
        """
        self.open = open
        self.close = close
        self.high = high
        self.low = low

    def __repr__(self):
        return (
            f"{type(self).__name__}(open={self.open!r}, close={self.close!r}, "
            f"high={self.high!r}, low={self.low!r})"
        )
# Rows per candle. The grouping step further down divides the row index by
# `ticks`; 5 turns the 50 sample rows into the 10 candles of the sample output.
ticks = 5
# 50 random integer closes drawn (with replacement) from 50..69.
df = pd.DataFrame({ 'close' : random.choices(range(50, 70),k=50) })
# Bare expression: only echoes the array in an interactive session.
df['close'].values
# Output of one (unseeded) run:
#
# array([66, 67, 57, 65, 64, 63, 59, 54, 57, 50, 58, 67, 69, 53, 54, 53, 54,
# 62, 53, 67, 69, 51, 65, 64, 56, 63, 58, 54, 50, 51, 63, 69, 55, 66,
# 54, 54, 64, 52, 52, 58, 57, 61, 64, 63, 53, 64, 50, 52, 68, 63],
# dtype=int64)
# Group keys are taken from the reversed index so they line up with the
# reversed rows; sort=False keeps the groups in that order of appearance,
# i.e. the bucket holding the last rows comes out first.
bucket_ids = df.index[::-1] // ticks
ohlc = df[::-1].groupby(bucket_ids, sort=False)['close'].agg(
    open='last',    # rows are reversed, so the last one seen is the bucket's earliest
    close='first',  # ...and the first one seen is the bucket's latest
    high='max',
    low='min',
)
# Column order (open, close, high, low) matches Candle's positional parameters.
candles = ohlc.apply(lambda row: Candle(*row), axis=1).tolist()
# Dump each candle's attribute dictionary, one per line.
for candle in candles:
    print(vars(candle))
示例输出:
{'open': 64, 'close': 63, 'high': 68, 'low': 50}
{'open': 57, 'close': 53, 'high': 64, 'low': 53}
{'open': 54, 'close': 58, 'high': 64, 'low': 52}
{'open': 63, 'close': 54, 'high': 69, 'low': 54}
{'open': 63, 'close': 51, 'high': 63, 'low': 50}
{'open': 69, 'close': 56, 'high': 69, 'low': 51}
{'open': 53, 'close': 67, 'high': 67, 'low': 53}
{'open': 58, 'close': 54, 'high': 69, 'low': 53}
{'open': 63, 'close': 50, 'high': 63, 'low': 50}
{'open': 66, 'close': 64, 'high': 67, 'low': 57}