将数据集从“宽”格式转换为“长”格式，并将时间列转换为时间格式以进行时间序列分析

# generate/import dataset
import pandas as pd

df = pd.DataFrame({'level':[A,B,C,D,E,F],
       'Time_30':[1993.05,1999.45, 2001.11, 2007.39, 2219.77],
       'Time_60':[2123.15,2299.59, 2339.19, 2443.37, 2553.15],
       'Time_90':[2323.56,2495.99,2499.13, 2548.71, 2656.0],
       'Time_120':[2355.52,2491.19,2519.92,2611.81, 2753.11],
       'Time_150':[2425.31,2599.51, 2539.9, 2713.77, 2893.58],
       'Time_180':[2443.35,2609.92, 2632.49, 2774.03, 2901.25]} )

# first series
level, time, count
A, 30, 1993.05
B, 60, 2123.15
C, 90, 2323.56
D, 120, 2355.52
E, 150, 2425.31
F, 180, 2443.35
# 2nd series
level,time,count
A,30,1999.45
B,60,2299.59
C,90,2495.99
D,120,2491.19
E,150,2599.51
F,180,2609.92
.
.
.
.
# up until the last series

# (I)
df1 = pd.melt(df,id_vars = ['level'],var_name = 'time',value_name = 'count')
#
# (II)
df1['time'] = pd.to_datetime(df1['time'],format= '%H:%M:%S' ).dt.time
OR
df1['time'] = pd.to_timedelta(df1['time'], unit='m')
# (III)
plt.figure(figsize=(10,5))
plt.plot(df1)
for timex in range(30,180):
    plt.axvline(datetime(timex,1,1), color='k', linestyle='--', alpha=0.3)
# Perform STL Decomp
stl = STL(df1)
result = stl.fit()
seasonal, trend, resid = result.seasonal, result.trend, result.resid
plt.figure(figsize=(8,6))
plt.subplot(4,1,1)
plt.plot(df1)
plt.title('Original Series', fontsize=16)
plt.subplot(4,1,2)
plt.plot(trend)
plt.title('Trend', fontsize=16)
plt.subplot(4,1,3)
plt.plot(seasonal)
plt.title('Seasonal', fontsize=16)
plt.subplot(4,1,4)
plt.plot(resid)
plt.title('Residual', fontsize=16)
plt.tight_layout()
estimated = trend + seasonal
plt.figure(figsize=(12,4))
plt.plot(df1)
plt.plot(estimated)
plt.figure(figsize=(10,4))
plt.plot(resid)
# Anomaly detection
resid_mu = resid.mean()
resid_dev = resid.std()
lower = resid_mu - 3*resid_dev
upper = resid_mu + 3*resid_dev
anomalies = df1[(resid < lower) | (resid > upper)] # returns the datapoints with the anomalies
anomalies
plt.plot(df1)
for timex in range(30,180):
    plt.axvline(datetime(timex,1,1), color='k', linestyle='--', alpha=0.6)
plt.scatter(anomalies.index, anomalies.count, color='r', marker='D')

1条回答

网友

1楼 · 发布于 2024-05-14 01:08:55

按照我在上面评论中所说的，我对您的示例数据框做了一些小改动（给 level 的各个值加上了引号，并让它的长度与其他列保持一致）：

import pandas as pd

# Wide-format sample data: one row per level, one Time_<n> column per
# measurement offset. Written row-wise so each level's series reads left to right.
df = pd.DataFrame(
    [
        ('A', 1993.05, 2123.15, 2323.56, 2355.52, 2425.31, 2443.35),
        ('B', 1999.45, 2299.59, 2495.99, 2491.19, 2599.51, 2609.92),
        ('C', 2001.11, 2339.19, 2499.13, 2519.92, 2539.9, 2632.49),
        ('D', 2007.39, 2443.37, 2548.71, 2611.81, 2713.77, 2774.03),
        ('E', 2219.77, 2553.15, 2656.0, 2753.11, 2893.58, 2901.25),
    ],
    columns=['level', 'Time_30', 'Time_60', 'Time_90',
             'Time_120', 'Time_150', 'Time_180'],
)

首先，把 Time_* 列名转换为整数值：

# Map every 'Time_<n>' column name to its integer offset <n>. Only columns
# carrying the 'Time_' prefix are considered, so unrelated columns are left
# untouched instead of failing the int() conversion.
time_labels = {
    c: int(c.replace("Time_", ""))
    for c in df.columns
    if c != 'level' and c.startswith("Time_")
}
timecols = list(time_labels.values())

# Relabel by name rather than by position: assigning a new column list
# would silently mislabel the data whenever 'level' is not the first column.
df.rename(columns=time_labels, inplace=True)

之后，您就可以像原先设想的那样使用 pd.melt()，得到一个把上面提到的所有“系列”拼接在一起的数据框：

# Reshape wide -> long: one row per (level, time) pair holding its count,
# ordered by level and then by time, with a fresh 0..n-1 index.
df1 = (
    pd.melt(
        df,
        id_vars=['level'],
        value_vars=timecols,
        var_name='time',
        value_name='count',
    )
    .sort_values(by=['level', 'time'])
    .reset_index(drop=True)
)

print(df1.head(10))
  level time    count
0     A   30  1993.05
1     A   60  2123.15
2     A   90  2323.56
3     A  120  2355.52
4     A  150  2425.31
5     A  180  2443.35
6     B   30  1999.45
7     B   60  2299.59
8     B   90  2495.99
9     B  120  2491.19

如果要按 level 逐个循环，可以用下面的方式选出每个 level 对应的行：

# Visit each distinct level and pull out only the rows belonging to it.
for level in df1['level'].unique():
    belongs_to_level = df1['level'] == level
    tmp = df1[belongs_to_level]

或

# Same per-level selection, but tmp is an explicit copy of the selected rows.
for level in df1['level'].unique():
    belongs_to_level = df1['level'] == level
    tmp = df1[belongs_to_level].copy()

…如果您打算修改 tmp 数据框或向其中添加数据，请使用带 .copy() 的这种写法。

至于生成时间戳，您可以这样做：

# Interpret the numeric 'time' values as minutes and store them as timedeltas.
minutes = df1['time']
df1['time'] = pd.to_timedelta(minutes, unit='min')

…就像您之前尝试的那样，不过是否合适取决于您打算如何使用它。如果您只想要形如“00:30:00”的字符串，可以尝试以下方法：

def _as_hms(delta):
    """Format a Timedelta as zero-padded 'HH:MM:SS'.

    Hours keep counting past 23 (e.g. 25 hours -> '25:00:00') so that long
    durations are not truncated. Missing values are returned as their
    string form ('NaT').
    """
    if pd.isna(delta):
        return str(delta)
    seconds = int(delta.total_seconds())
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, secs)


# Taking the last 8 characters of str(Timedelta) silently drops the
# "N days" part, so 1500 minutes would come out as '01:00:00'. Building the
# string from the total number of seconds keeps the full duration.
df1['time'] = pd.to_timedelta(df1['time'], unit='min').apply(_as_hms)

无论如何，希望这些能帮您走上正轨。

相关问题更多 >

编程相关推荐

热门问题

热门文章