行中零序列前后的事件百分比

   ID  0   1   2   3  4  5   6   7  8  ...  81  82  83  84  85  86  87  88  89  90  total
-----------------------------------------------------------------------------------------
0   A  2  21   0  18  3  0   0   0  2  ...   0   0   0   0   0   0   0   0   0   0    156
1   B  0  20  12   2  0  8  14  23  0  ...   0   0   0   0   0   0   0   0   0   0    231
2   C  0  38  19   3  1  3   3   7  1  ...   0   0   0   0   0   0   0   0   0   0     78
3   D  3   0   0   1  0  0   0   0  0  ...   0   0   0   0   0   0   0   0   0   0      5

def func(row, n, ncols=91):
    """Return the percentage of a row's events that occur before the first
    run of at least `n` consecutive zeros.

    Parameters
    ----------
    row : pandas.Series
        One row of the frame. Event counts are read from the integer labels
        0 .. ncols - 1 and the row sum from the label 'total'.
    n : int
        Minimum length of the run of zeros to look for.
    ncols : int, optional
        Number of event columns to scan. Defaults to 91 (labels 0 .. 90).

    Returns
    -------
    float
        100 * (events up to the start of the run) / row['total'], or 100.0
        when the row contains no run of at least `n` zeros.
    """
    counts = row.loc[np.arange(ncols)]
    is_zero = counts == 0

    # Every zero is tagged with the number of non-zero entries seen so far,
    # so the zeros of one uninterrupted run all share the same tag.
    run_id = (~is_zero).cumsum()[is_zero]
    run_length = run_id.groupby(run_id).size()
    long_runs = run_length[run_length >= n].index

    if len(long_runs) == 0:
        # No run is long enough: every event counts as "before".
        return 100.0

    first_zero = run_id[run_id == long_runs[0]].index[0]
    # The leading zero of the run is included in the lookup; it adds nothing.
    events_before = row.loc[np.arange(first_zero + 1)].sum()
    # Always report a percentage so both branches use the same unit.
    return 100.0 * events_before / row['total']

   ID  0   1   2   3  4  5   6   7  8  ...  81  82  83  84  85  86  87  88  89  90  total  %_before
---------------------------------------------------------------------------------------------------
0   A  2  21   0  18  3  0   0   0  2  ...   0   0   0   0   0   0   0   0   0   0    156        43
1   B  0  20  12   2  0  8  14  23  0  ...   0   0   0   0   0   0   0   0   0   0    231        21
2   C  0  38  19   3  1  3   3   7  1  ...   0   0   0   0   0   0   0   0   0   0     78        90
3   D  3   0   0   1  0  0   0   0  0  ...   0   0   0   0   0   0   0   0   0   0      5       100

# Sample data: each Series holds the 15 event counts of one row.
a = pd.Series([1, 1, 13, 0, 0, 0, 4, 0, 0, 0, 0, 0, 12, 1, 1])
b = pd.Series([1, 1, 13, 0, 0, 0, 4, 12, 1, 12, 3, 0, 0, 5, 1])
c = pd.Series([1, 1, 13, 0, 0, 0, 4, 12, 2, 0, 5, 0, 5, 1, 1])
d = pd.Series([1, 1, 13, 0, 0, 0, 4, 12, 1, 12, 4, 50, 0, 0, 1])
e = pd.Series([1, 1, 13, 0, 0, 0, 4, 12, 0, 0, 0, 54, 0, 1, 1])

# Each Series becomes a column keyed '0'..'4'; transposing turns them into
# rows, leaving the 15 positions 0..14 as the columns.
df = pd.DataFrame({'0': a, '1': b, '2': c, '3': d, '4': e})
df = df.transpose()

3条回答

网友

1楼 · 编辑于 2024-04-23 23:31:21

尝试一下：

def percent_before(row, n, ncols):
    """Return the fraction of a row's events that occur before the first run
    of at least `n` consecutive zeros.

    Parameters
    ----------
    row : pandas.Series
        One row of the frame. Event counts are looked up under the string
        labels '0', '1', ..., str(ncols - 1) and the row sum under 'total'.
    n : int
        Minimum length of the run of zeros to look for.
    ncols : int
        Number of event columns to scan.

    Returns
    -------
    float
        Sum of the events before the run divided by row['total'], or 1.0
        when the row contains no run of at least `n` zeros.
    """
    # Address the event columns by label for both the scan and the sum, so
    # the result does not depend on where those columns sit inside the row
    # (integer keys on a string-labelled Series are positional at best).
    labels = np.arange(ncols).astype(str)
    counts = row.loc[labels].to_numpy()

    last_nonzero, run_length = 0, 0
    for position, count in enumerate(counts):
        if count == 0:
            # the current run of zeros grows
            run_length += 1
        elif run_length >= n:
            # the run that just ended is long enough
            break
        else:
            # the run was too short: restart after this non-zero entry
            last_nonzero = position
            run_length = 0

    if run_length < n:
        # no run of at least n zeros anywhere in the row
        return 1.0

    # Everything up to the last non-zero entry in front of the run.
    return counts[: last_nonzero + 1].sum() / row['total']

# Score every row (axis=1); n and ncols are forwarded to percent_before as
# keyword arguments, and the results are stored in a new column.
df['percent_before'] = df.apply(percent_before, n=3, ncols=15, axis=1)

结果:

   0  1   2  3  4  5  6   7  8   9  10  11  12  13  14  total  percent_before
0  1  1  13  0  0  0  4   0  0   0   0   0  12   1   1     33        0.454545
1  1  1  13  0  0  0  4  12  1  12   3   0   0   5   1     53        0.283019
2  1  1  13  0  0  0  4  12  2   0   5   0   5   1   1     45        0.333333
3  1  1  13  0  0  0  4  12  1  12   4  50   0   0   1     99        0.151515
4  1  1  13  0  0  0  4  12  0   0   0  54   0   1   1     87        0.172414

对于完整的数据框，调用 apply 时请使用 ncols=91。

网友

2楼 · 编辑于 2024-04-23 23:31:21

另一种可能的解决办法：

def get_vals(df, n):
    """Return, for every row of `df`, the fraction of the row's events that
    occur before the first run of at least `n` consecutive zeros.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns hold the event counts in order; every column is
        treated as an event column.
    n : int
        Minimum length of the run of zeros to look for.

    Returns
    -------
    list of float
        One value per row: the sum of the entries before the run divided by
        the row sum, or 1.0 when the row has no run of at least `n` zeros.
    """
    out = []
    for _, events in df.T.items():
        # Consecutive equal values share a run id; the id increases whenever
        # a value differs from its predecessor.
        run_id = (events != events.shift(1)).cumsum()
        run_length = run_id.map(run_id.value_counts())

        # Only runs made of zeros qualify: a long run of any other repeated
        # value must not be mistaken for the island of zeros.
        in_long_zero_run = ((events == 0) & (run_length >= n)).to_numpy()
        if in_long_zero_run.any():
            start = in_long_zero_run.argmax()  # position of the first such zero
            out.append(events.iloc[:start].sum() / events.sum())
        else:
            out.append(1.0)
    return out

# Store the per-row values returned by get_vals in a new column, then
# display the frame.
df['percent_before'] = get_vals(df, n=3)
print(df)

输出：

   0  1   2  3  4  5  6   7  8   9  10  11  12  13  14  percent_before
0  1  1  13  0  0  0  4   0  0   0   0   0  12   1   1        0.454545
1  1  1  13  0  0  0  4  12  1  12   3   0   0   5   1        0.283019
2  1  1  13  0  0  0  4  12  2   0   5   0   5   1   1        0.333333
3  1  1  13  0  0  0  4  12  1  12   4  50   0   0   1        0.151515
4  1  1  13  0  0  0  4  12  0   0   0  54   0   1   1        0.172414

网友

3楼 · 编辑于 2024-04-23 23:31:21

由于上一个问题的一个评论是关于速度的，我想你可以尝试将问题矢量化。我使用此数据帧尝试（与原始输入略有不同）：

  ID  0   1   2   3  4  5   6   7  8  total
0  A  2  21   0  18  3  0   0   0  2     46
1  B  0   0  12   2  0  8  14  23  0     59
2  C  0  38  19   3  1  3   3   7  1     75
3  D  3   0   0   1  0  0   0   0  0      4

思路是通过链式调用构造一个掩码：先用 ne(0) 找出数据不等于 0 的位置，沿列方向（axis=1）做 cumsum，再用 diff(n) 找出差值等于 0 的位置，也就是连续 n 个 0 出现的地方。为了只保留第一个这样的序列，可以使用 cummax，这样同一行中该位置之后的所有列都会被标记为 True。最后用这个掩码的取反来过滤原始数据框，按行求和，再除以 total 列。例如，n=2 时：

# Minimum length of the run of zeros to look for.
n=2
# The mask is built step by step along each row (axis=1):
#   df.ne(0).cumsum(axis=1) -> running count of non-zero cells
#   .diff(n, axis=1)        -> change of that count over the last n columns
#   [range(9)]              -> keep only the columns labelled 0..8
#   .eq(0)                  -> True where the count did not change, i.e. the
#                              last n cells were all zero
#   .cummax(axis=1)         -> once True, stay True for every later column
# df[~mask] keeps only the cells where the mask is False, so the row sum
# counts what comes before the first run; it is then divided by df.total.
# NOTE(review): ne(0)/cumsum run over every column of df, not just 0..8 --
# confirm the non-event columns do not disturb the result.
df['%_before'] = df[~(df.ne(0).cumsum(axis=1).diff(n, axis=1)[range(9)]
                        .eq(0).cummax(axis=1))].sum(axis=1)/df.total
# Display the frame with the new column.
print (df)
  ID  0   1   2   3  4  5   6   7  8  total  %_before
0  A  2  21   0  18  3  0   0   0  2     46  0.956522
1  B  0   0  12   2  0  8  14  23  0     59  0.000000
2  C  0  38  19   3  1  3   3   7  1     75  1.000000
3  D  3   0   0   1  0  0   0   0  0      4  0.750000

在您的情况下，需要把 range(9) 改为 range(91)，才能覆盖所有列。

相关问题更多 >

编程相关推荐

热门问题

热门文章