如何避免在遍历 DataFrame 时 Python 计算耗时过长

# First attempt from the question: within every (frame, direction) group,
# count for each id how many other ids lie within 50 and within 100 units.
# Every ordered pair (i, j) is visited, and each coordinate lookup filters
# the whole frame with a boolean mask, which is what makes this version slow.
df['neighbor_50'] = 0
df['neighbor_100'] = 0

frame_group = df.groupby(['frame', 'direction'])
list_keys = list(frame_group.indices.keys())

for frame, direction in list_keys:
    # Rows that belong to the current (frame, direction) group.
    in_group = (df['frame'] == frame) & (df['direction'] == direction)
    ids = df[in_group]['id']
    for i in ids:
        for j in ids:
            if i == j:
                # An id is never its own neighbor.
                continue
            row_i = in_group & (df['id'] == i)
            row_j = in_group & (df['id'] == j)
            dx = df[row_i]['x'].iloc[0] - df[row_j]['x'].iloc[0]
            dy = df[row_i]['y'].iloc[0] - df[row_j]['y'].iloc[0]
            # Euclidean distance between the two points.
            distance = (dx**2 + dy**2)**0.5
            # Only row i is credited here; row j is credited when the
            # loops reach the mirrored pair (j, i).
            if distance <= 50:
                df.loc[row_i, 'neighbor_50'] += 1
            if distance <= 100:
                df.loc[row_i, 'neighbor_100'] += 1

import numpy as np

# Second attempt from the question: each unordered pair of ids inside a
# (frame, direction) group is visited once, and both rows of the pair are
# credited when they are close enough. Coordinate lookups still filter the
# whole frame with a boolean mask for every pair.
df['neighbor_50'] = 0
df['neighbor_100'] = 0

frame_group = df.groupby(['frame', 'direction'])
list_keys = list(frame_group.indices.keys())

for frame, direction in list_keys:
    # Rows that belong to the current (frame, direction) group.
    in_group = (df['frame'] == frame) & (df['direction'] == direction)
    ids = df[in_group]['id'].values
    for pos, id1 in enumerate(ids):
        # Pair id1 only with the ids that come after it, so that each
        # unordered pair is handled exactly once.
        for id2 in ids[pos + 1:]:
            first = in_group & (df['id'] == id1)
            second = in_group & (df['id'] == id2)
            distance = np.hypot(
                df[first]['x'].iloc[0] - df[second]['x'].iloc[0],
                df[first]['y'].iloc[0] - df[second]['y'].iloc[0],
            )
            if distance <= 100:
                df.loc[first, 'neighbor_100'] += 1
                df.loc[second, 'neighbor_100'] += 1
                # A pair within 50 units is necessarily within 100 as well.
                if distance <= 50:
                    df.loc[first, 'neighbor_50'] += 1
                    df.loc[second, 'neighbor_50'] += 1

1条回答

网友

1楼 · 发布于 2024-05-14 14:44:00

实现方式有好几种；在不使用 scipy 或 numpy 的前提下，下面这种可能是最快的：

import pandas

# Only for generation of test data
import random
import itertools

# Only for debugging (shows progress bar)
from tqdm import tqdm

tqdm.pandas()


# Build a synthetic data set: 100 ids, each observed in frames 0..49, with a
# random direction (0 or 1) and random x/y coordinates strictly between
# -100 and 100. Rows are ordered by id first, then by frame.
df = pandas.DataFrame(
    [
        {
            "id": int(identifier),
            "frame": frame,
            "direction": random.randint(0, 1),
            "x": random.random() * random.randint(-100, 100),
            "y": random.random() * random.randint(-100, 100),
        }
        for identifier in range(100)
        for frame in range(50)
    ]
)


def get_neighbor_count(group: pandas.DataFrame) -> pandas.DataFrame:
    """Count same-direction neighbors within 50 and 100 units for every row.

    Every row of ``group`` is paired with every row of ``group`` (a self
    cross join). Pairs that share the same ``id`` or have different
    ``direction`` values are discarded, and the remaining pairs are counted
    per source row by their Euclidean distance on ``x``/``y``.

    The frame passed in is left untouched: the helper columns are added to a
    copy. pandas documents that functions handed to ``groupby(...).apply``
    must not mutate the object they receive.

    Args:
        group: Rows to compare with each other. Must contain the columns
            ``id``, ``direction``, ``x`` and ``y``. Intended to be the rows
            of a single frame, as produced by ``df.groupby("frame")``.

    Returns:
        A new frame holding the rows of ``group`` plus the helper columns
        ``key`` and ``index`` (kept so the output layout stays the same as
        before) and the count columns ``neighbor_50`` and ``neighbor_100``.
        A row without any neighbor in range gets NaN rather than 0, which
        turns that count column into floats.
    """
    # ``key`` is a constant join key that turns the merge into a cross join;
    # ``index`` remembers which source row each pair belongs to. ``assign``
    # returns a new frame, so the caller's ``group`` is not modified.
    work = group.assign(key=0, index=group.index)

    # Cartesian product of the group with itself; the right-hand side's
    # columns get the ``_target`` suffix.
    pairs = work.merge(work, on="key", suffixes=("", "_target")).drop(
        columns=["key"]
    )

    # Keep only pairs of two different ids moving in the same direction.
    is_other_id = pairs["id"] != pairs["id_target"]
    same_direction = pairs["direction"] == pairs["direction_target"]
    pairs = pairs.loc[is_other_id & same_direction]

    # Euclidean distance between source and target point. Kept as a separate
    # Series instead of being written onto the filtered slice.
    distance = (
        (pairs["x_target"] - pairs["x"]) ** 2
        + (pairs["y_target"] - pairs["y"]) ** 2
    ) ** 0.5

    # Number of targets in range, grouped by the source row's original index
    # label so the counts can be aligned with ``work`` below.
    neighbor_50 = pairs.loc[distance <= 50].groupby("index")["id_target"].count()
    neighbor_100 = pairs.loc[distance <= 100].groupby("index")["id_target"].count()

    # Column-wise concat aligns on the index; source rows missing from a
    # count series end up as NaN.
    return pandas.concat(
        [work, neighbor_50.rename("neighbor_50"), neighbor_100.rename("neighbor_100")],
        axis=1,
    )


# Run the neighbor count once per frame; progress_apply is the tqdm variant
# of apply that shows a progress bar while the groups are processed.
per_frame = df.groupby("frame").progress_apply(get_neighbor_count)

# Drop the outermost index level of the grouped result and put the rows back
# in their original index order.
df = per_frame.droplevel(0).sort_index()
print(df)

输出：

100%|██████████| 50/50 [00:01<00:00, 45.30it/s]
      id  frame  direction          x          y  key  index  neighbor_50  \
0      0      0          1   1.154344  10.168371    0      0         30.0   
1      0      0          1  11.581348  -2.878279    0      1         33.0   
2      0      0          1  -3.577821  -0.989225    0      2         31.0   
3      0      0          1 -39.239073 -21.107833    0      3         20.0   
4      0      0          1 -30.330413  55.736254    0      4          7.0   
...   ..    ...        ...        ...        ...  ...    ...          ...   
4995  99      0          0 -22.787323 -19.553391    0   4995         32.0   
4996  99      0          0 -82.955598  11.663767    0   4996          4.0   
4997  99      0          0   0.929322  10.133367    0   4997         33.0   
4998  99      0          0 -16.414550   0.218805    0   4998         35.0   
4999  99      0          0  31.408102  -1.500168    0   4999         27.0   

      neighbor_100  
0               49  
1               48  
2               49  
3               44  
4               37  
...            ...  
4995            48  
4996            33  
4997            48  
4998            49  
4999            45  

[5000 rows x 9 columns]

注意：输出列的数据类型可能是浮点型（float）：某一行没有邻居时，其值为 NaN，而 NaN 无法用 int 表示。只有当所有行都至少有一个邻居时，该列的数据类型才是 int。如需整数计数，可用 .fillna(0).astype(int) 将 NaN 替换为 0。

相关问题更多 >

编程相关推荐

热门问题

热门文章