根据条件新增一列，其值为满足条件的另一列的列名

# Input frame for the question: one row per date, one column per distance.
# Missing distances are real NaN values rather than the string 'NaN', so every
# dist* column is float64 and can be sorted/compared numerically.
nan = float('nan')
dfa = pd.DataFrame({
    'date': ['09-03-1988', '10-03-1988', '11-03-1988', '12-03-1988', '13-03-1988'],
    'dist1': [nan, 2, nan, nan, 30],
    'dist2': [20, 21, 22, 23, nan],
    'dist3': [120, nan, 122, 123, 11],
    'dist4': [40, nan, 42, 43, nan],
    'dist5': [nan, 1, nan, nan, 70],
})

# Expected result for the question: the input columns plus, for every row, the
# name and value of the smallest and second-smallest non-missing distance.
# Missing distances are real NaN values (the original mixed the strings 'NaN'
# and 'Nan'), so the dist* columns are float64.
nan = float('nan')
dfb = pd.DataFrame({
    'date': ['09-03-1988', '10-03-1988', '11-03-1988', '12-03-1988', '13-03-1988'],
    'dist1': [nan, 2, nan, nan, 30],
    'dist2': [20, 21, 22, 23, nan],
    'dist3': [120, nan, 122, 123, 11],
    'dist4': [40, nan, 42, 43, nan],
    'dist5': [nan, 1, nan, nan, 70],
    'fir_closest': ['dist2', 'dist5', 'dist2', 'dist2', 'dist3'],
    'fir_closest_dist': [20, 1, 22, 23, 11],
    'sec_closest': ['dist4', 'dist1', 'dist4', 'dist4', 'dist1'],
    'sec_closest_dist': [40, 2, 42, 43, 30],
})

2条回答

网友

1楼 · 编辑于 2024-04-26 12:42:45

假设您的数据帧名为df，并且运行了import pandas as pd和import numpy as np：

# Example data: five consecutive days for one name; dist1 is missing (NaN) on
# the first two days. Uses np.nan directly -- the pd.np alias was deprecated in
# pandas 1.0 and removed in pandas 2.0.
df = pd.DataFrame({'date': pd.date_range('2017-04-15', periods=5),
                   'name': ['Mullion'] * 5,
                   'dist1': [np.nan, np.nan, 30, 20, 15],
                   'dist2': [40, 30, 20, 15, 16],
                   'dist3': [101, 100, 98, 72, 11]})
df
        date  dist1  dist2  dist3     name
0 2017-04-15    NaN     40    101  Mullion
1 2017-04-16    NaN     30    100  Mullion
2 2017-04-17   30.0     20     98  Mullion
3 2017-04-18   20.0     15     72  Mullion
4 2017-04-19   15.0     16     11  Mullion

# Select only those columns with numeric data types. In your case, this is
# the same as:
# df_num = df[['dist1', 'dist2', ...]].copy()
df_num = df.select_dtypes(np.number)

# Rank each row's columns by ascending distance. NaN is first replaced with
# numpy's infinity so that a missing distance sorts last and is never chosen
# ahead of a real one.
idxs = df_num.fillna(np.inf).values.argsort(axis=1)

# idxs[:, 0] holds the column position of each row's smallest distance;
# idxs[:, 1] holds the column position of each row's second-smallest distance.

# Row positions, paired with a column position per row to pick one value
# from each row of the underlying numpy array.
row_idx = np.arange(len(df_num))

# Convert the position of each row's closest distance to a column name.
# (df_num.columns is a list-like that holds the column names of df_num.)
df['closest_name'] = df_num.columns[idxs[:, 0]]

# Get the distances themselves by indexing the underlying numpy array of
# values with (row position, column position) pairs.
df['closest_dist'] = df_num.values[row_idx, idxs[:, 0]]

# Same idea for the second-closest distances.
df['second_closest_name'] = df_num.columns[idxs[:, 1]]
df['second_closest_dist'] = df_num.values[row_idx, idxs[:, 1]]

df
        date  dist1  dist2  dist3     name closest_name  closest_dist  \
0 2017-04-15    NaN     40    101  Mullion        dist2          40.0   
1 2017-04-16    NaN     30    100  Mullion        dist2          30.0   
2 2017-04-17   30.0     20     98  Mullion        dist2          20.0   
3 2017-04-18   20.0     15     72  Mullion        dist2          15.0   
4 2017-04-19   15.0     16     11  Mullion        dist3          11.0   

  second_closest_name  second_closest_dist  
0               dist3                101.0  
1               dist3                100.0  
2               dist1                 30.0  
3               dist1                 20.0  
4               dist1                 15.0

网友

2楼 · 编辑于 2024-04-26 12:42:45

我想这可以满足你的需要。

import pandas as pd
import numpy as np

# Reproducibility and data generation for example
np.random.seed(0)
X = np.random.randint(low=0, high=10, size=(5, 5))

# Your data
df = pd.DataFrame(X, columns=[f'dist{j}' for j in range(5)])

# Snapshot the numeric values and the column names BEFORE adding any result
# column: once a string column is appended, df.values becomes an object array
# that also contains the new columns.
values = df.values
col_names = df.columns.values

# Row positions (one per row, not per column), used to pick one element
# from each row.
ix = range(df.shape[0])

# Per-row column positions ordered by ascending value. Transposing puts one
# rank per row of the result, so the first two unpack as the positions of the
# smallest and second-smallest value of every data row.
arg_row_min, arg_row_min2, *rest = np.argsort(values, axis=1).T

# Name and value of each row's smallest entry
df['dist_min'] = col_names[arg_row_min]
df['num_min'] = values[ix, arg_row_min]

# Name and value of each row's second-smallest entry
df['dist_min2'] = col_names[arg_row_min2]
df['num_min2'] = values[ix, arg_row_min2]

相关问题更多 >

编程相关推荐

热门问题

热门文章