如何从另一个数据帧有条件地更新一个数据帧

# -*- coding: utf-8 -*-
"""
Created on Thu Oct 8 20:44:27 2020

@author: theo
"""

import pandas as pd
import math


def customfunction(arg1):
    """Return ``arg1`` together with two values derived from it.

    Parameters
    ----------
    arg1 : number
        The seed value.

    Returns
    -------
    tuple
        ``(arg1, arg1 * arg1, arg1 * arg1 + arg1)``.
    """
    arg2 = arg1 * arg1
    arg3 = arg2 + arg1
    return arg1, arg2, arg3


# Minimal example frame: only the first row has a missing value (column 2).
dt = pd.DataFrame([[0, 5, math.nan], [-5, 2, 3], [3, -7, 4]])

# Row-by-row version from the question: whenever a row contains a NaN,
# recompute columns 1 and 2 from the value in column 0.
for index, row in dt.iterrows():
    if row.isnull().values.any():
        # Yes the row that contains the NaN values also has the arg1 value
        # I need to compute the rest values
        (_, arg2, arg3) = customfunction(arg1=row[0])
        dt.loc[index, 1] = arg2
        dt.loc[index, 2] = arg3

# Benchmark of three ways to fill the incomplete rows of `daily` from `hourly`.
# The timings printed by each run are kept as comments below the print call.

# Attempt 1: row-wise apply with np.where.
# Both branch arguments are evaluated before np.where is called, so
# getdaytempartures runs for every row, not only for rows containing NaN.
start_time = time.time()
daily1 = daily.apply(
    lambda x: pd.Series(
        np.where(
            any(x.isna()),
            (getdaytempartures(date=x[0], ht=hourly)),
            (x[0], x[1], x[2], x[3]),
        )
    ),
    axis=1,
)
print("--- %s seconds ---" % (time.time() - start_time))
# --- 252.25447249412537 seconds ---

# Attempt 2: explicit iterrows loop that only touches rows containing NaN.
start_time = time.time()
for index, row in daily.iterrows():
    if row.isnull().values.any():
        (_, tavg, tmin, tmax) = getdaytempartures(date=row['date'], ht=hourly)
        daily.loc[index, 'tavg'] = tavg
        daily.loc[index, 'tmin'] = tmin
        daily.loc[index, 'tmax'] = tmax
print("--- %s seconds ---" % (time.time() - start_time))
# --- 113.31336617469788 seconds ---

# Attempt 3: row-wise apply with the helper function cf.
start_time = time.time()
#for key in daily.keys():
daily3 = daily.apply(cf, ht=hourly, axis=1)
print("--- %s seconds ---" % (time.time() - start_time))
# --- 108.97707056999207 seconds ---

def cf(row, ht):
    """Fill ``tavg``/``tmin``/``tmax`` of one daily row from hourly readings.

    Intended for ``daily.apply(cf, ht=hourly, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        One row of the daily frame, with the labels ``date``, ``tavg``,
        ``tmin`` and ``tmax``.
    ht : pandas.DataFrame
        Hourly readings with at least the columns ``date`` and ``temp``.

    Returns
    -------
    pandas.Series
        The same ``row`` object. If it contained a missing value and hourly
        data exists for its date, the three statistics are recomputed from
        that data; otherwise the row is returned unchanged.
    """
    # Complete rows need no work.
    if not row.isnull().values.any():
        return row

    # Look the date up by label; positional ``row[0]`` on a label-indexed
    # Series is deprecated in recent pandas versions.
    readings = ht.loc[ht['date'] == row['date']].dropna()

    # Without hourly data there is nothing to compute from; keep whatever
    # values the row already has instead of overwriting them with NaN.
    if readings.empty:
        return row

    temps = readings['temp']
    row['tmax'] = temps.max()
    row['tmin'] = temps.min()
    row['tavg'] = temps.mean()
    return row

daily: ,date,tavg,tmin,tmax 0,1963-01-03,27.3,16.1,33.9 1,1963-01-04,27.3,16.1,33.9 2,1963-01-05,26.7,17.8,35.0 3,1963-01-06,26.7,17.8,33.9 4,1963-01-07,27.6,17.2,33.9 5,1963-01-08,26.9,17.8,33.9 6,1963-01-09,27.3,18.9,33.9 7,1963-01-10,26.8,20.0,35.0 8,1963-01-13,27.3,17.8,33.9 9,1963-01-14,27.2,17.8,33.9 10,1963-01-15,27.9,17.8,35.0 11,1963-01-16,27.5,17.8,35.0 12,1963-01-17,27.5,17.8,36.1 13,1963-01-18,27.6,17.8,33.9 14,1963-01-19,26.9,17.8,35.0 15,1963-01-20,27.3,18.9,35.0 16,1963-01-21,27.6,17.8,35.0 17,1963-01-22,26.0,17.8,35.0 18,1963-01-23,28.1,17.8,33.9 19,1963-01-24,27.6,18.9,32.8 20,1963-01-25,28.3,17.8,33.9 21,1963-01-26,28.1,17.8,35.0 22,1963-01-27,28.5,17.8,35.0 23,1963-01-28,27.7,17.8,36.1 24,1963-01-29,27.9,17.2,35.0 25,1963-01-30,28.1,17.2,37.2 26,1963-02-05,26.1,18.9,33.9 27,1963-02-11,29.2,17.8,33.9 28,1963-02-12,29.3,18.9,36.1 29,1963-02-13,29.7,18.9,36.1 hourly: ,date,time,temp 0,1957-07-01,0,25.0 1,1957-07-01,12,22.2 2,1957-07-01,18,27.2 3,1957-07-02,0,26.1 4,1957-07-02,12,22.2 5,1957-07-02,18,27.8 6,1957-07-03,0,26.1 7,1957-07-03,12,22.2 8,1957-07-03,18,28.9 9,1957-07-04,0,25.0 10,1957-07-04,12,22.2 11,1957-07-04,18,28.9 12,1957-07-05,0,25.0 13,1957-07-05,12,21.1 14,1957-07-05,18,25.0 15,1957-07-06,0,25.0 16,1957-07-06,12,20.0 17,1957-07-06,18,27.8 18,1957-07-07,0,25.0 19,1957-07-07,12,21.1 20,1957-07-07,18,27.8 21,1957-07-08,0,25.0 22,1957-07-08,12,21.1 23,1957-07-08,18,28.9 24,1957-07-09,0,23.9 25,1957-07-09,12,20.0 26,1957-07-09,18,25.0 27,1957-07-10,0,23.9 28,1957-07-10,12,17.8 29,1957-07-10,18,26.1 Hourly 1977-02-20: this is a 1 day example that I used to debug ,date,time,temp 36493,1977-02-20,0,27.0 36494,1977-02-20,1,26.0 36495,1977-02-20,2,26.0 36496,1977-02-20,3,26.0 36497,1977-02-20,11,23.0 36498,1977-02-20,12,23.0 36499,1977-02-20,13, 36500,1977-02-20,14,27.0 36501,1977-02-20,15,29.0 36502,1977-02-20,16, 36503,1977-02-20,17,30.0 36504,1977-02-20,18,32.0 36505,1977-02-20,19,33.0 36506,1977-02-20,20,33.0 36507,1977-02-20,21,32.0 
36508,1977-02-20,22,30.0 36509,1977-02-20,23,28.0 daily: ,date,tavg,tmin,tmax 3297,1977-02-20,28.3,,34.0

3条回答

网友

1楼 · 编辑于 2024-04-23 16:23:22

有两个主要问题导致操作缓慢：
- 第一个问题是逐行迭代，它总是比向量化函数慢
- 第二个问题是，每次迭代都需要计算min、max和mean
最好将'hourly'数据帧按'date'分组，然后为'temp'聚合min、mean和max，创建hg。
- hg可以通过 pandas.DataFrame.update 用来更新daily，但是两个数据帧的列名应该匹配。
  - 这是一个就地更新，因此不要分配更新（例如daily = daily.update(hg)不正确）
- overwrite=True将更新数据帧中的所有值，而不仅仅是NaN值。
  - 这就是为什么“仅更新NaN值”一节使用overwrite=False
  - 这就是为什么“如果存在NaN，则更新整行”一节先筛选出所有含NaN的行，再对这些行使用overwrite=True进行更新
所有的迭代都被删除了，所以应该更快
而且，在没有所有信息的情况下，解决问题从来都不容易

设置数据帧

import numpy as np  # the sample-data dicts use np.nan, so the alias is required
import pandas as pd

# create sample dataframes; this may use pd.read_csv or something else, as required
# (daily_data and hourly_data are the dicts listed in the sample-data section)
daily = pd.DataFrame(daily_data)
hourly = pd.DataFrame(hourly_data)

# convert date to a datetime type for both dataframes
daily['date'] = pd.to_datetime(daily['date'])
hourly['date'] = pd.to_datetime(hourly['date'])

# set date as the index (only for daily) and sort daily by it
daily = daily.set_index('date').sort_index()

# group hourly by date and aggregate temp; the result columns are named to
# match daily (mean -> tavg, min -> tmin, max -> tmax) so that
# DataFrame.update can align the two frames on index and column labels
hg = hourly.groupby('date')['temp'].agg(tavg='mean', tmin='min', tmax='max')

daily显示缺少的值

            tavg  tmin  tmax
date                        
1957-07-07  27.6  17.2  33.9
1957-07-08  25.0   NaN  30.0
1957-07-09  27.3  18.9  33.9
1957-08-05  26.1  18.9  33.9
1957-08-11  29.2  17.8  33.9
1957-08-12  29.3  18.9  36.1
1957-08-13  29.7  18.9  36.1
1977-02-20  28.3   NaN  34.0

hg用指标显示每日分组

                 tavg  tmin  tmax
date                             
1957-07-01  24.800000  22.2  27.2
1957-07-02  25.366667  22.2  27.8
1957-07-03  25.733333  22.2  28.9
1957-07-04  25.366667  22.2  28.9
1957-07-05  23.700000  21.1  25.0
1957-07-06  24.266667  20.0  27.8
1957-07-07  24.633333  21.1  27.8
1957-07-08  25.000000  21.1  28.9
1957-07-09  22.966667  20.0  25.0
1957-07-10  22.600000  17.8  26.1
1977-02-20  28.333333  23.0  33.0

仅更新`NaN`值

# In-place update of daily from hg that fills only the NaN cells; values
# already present in daily are kept, so the rest of each row is untouched.
# Do not assign the result back to daily -- the update happens in place.
daily.update(hg, overwrite=False)

# result of daily being updated
            tavg  tmin  tmax
date                        
1957-07-07  27.6  17.2  33.9
1957-07-08  25.0  21.1  30.0
1957-07-09  27.3  18.9  33.9
1957-08-05  26.1  18.9  33.9
1957-08-11  29.2  17.8  33.9
1957-08-12  29.3  18.9  36.1
1957-08-13  29.7  18.9  36.1
1977-02-20  28.3  23.0  34.0

如果存在`NaN`，则更新整行

# boolean mask: True for every row of daily that holds at least one NaN
rows_with_nan = daily.isna().any(axis=1)

# work on an independent copy of just those rows
daily_na = daily.loc[rows_with_nan].copy()

# replace the values in those rows with the aggregates from hg
daily_na.update(hg)

# write the refreshed rows back into daily (in place)
daily.update(daily_na)

# result of daily being updated
                 tavg  tmin  tmax
date                             
1957-07-07  27.600000  17.2  33.9
1957-07-08  25.000000  21.1  28.9
1957-07-09  27.300000  18.9  33.9
1957-08-05  26.100000  18.9  33.9
1957-08-11  29.200000  17.8  33.9
1957-08-12  29.300000  18.9  36.1
1957-08-13  29.700000  18.9  36.1
1977-02-20  28.333333  23.0  33.0

样本数据

daily_data = {'date': ['1957-07-03', '1957-07-04', '1957-07-05', '1957-07-06', '1957-07-07', '1957-07-11', '1957-07-09', '1957-07-10', '1957-07-13', '1957-07-14', '1957-07-15', '1957-07-16', '1957-07-17', '1957-07-18', '1957-07-19', '1957-07-20', '1957-07-21', '1957-07-22', '1957-07-23', '1957-07-24', '1957-07-25', '1957-07-26', '1957-07-27', '1957-07-28', '1957-07-29', '1957-07-30', '1957-08-05', '1957-08-11', '1957-08-12', '1957-08-13', '1977-02-20', '1957-07-08'],
              'tavg': [27.3, 27.3, 26.7, 26.7, 27.6, 26.9, 27.3, 26.8, 27.3, 27.2, 27.9, 27.5, 27.5, 27.6, 26.9, 27.3, 27.6, 26.0, 28.1, 27.6, 28.3, 28.1, 28.5, 27.7, 27.9, 28.1, 26.1, 29.2, 29.3, 29.7, 28.3, 25.0],
              'tmin': [16.1, 16.1, 17.8, 17.8, 17.2, 17.8, 18.9, 20.0, 17.8, 17.8, 17.8, 17.8, 17.8, 17.8, 17.8, 18.9, 17.8, 17.8, 17.8, 18.9, 17.8, 17.8, 17.8, 17.8, 17.2, 17.2, 18.9, 17.8, 18.9, 18.9, np.nan, np.nan],
              'tmax': [33.9, 33.9, 35.0, 33.9, 33.9, 33.9, 33.9, 35.0, 33.9, 33.9, 35.0, 35.0, 36.1, 33.9, 35.0, 35.0, 35.0, 35.0, 33.9, 32.8, 33.9, 35.0, 35.0, 36.1, 35.0, 37.2, 33.9, 33.9, 36.1, 36.1, 34.0, 30.0]}

hourly_data = {'date': ['1957-07-01', '1957-07-01', '1957-07-01', '1957-07-02', '1957-07-02', '1957-07-02', '1957-07-03', '1957-07-03', '1957-07-03', '1957-07-04', '1957-07-04', '1957-07-04', '1957-07-05', '1957-07-05', '1957-07-05', '1957-07-06', '1957-07-06', '1957-07-06', '1957-07-07', '1957-07-07', '1957-07-07', '1957-07-08', '1957-07-08', '1957-07-08', '1957-07-09', '1957-07-09', '1957-07-09', '1957-07-10', '1957-07-10', '1957-07-10', '1977-02-20', '1977-02-20', '1977-02-20', '1977-02-20', '1977-02-20', '1977-02-20', '1977-02-20', '1977-02-20', '1977-02-20', '1977-02-20', '1977-02-20', '1977-02-20', '1977-02-20', '1977-02-20', '1977-02-20', '1977-02-20', '1977-02-20'],
               'time': [0, 12, 18, 0, 12, 18, 0, 12, 18, 0, 12, 18, 0, 12, 18, 0, 12, 18, 0, 12, 18, 0, 12, 18, 0, 12, 18, 0, 12, 18, 0, 1, 2, 3, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
               'temp': [25.0, 22.2, 27.2, 26.1, 22.2, 27.8, 26.1, 22.2, 28.9, 25.0, 22.2, 28.9, 25.0, 21.1, 25.0, 25.0, 20.0, 27.8, 25.0, 21.1, 27.8, 25.0, 21.1, 28.9, 23.9, 20.0, 25.0, 23.9, 17.8, 26.1, 27.0, 26.0, 26.0, 26.0, 23.0, 23.0, np.nan, 27.0, 29.0, np.nan, 30.0, 32.0, 33.0, 33.0, 32.0, 30.0, 28.0]}

网友

2楼 · 编辑于 2024-04-23 16:23:22

假设您要在以下数据框中替换NAN：

df = pd.DataFrame({"col1": [None, np.nan, 1.0], "col2": [1,2,3]})

0   NaN     1
1   NaN     2
2   1.0     3

可以对整个列而不是行中的每个元素使用操作：

# Each assign receives the frame produced so far as df_, so the second step
# can read the col3 created by the first. Without the lambda, df_ is an
# undefined name and the expression raises NameError.
df.assign(
    # rows containing any NaN get col2 squared; the others keep col1
    col3=lambda df_: np.where(
        df_.isnull().any(axis=1), df_.col2 * df_.col2, df_.col1
    ),
).assign(
    # col3 contains no NaN, so the mask still marks the same rows as above
    col4=lambda df_: np.where(
        df_.isnull().any(axis=1), df_.col3 + df_.col2, df_.col1
    ),
)

它给你：

0   NaN     1   1.0   2.0
1   NaN     2   4.0   6.0
2   1.0     3   1.0   1.0

col3和col4相当于示例中的arg2和arg3

df_.isnull().any(axis=1)会返回一个布尔掩码，标记出所有至少包含一个NaN的行

网友

3楼 · 编辑于 2024-04-23 16:23:22

import numpy as np
import pandas as pd

a = np.arange(0, 6, dtype='int32')
b = np.arange(0, 6, dtype='int32') ** 2

df = pd.DataFrame({'a': a, 'b': b})

# An integer column cannot hold NaN, so convert 'a' to float before blanking
# two entries. `.at` is a scalar accessor; selecting several rows needs `.loc`.
df['a'] = df['a'].astype('float64')
df.loc[[0, 4], 'a'] = np.nan

df

# Fill the missing entries of 'a' with twice the matching value of 'b'.
# A single .loc assignment writes to df itself, so there is no chained
# assignment and no need to silence the chained-assignment warning.
is_nan = df['a'].isna()
df.loc[is_nan, 'a'] = 2 * df.loc[is_nan, 'b']

df

另一个解决方案

# One-step alternative: replace only the missing entries of 'a' with twice
# the value of 'b'; entries of 'a' that are already present are kept.
df['a'] = df['a'].fillna(2 * df['b'])

df

设置数据帧

仅更新`NaN`值

如果存在`NaN`，则更新整行

样本数据

另一个解决方案

相关问题更多 >

编程相关推荐

热门问题

热门文章