import itertools as it

import pandas as pd

try:
    from pandas.testing import assert_frame_equal
except ImportError:  # very old pandas only ships the legacy testing module
    from pandas.util.testing import assert_frame_equal
def powerset(iterable):
    """Return an iterator over every subset of `iterable`, each as a tuple.

    Subsets come out smallest first, and within one size in the order
    produced by `itertools.combinations`:

    powerset([1,2,3]) > () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    # Materialise the input once so that one-shot iterators can be reused
    # for every subset size.
    pool = list(iterable)
    per_size = [it.combinations(pool, size) for size in range(len(pool) + 1)]
    return it.chain(*per_size)
def _nonredundant_groupings(df, grpby):
    ''' yields, as lists, the combinations of `grpby` columns worth aggregating on '''
    # A column holding a single value adds no detail: grouping with or without
    # it produces the same aggregates. Only combinations that contain EVERY
    # such column are kept, so each distinct level of aggregation appears once.
    constant = {col for col in grpby if len(df[col].unique()) == 1}
    # Combinations smaller than the number of constant columns cannot contain
    # all of them, so start at that size.
    for size in range(len(constant), len(grpby) + 1):
        for combo in it.combinations(grpby, size):
            if constant.issubset(combo):
                yield list(combo)


def grouper(df, grpby, aggfunc):
    ''' produces aggregate DataFrame from DataFrames for non-redundant groupings

    `df`      : source DataFrame; it is only read, never modified.
    `grpby`   : sequence of column labels of `df` to group by.
    `aggfunc` : callable that receives the groupby object for one grouping
                and returns the aggregated DataFrame indexed by the group keys.

    Returns a DataFrame with the `grpby` columns first, followed by the
    columns produced by `aggfunc`. Rows aggregated over a column carry the
    placeholder '(All)' in that column. Rows are sorted by all columns and
    the index is reset to 0..n-1.
    '''
    grpby = list(grpby)  # accept any sequence (e.g. a tuple), not only lists
    finalcols = None
    pieces = []
    for grouping in _nonredundant_groupings(df, grpby):
        if grouping:
            # Pass a list: pandas treats a tuple as ONE (tuple-valued) label.
            tmp = aggfunc(df.groupby(grouping)).reset_index()
        else:
            # Grand total: group every row under one constant key so that
            # `aggfunc` still receives a groupby object; the key itself is
            # meaningless and is dropped from the result.
            single_group = pd.Series('', index=df.index)
            tmp = aggfunc(df.groupby(single_group)).reset_index(drop=True)
        for col in grpby:
            if col not in tmp:  # column was aggregated away at this level
                tmp[col] = '(All)'
        # Column order is fixed once, from the first aggregate produced.
        if finalcols is None:
            finalcols = grpby + [col for col in tmp if col not in grpby]
        pieces.append(tmp[finalcols])
    # One concat at the end instead of re-copying the result every iteration.
    final = pd.concat(pieces)
    final = final.sort_values(finalcols)
    return final.reset_index(drop=True)
def agg(grpbyobj):
    ''' aggregation recipe applied to one groupby object.

    Edit this function to choose which aggregate operation(s) are computed
    and what the resulting column(s) are called in the final DataFrame.
    As written it sums the 'Total' column of each group and returns the
    sums as a one-column DataFrame named 'Total (n)', indexed by group key.
    '''
    totals = grpbyobj['Total'].sum()
    return totals.to_frame('Total (n)')
if __name__ == '__main__':
    # Smoke test: aggregate a tiny frame and compare with the known result.
    df = pd.DataFrame({
        'Area': ['a', 'a', 'b'],
        'Year': [2014, 2014, 2014],
        'Month': [1, 2, 3],
        'Total': [4, 5, 6],
    })
    final = grouper(df, grpby=['Area', 'Year'], aggfunc=agg)

    # 'Year' holds a single value, so the only levels of aggregation are
    # "all areas" and "per area"; 'Year' never shows the '(All)' placeholder.
    expected = pd.DataFrame(
        {
            'Area': ['(All)', 'a', 'b'],
            'Year': [2014, 2014, 2014],
            'Total (n)': [15, 9, 6],
        },
        columns=['Area', 'Year', 'Total (n)'],
    )

    # check_names=True also compares the names of the index and columns.
    # A mismatch raises AssertionError, which is left to propagate so the
    # script exits with a traceback.
    assert_frame_equal(final, expected, check_names=True)
您可以使用 `.loc` 索引器来实现。在这种情况下，您可能应该创建一个 Series 来跟踪摘要统计信息；如果需要，可以再用 `concat` 将其与原数据合并以便显示。
为此，我创建了一个聚合工具，其行为类似于 SQL 中的 `GROUP BY CUBE`：提供分组所依据的列以及聚合函数，即可得到聚合后的 DataFrame。
相关问题 更多 >
编程相关推荐