为“for”循环定义函数以在Python中输出数据帧

2024-04-16 09:02:45 发布

您现在位置:Python中文网/ 问答频道 /正文

我试图定义一个函数calc来执行for循环。这个for循环使用几个定义的函数来附加为数据帧计算的结果:weightedMeansd_pooledsummation。最后,预期结果将输出到output2。没有为for循环定义函数的代码运行良好。但是,我的问题是,在执行calc之后,output2是空的,没有抱怨。因此,我无法进行相应的故障排除。完整代码如下:

import pandas as pd
import numpy as np
from dplython import X, sift, DplyFrame, mutate, select
from plydata import define, group_by, summarize

def weightedMean(data):
        length = len(data['Var1'])
        if length == 1:
                mx = data['Var1']
                return(length)
        else:
                mx = data['Var1'][0]
                nx = data['Var3'][0]
                for i in range(1,length):
                        my = data['Var1'][i]
                        ny = data['Var3'][i]
                        nx = nx + ny
                        mx=(mx*nx+my*ny)/(nx+ny)
                return(mx)

def summation(data):
        length = len(data['Var3'])
        cx = data['Var3'][0]
        for i in range(1,length):
                cy = data['Var3'][i]
                cx = cx + cy
        return(cx)

def sd_c(x_m, x_s, x_n, y_m, y_s, y_n):
        al = x_n+y_n
        tmp_sd = al*((x_n-1)*(x_s*x_s)+(y_n-1)*(y_s*y_s))+y_n*x_n*(x_m-y_m)*(x_m-y_m)
        var = tmp_sd/(al*(al-1))
        std = np.sqrt(var)
        return(std)

def sd_pooled(data):
        length = len(data['Var1'])
        if length == 1:
                mx = data['Var1']
                return(length)
        else:
                mx = data['Var1'][0]
                sx = data['Var2'][0]
                nx = data['Var3'][0]
                for i in range(1,length):
                        my = data['Var1'][i]
                        sy = data['Var2'][i]
                        ny = data['Var3'][i]
                        sx = sd_c(mx, sx, nx, my, sy, ny)
                        nx = nx + ny
                        mx = (mx*nx + my*ny)/(nx + ny)
                return(sx)

dat = pd.read_csv("input.txt",sep="\t")

dat = {
'Group': ['A','A','A','A','A','A','A','A','A','A'],
'Process': [3,3,3,3,3,3,841,841,841,841],
'Category': ['cat1','cat1','cat1','cat1','cat1','cat1','cat2','cat2','cat2','cat2'],'Type': ['type1','type1','type1','type1','type1','type1','type2','type2','type2','type2'],
'Var1': [86.84,103.39,109.00,107.30,123.09,111.98,87.62,87.40,88.53,85.84],
'Var2': [2.913,2.835,1.478,2.979,2.424,7.462,3.049,4.781,3.025,2.703],
'Var3': [0.01096,0.00564,0.00365,0.00631,0.00531,0.00332,0.01195,0.00930,0.00697,0.00697]
}
dat = pd.DataFrame(dat)

dat_name = dat.loc[:,'Type'].unique()
dat = DplyFrame(dat)
output = pd.DataFrame([])
def calc(dat_name, dat, output):
        out = pd.DataFrame([])
        for i in range(len(dat_name)):
                df = (dat >>
                        sift(X.Type == dat_name[i]) >>
                        mutate(Var3 = X.Var3*3021) >>
                        sift(X.Var2 < 50))
                out = out.append(df)
                out_grouped = out.groupby(['Group', 'Type', 'Process', 'Category'])
                init = []
                mean = []
                stdv = []
                freq = []
                kmer = []
                for name, group in out_grouped:
                        group = pd.DataFrame(group).reset_index()
                        nm = name
                        wm = weightedMean(group)
                        sd = sd_pooled(group)
                        fq = summation(group)
                        init.append(nm)
                        mean.append(wm)
                        freq.append(fq)
                        stdv.append(sd)
                init = pd.DataFrame(init)
                mean = pd.DataFrame(mean)
                freq = pd.DataFrame(freq)
                stdv = pd.DataFrame(stdv)
                init.rename(columns={0:'Group',1:'Type',2:'Process',3:'Category'}, inplace=True)
                mean.rename(columns={0:'Var1'}, inplace=True)
                stdv.rename(columns={0:'Var2'}, inplace=True)
                freq.rename(columns={0:'Var3'}, inplace=True)
                combined = pd.concat([init.reset_index(drop=True), mean, stdv, freq], axis=1)
                output = output.append(combined)

output2 = calc(dat_name, dat, output)

预期输出如下:

  Group   Type  Process Category        Var1       Var2       Var3
0     A  type1        3     cat1  101.207332  13.997181  106.30899
1     A  type2      841     cat2   87.431341   3.584393  106.30899

在这种情况下,我想知道如何才能使calc成功地工作。非常感谢。你知道吗


Tags: dataframefordatagroupsdlengthdatpd