从多个分类变量之间具有偶数表示的数据帧生成列表

import pandas as pd import numpy as np test = pd.DataFrame({'A' : ['alice', 'bob', 'george', 'michael', 'john', 'peter', 'paul', 'mary'], 'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C' : ['dog', 'cat', 'dog', 'cat', 'dog', 'cat', 'dog', 'cat'], 'D' : ['boy', 'girl', 'boy', 'girl', 'boy', 'girl', 'boy', 'girl']}) gr1, gr2, gr3 = [], [], [] gr1_names = [] def test_check1(x): #this is where I'm clearly not approaching this problem correctly for index, row in x.iterrows(): if row['A'] not in gr1 and row['B'] not in gr1 and row['C'] not in gr1 and row['D'] not in gr1: gr1.extend(row) # keep a record of what names are in what groups gr1_names.append(row['A']) #save the name

1条回答

网友

1楼 · 发布于 2024-04-20 01:41:10

我的同事找到了一个解决办法，我认为这个办法也能更好地解释这个问题。你知道吗

import pandas as pd
import random
import math
import itertools

def n_per_group(n, n_groups):
    """find the size of each group when splitting n people into n_groups"""
    n_per_group = math.floor(n/n_groups)
    rem = n % n_per_group
    return [n_per_group if k<rem else n_per_group + 1 for k in range(n_groups)]

def assign_groups(n, n_groups):
    """split the n people in n_groups pretty evenly, and randomize"""
    n_per = n_per_group(n ,n_groups)
    groups = list(itertools.chain(*[i[0]*[i[1]] for i in zip(n_per,list(range(n_groups)))]))
    random.shuffle(groups)
    return groups

def group_diff(df, g1, g2):
    """calculate the between group score difference"""
    a = df.loc[df['group']==g1, ~df.columns.isin(('A','group'))].sum()
    b = df.loc[df['group']==g2, ~df.columns.isin(('A','group'))].sum()
    #print(a)
    return abs(a-b).sum()

def swap_groups(df, row1, row2):
    """swap the groups of the people in row1 and row2"""
    r1group = df.loc[row1,'group']
    r2group = df.loc[row2,'group']
    df.loc[row2,'group'] = r1group
    df.loc[row1,'group'] = r2group
    return df

def row_to_group(df, row):
    """get the group associated to a given row"""
    return df.loc[row,'group']

def swap_and_score(df, row1, row2):
    """
    given two rows, calculate the between group scores
    originally, and if we swap rows. If the score difference
    is minimized by swapping, return the swapped df, otherwise
    return the orignal (swap back)
    """
    #orig = df
    g1 = row_to_group(df,row1)
    g2 = row_to_group(df,row2)
    s1 = group_diff(df,g1,g2)
    df = swap_groups(df, row1, row2)
    s2 = group_diff(df,g1,g2)
    #print(s1,s2)
    if s1>s2:
        #print('swap')
        return df
    else:
        return swap_groups(df, row1, row2)

def pairwise_scores(df):
    d = []
    for i in range(n_groups):
        for j in range(i+1,n_groups):
            d.append(group_diff(df,i,j))
    return d

# one hot encode and copy
df_dum = pd.get_dummies(df, columns=['B', 'C', 'D']).copy(deep=True)

#drop extra cols as needed

groups = assign_groups(n, n_groups)
df_dum['group'] = groups

# iterate
for _ in range(5000):
    rows = random.choices(list(range(n)),k=2)
    #print(rows)
    df_dum = swap_and_score(df_dum,rows[0],rows[1])
    #print(pairwise_scores(df))

print(pairwise_scores(df_dum))

df['group'] = df_dum.group
df['orig_groups'] = groups

for i in range(n_groups):
        for j in range(i+1,n_groups):
            a = df_dum.loc[df_dum['group']==3, ~df_dum.columns.isin(('A','group'))].sum()
            b = df_dum.loc[df_dum['group']==0, ~df_dum.columns.isin(('A','group'))].sum()
            print(a-b)

我将改变问题本身，以便更好地解释需要什么，因为我认为我第一次没有特别好地解释最终目标。你知道吗

相关问题更多 >

编程相关推荐

热门问题

热门文章