合并具有相同id的csv行并对列进行处理

2条回答

网友

1楼 · 编辑于 2024-05-26 11:09:33

在这里，数据帧的旋转可能会有所帮助：

tmp = df.pivot(index='id', columns='level3', values='value')
tmp['value'] = tmp['D7 visits: N performed']/tmp['D7 visits: N expected']
tmp['level3'] = 'D7 visits: N performed/expected %'

resul = df[['index', 'level1', 'level2', 'strate', 'id']].drop_duplicates(
    'id').merge(tmp[['level3', 'value']], left_on='id',right_index=True).reindex(
        columns=['index', 'level1', 'level2', 'level3', 'strate', 'value', 'id'])

resul.to_csv('end.csv', sep=';')

应生成预期的csv文件

网友

2楼 · 编辑于 2024-05-26 11:09:33

如果您想直接查看数据工作，请参阅“从1开始”部分。作为一个比我回答得早的人，您应该使用reforme-wide（在python:pivot中），但在我的例子中，我将数据本身进行了透视，并将其放入新的变量中（new=pd.pivot(data,index=['id','level1','strate','level2'],values=['level3','value'],columns='no')）

（*另外：在您输入整个示例数据后，我只需编辑一点代码.）

#00.module

import pandas as pd
import numpy as np

# 0. data

d=[['', '1', 'Recruitement', '', 'Recruitement start date', 'Total', '2021-07-01'],
['', '1', 'Recruitement', '', 'Recruitement start date', 'Ivory Coast', '2021-07-01'],
['', '1', 'Recruitement', '', 'Recruitement start date', 'Madagascar', '2021-07-01'],
['', '1', 'Recruitement', '', 'Recruitement start date', 'South Africa', '2021-07-01'],
['', '2', 'Recruitement', '', 'Sex (Woman), %', 'Total', '47.0'],
['', '2', 'Recruitement', '', 'Sex (Woman), %', 'Ivory Coast', '40.0'],
['', '2', 'Recruitement', '', 'Sex (Woman), %', 'Madagascar', '40.0'],
['', '2', 'Recruitement', '', 'Sex (Woman), %', 'South Africa', '60.0'],
['5-Total', '3', 'Follow up', '', 'D7 visits: N performed', 'Total', '4'],
['5-Ivory Coast', '3', 'Follow up', '', 'D7 visits: N performed', 'Ivory Coast', '1'],
['5-Madagascar', '3', 'Follow up', '', 'D7 visits: N performed', 'Madagascar', '2'],
['5-South Africa', '3', 'Follow up', '', 'D7 visits: N performed', 'South Africa', '1'],
['5-Total', '4', 'Follow up', '', 'D7 visits: N expected', 'Total', '12'],
['5-Ivory Coast', '4', 'Follow up', '', 'D7 visits: N expected', 'Ivory Coast', '4'],
['5-Madagascar', '4', 'Follow up', '', 'D7 visits: N expected', 'Madagascar', '4'],
['5-South Africa', '4', 'Follow up', '', 'D7 visits: N expected', 'South Africa', '4'],
['', '6', 'Recruitement', '', 'Age, median', 'Total', '35.0'],
['', '6', 'Recruitement', '', 'Age, median', 'Ivory Coast', '35.0'],
['', '6', 'Recruitement', '', 'Age, median', 'Madagascar', '31.0'],
['', '6', 'Recruitement', '', 'Age, median', 'South Africa', '36.0'],
['', '7', 'Recruitement', '', 'Age, IQR', 'Total', '(26.5-48.5)'],
['', '7', 'Recruitement', '', 'Age, IQR', 'Ivory Coast', '(26.0-48.0)'],
['', '7', 'Recruitement', '', 'Age, IQR', 'Madagascar', '(26.0-49.0)'],
['', '7', 'Recruitement', '', 'Age, IQR', 'South Africa', '(28.0-40.0)'],
['', '8', 'Recruitement', '', 'Randomized all, N', 'Total', '15'],
['', '8', 'Recruitement', '', 'Randomized all, N', 'Ivory Coast', '5'],
['', '8', 'Recruitement', '', 'Randomized all, N', 'Madagascar', '5'],
['', '8', 'Recruitement', '', 'Randomized all, N', 'South Africa', '5'],
['15-Total', '9', 'Recruitement', '', 'Included severity stage 1, N', 'Total', '1'],
['15-South Africa', '9', 'Recruitement', '', 'Included severity stage 1, N', 'South Africa', '1'],
['16-Total', '10', 'Recruitement', '', 'Included severity stage 2, N', 'Total', '8'],
['16-Ivory Coast', '10', 'Recruitement', '', 'Included severity stage 2, N', 'Ivory Coast', '4'],
['16-Madagascar', '10', 'Recruitement', '', 'Included severity stage 2, N', 'Madagascar', '1'],
['16-South Africa', '10', 'Recruitement', '', 'Included severity stage 2, N', 'South Africa', '3'],
['17-Total', '11', 'Recruitement', '', 'Included severity stage 3, N', 'Total', '6'],
['17-Ivory Coast', '11', 'Recruitement', '', 'Included severity stage 3, N', 'Ivory Coast', '1'],
['17-Madagascar', '11', 'Recruitement', '', 'Included severity stage 3, N', 'Madagascar', '4'],
['17-South Africa', '11', 'Recruitement', '', 'Included severity stage 3, N', 'South Africa', '1'],
['15-Total', '12', 'Recruitement', '', 'Included severity stage 1, %', 'Total', '7.0'],
['15-Ivory Coast', '12', 'Recruitement', '', 'Included severity stage 1, %', 'Ivory Coast', 'np.nan'],
['15-Madagascar', '12', 'Recruitement', '', 'Included severity stage 1, %', 'Madagascar', 'np.nan'],
['15-South Africa', '12', 'Recruitement', '', 'Included severity stage 1, %', 'South Africa', '20.0'],
['16-Total', '13', 'Recruitement', '', 'Included severity stage 2, %', 'Total', '53.0'],
['16-Ivory Coast', '13', 'Recruitement', '', 'Included severity stage 2, %', 'Ivory Coast', '80.0'],
['16-Madagascar', '13', 'Recruitement', '', 'Included severity stage 2, %', 'Madagascar', '20.0'],
['16-South Africa', '13', 'Recruitement', '', 'Included severity stage 2, %', 'South Africa', '60.0'],
['17-Total', '14', 'Recruitement', '', 'Included severity stage 3, %', 'Total', '40.0'],
['17-Ivory Coast', '14', 'Recruitement', '', 'Included severity stage 3, %', 'Ivory Coast', '20.0'],
['17-Madagascar', '14', 'Recruitement', '', 'Included severity stage 3, %', 'Madagascar', '80.0'],
['17-South Africa', '14', 'Recruitement', '', 'Included severity stage 3, %', 'South Africa', '20.0']]


data=pd.DataFrame(d,columns=['id', 'index', 'level1', 'level2', 'level3', 'strate', 'value'])

data=data.applymap(lambda x: np.nan if (x=='np.nan') or (x=='') else x)
print(data)
backup=data

# 1.1 make natural number by ['id','level1','strate']

data=data.loc[~(data['id'].isna()),:]
data['value']=data.value.astype('float')
data['concat']=data['id']+data['level1']+data['strate']

for i in data['concat'].unique():

    print(i)
    data.loc[data['concat']==i,'no']=list(range(1,data.loc[data['concat']==i,:].shape[0]+1))

print(data)

# 1.2 reshape wide(in python: pivot) 

new=pd.pivot(data,index=['id','level1','strate','level2'],values=['level3','value'],columns='no')

new.columns=new.columns.map(lambda x: f'{x[0]}_{x[1]}')
new=new.reset_index()
print(new)

new[['level3_1.0', 'level3_2.0', 'value_1.0', 'value_2.0']]

# 1.3 generate value and level3 columns and drop ['level3_1.0', 'level3_2.0', 'value_1.0', 'value_2.0'] columns

new['value']=new['value_1.0']/new['value_2.0']
new['level3']='D7 visits: N performed/expected %'
new=new.drop(columns=new.filter(regex='.0$'))
print(new)

# 1.4 append

backup.loc[(backup['id'].isna()),:].append(new,ignore_index=True)

我还在Stata中添加了相同的工作。处理数据会容易得多

clear
edit // open data editor(editing mode)

* 0) make a dataset

input str30(id index level1 level2 level3 strate value)
"" "1" "Recruitement" . "Recruitement start date" "Total" "2021-07-01"
"" "1" "Recruitement" . "Recruitement start date" "Ivory Coast" "2021-07-01"
"" "1" "Recruitement" . "Recruitement start date" "Madagascar" "2021-07-01"
"" "1" "Recruitement" . "Recruitement start date" "South Africa" "2021-07-01"
"" "2" "Recruitement" . "Sex (Woman), %" "Total" "47.0"
"" "2" "Recruitement" . "Sex (Woman), %" "Ivory Coast" "40.0"
"" "2" "Recruitement" . "Sex (Woman), %" "Madagascar" "40.0"
"" "2" "Recruitement" . "Sex (Woman), %" "South Africa" "60.0"
"5-Total" "3" "Follow up" . "D7 visits: N performed" "Total" "4"
"5-Ivory Coast" "3" "Follow up" . "D7 visits: N performed" "Ivory Coast" "1"
"5-Madagascar" "3" "Follow up" . "D7 visits: N performed" "Madagascar" "2"
"5-South Africa" "3" "Follow up" . "D7 visits: N performed" "South Africa" "1"
"5-Total" "4" "Follow up" . "D7 visits: N expected" "Total" "12"
"5-Ivory Coast" "4" "Follow up" . "D7 visits: N expected" "Ivory Coast" "4"
"5-Madagascar" "4" "Follow up" . "D7 visits: N expected" "Madagascar" "4"
"5-South Africa" "4" "Follow up" . "D7 visits: N expected" "South Africa" "4"
"" "6" "Recruitement" . "Age, median" "Total" "35.0"
"" "6" "Recruitement" . "Age, median" "Ivory Coast" "35.0"
"" "6" "Recruitement" . "Age, median" "Madagascar" "31.0"
"" "6" "Recruitement" . "Age, median" "South Africa" "36.0"
"" "7" "Recruitement" . "Age, IQR" "Total" "(26.5-48.5)"
"" "7" "Recruitement" . "Age, IQR" "Ivory Coast" "(26.0-48.0)"
"" "7" "Recruitement" . "Age, IQR" "Madagascar" "(26.0-49.0)"
"" "7" "Recruitement" . "Age, IQR" "South Africa" "(28.0-40.0)"
"" "8" "Recruitement" . "Randomized all, N" "Total" "15"
"" "8" "Recruitement" . "Randomized all, N" "Ivory Coast" "5"
"" "8" "Recruitement" . "Randomized all, N" "Madagascar" "5"
"" "8" "Recruitement" . "Randomized all, N" "South Africa" "5"
"15-Total" "9" "Recruitement" . "Included severity stage 1, N" "Total" "1"
"15-South Africa" "9" "Recruitement" . "Included severity stage 1, N" "South Africa" "1"
"16-Total" "10" "Recruitement" . "Included severity stage 2, N" "Total" "8"
"16-Ivory Coast" "10" "Recruitement" . "Included severity stage 2, N" "Ivory Coast" "4"
"16-Madagascar" "10" "Recruitement" . "Included severity stage 2, N" "Madagascar" "1"
"16-South Africa" "10" "Recruitement" . "Included severity stage 2, N" "South Africa" "3"
"17-Total" "11" "Recruitement" . "Included severity stage 3, N" "Total" "6"
"17-Ivory Coast" "11" "Recruitement" . "Included severity stage 3, N" "Ivory Coast" "1"
"17-Madagascar" "11" "Recruitement" . "Included severity stage 3, N" "Madagascar" "4"
"17-South Africa" "11" "Recruitement" . "Included severity stage 3, N" "South Africa" "1"
"15-Total" "12" "Recruitement" . "Included severity stage 1, %" "Total" "7.0"
"15-Ivory Coast" "12" "Recruitement" . "Included severity stage 1, %" "Ivory Coast" "nan"
"15-Madagascar" "12" "Recruitement" . "Included severity stage 1, %" "Madagascar" "nan"
"15-South Africa" "12" "Recruitement" . "Included severity stage 1, %" "South Africa" "20.0"
"16-Total" "13" "Recruitement" . "Included severity stage 2, %" "Total" "53.0"
"16-Ivory Coast" "13" "Recruitement" . "Included severity stage 2, %" "Ivory Coast" "80.0"
"16-Madagascar" "13" "Recruitement" . "Included severity stage 2, %" "Madagascar" "20.0"
"16-South Africa" "13" "Recruitement" . "Included severity stage 2, %" "South Africa" "60.0"
"17-Total" "14" "Recruitement" . "Included severity stage 3, %" "Total" "40.0"
"17-Ivory Coast" "14" "Recruitement" . "Included severity stage 3, %" "Ivory Coast" "20.0"
"17-Madagascar" "14" "Recruitement" . "Included severity stage 3, %" "Madagascar" "80.0"
"17-South Africa" "14" "Recruitement" . "Included severity stage 3, %" "South Africa" "20.0"
end
compress
replace value="" if value=="nan"
sa data,replace


* 1) reshape wide

use data,clear

drop if missing(id)
destring value,replace
bysort level1 level2 strate id : gen no=_n // make no colums of which value values are natural numbers by['level1', 'level2','strate','id']
reshape wide level3 value index ,i(id) j(no) // pd.pivot, so easy!

* 2) value and level3

gen value=value1/value2
drop value? // so easy!

gen level3="D7 visits: N performed/expected %"
drop level3? // so easy!

order id level* strate value // so easy!(order columns)

* 3) append 

drop index*
tostring value,replace force
replace value="" if value=="."
sa temp,replace

use data,clear
keep if missing(id)

append using temp.dta

相关问题更多 >

编程相关推荐

热门问题

热门文章

合并具有相同id的csv行并对列进行处理

相关问题 更多 >

编程相关推荐

热门问题

热门文章

相关问题更多 >