我有一个370864*493的数据集,我想在数据集的尾部添加一个新行。我试过了数据帧.loc[370864]=.... 以及数据框.append(). 两种方法都不如我的好期待,但是同样的代码可以在更小的数据集上工作,只有20000行。我希望知道原因。数据集的大小是1.6G,我使用pandas,我的IDE是spyder。图中显示了更多细节。数据源是UCSC癌症浏览器,LUAD甲基化数据。The tail of the dataframe
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import MinMaxScaler
"""
get clinical information and count number of M0 and M1
"""
def get_Metastasis(sampleID_list,df_clinical):
num_M0=0
num_M1=0
list_Metastasis=[]
list_Metastasis.append('Metastasis')
for ID in sampleID_list:
row_number=df_clinical.loc[df_clinical.sampleID==ID].index[0]
list_Metastasis.append(df_clinical.loc[row_number,'pathologic_M'])
for i in range(1,len(list_Metastasis)):
if list_Metastasis[i]!="M0" and isinstance(list_Metastasis[i],str):
list_Metastasis[i]="M1"
num_M1+=1
list_Metastasis[i]=1
elif list_Metastasis[i]=="M0":
num_M0+=1
list_Metastasis[i]=0
# else:
# list_Metastasis[i]=None
return list_Metastasis, num_M1, num_M0
"""
read Data
"""
path_for_clinical_data="clinical_data"
path_for_genomicMatrix="genomicMatrix"
df_clinical = pd.read_table(path_for_clinical_data)
df_genomicMatrix = pd.read_table(path_for_genomicMatrix)
df_genomicMatrix=df_genomicMatrix.dropna(axis=0) ##get rid of row include nan
"""
Add metastasis information
"""
sampleID_list=list(df_genomicMatrix.columns.values)
sampleID_list=sampleID_list[1:]
list_M=[]
list_M,num_M1,num_M0=get_Metastasis(sampleID_list,df_clinical)
df_genomicMatrix.loc[len(df_genomicMatrix)]=list_M ## Here is the problem.
结果如下:
sample TCGA-44-4112-01 TCGA-NJ-A4YP-01 TCGA-86-8278-01 \
485566 cg15678817 0.02110 0.0961 -0.1652
485567 cg14483317 -0.41520 -0.4051 -0.4117
485573 cg10230711 -0.42750 -0.3067 -0.4182
485574 cg16651827 0.22345 0.2358 0.2007
485576 cg07883722 0.36660 0.3932 0.4155
目前没有回答
相关问题 更多 >
编程相关推荐