基于两个数据库中的列值合并两个数据帧,其中列名称未知

2024-04-27 00:36:54 发布

您现在位置:Python中文网/ 问答频道 /正文

这是我的密码。基本上,我试图根据名为“PolicyNum”的列(它必须是一个变量,因此我不能使用df.PolicyNum.isin方法)来计算公共行

虽然您可以建议一种更好的方法,但我也想知道为什么两个数据帧(打印在第11行和第12行)的长度会不同

import pandas as pd

def getPreparedDataForComparison( baseDataFrame, secondaryDataFrame, sourceName, indexKey):
    baseDataFrameCommon = baseDataFrame[(baseDataFrame[indexKey].isin(secondaryDataFrame[indexKey]) == False)]
    baseDataFrameCommon['COMBO'] = baseDataFrameCommon.apply(lambda x :','.join(x.astype(str)),axis=1)
    baseDataFrameCommon['DataSource'] = sourceName
    return baseDataFrameCommon

def compareDataFrames(dataframe1, dataframe2, indexKey):
    dataframe1Common = getPreparedDataForComparison(dataframe1, dataframe2, 'TXT', indexKey)
    dataframe2Common = getPreparedDataForComparison(dataframe2, dataframe1, 'SQL', indexKey)
    print(len(dataframe1Common))
    print(len(dataframe2Common))

def sampleData1():
    cols = ['PolicyNum','firsttransactiondate','subsequentbonustotalcumulative','subsequentpremiumtotalcumulative','totalautocumulative','totalautoposteffectivedatecumulative','totalpartialcumulative','totalpartialposteffectivedatecumulative']
    sourceData  = [ ('E001','#1985-01-01#',100,100,100,100,100,100),
                    ('E002','#1985-01-01#',200,200,200,200,200,200),
                    ('E003','#1985-01-01#',100,100,100,100,100,100),
                    ('E004','#1985-01-01#',100,100,100,100,100,100),
                    ('E005','#1985-01-01#',100,100,100,100,100,100),
                    ('E201','#1985-01-01#',100,100,100,100,100,100),
                    ('E202','#1985-01-01#',100,100,100,100,100,100),
                    ('1006','#1985-01-01#',100,100,100,100,100,100),
                    ('1007','#1985-01-01#',100,100,100,100,100,100),
                    ('1008','#1985-01-01#',100,100,100,100,100,100),]
    x = pd.DataFrame(sourceData,columns=cols)
    return x

def sampleData2():
    cols = ['PolicyNum','firsttransactiondate','subsequentbonustotalcumulative','subsequentpremiumtotalcumulative','totalautocumulative','totalautoposteffectivedatecumulative','totalpartialcumulative','totalpartialposteffectivedatecumulative']
    sourceData  = [ ('E001','#1985-01-01#',100,300,100,100,100,400),
                    ('E002','#1985-01-01#',200,200,200,200,200,200),
                    ('E005','#1989-01-01#',100,100,100,100,100,100),
                    ('E105','#1989-01-01#',100,100,100,100,100,100),
                    ('1106','#1985-01-01#',100,100,100,100,900,100),
                    ('1006','#1985-01-01#',100,100,100,100,900,100),
                    ('1007','#1985-01-01#',100,100,100,100,100,100),]
    x = pd.DataFrame(sourceData,columns=cols)
    return x

compareDataFrames(sampleData1(),sampleData2(),'PolicyNum')

Tags: 方法returndefpdcolsisindataframe1indexkey
1条回答
网友
1楼 · 发布于 2024-04-27 00:36:54

这不是我认为最好的方式,但这就是我最终要做的

熊猫迷们可以挖一挖

def getPreparedDataForComparison( baseDataFrame, secondaryDataFrame, sourceName, indexKey):
    baseDataFrame['DataSource'] = sourceName
    baseDataFrame['Common'] = np.where((baseDataFrame[indexKey].isin( list(secondaryDataFrame[indexKey]))),True,False)
    baseDataFrameCommon = baseDataFrame.loc[baseDataFrame['Common'] == True]
    baseDataFrameCommon.drop(['Common'],axis=1,inplace=True)
    baseDataFrameCommon['COMBO'] = baseDataFrameCommon.apply(lambda x :','.join(x.astype(str)),axis=1)
    baseDataFrame = None
    secondaryDataFrame = None    
    return baseDataFrameCommon

相关问题 更多 >