这是我的密码。基本上,我试图根据名为“PolicyNum”的列(它必须是一个变量,因此我不能使用df.PolicyNum.isin方法)来计算公共行
虽然您可以建议一种更好的方法,但我也想知道为什么两个数据帧(打印在第11行和第12行)的长度会不同
import pandas as pd
def getPreparedDataForComparison( baseDataFrame, secondaryDataFrame, sourceName, indexKey):
baseDataFrameCommon = baseDataFrame[(baseDataFrame[indexKey].isin(secondaryDataFrame[indexKey]) == False)]
baseDataFrameCommon['COMBO'] = baseDataFrameCommon.apply(lambda x :','.join(x.astype(str)),axis=1)
baseDataFrameCommon['DataSource'] = sourceName
return baseDataFrameCommon
def compareDataFrames(dataframe1, dataframe2, indexKey):
dataframe1Common = getPreparedDataForComparison(dataframe1, dataframe2, 'TXT', indexKey)
dataframe2Common = getPreparedDataForComparison(dataframe2, dataframe1, 'SQL', indexKey)
print(len(dataframe1Common))
print(len(dataframe2Common))
def sampleData1():
cols = ['PolicyNum','firsttransactiondate','subsequentbonustotalcumulative','subsequentpremiumtotalcumulative','totalautocumulative','totalautoposteffectivedatecumulative','totalpartialcumulative','totalpartialposteffectivedatecumulative']
sourceData = [ ('E001','#1985-01-01#',100,100,100,100,100,100),
('E002','#1985-01-01#',200,200,200,200,200,200),
('E003','#1985-01-01#',100,100,100,100,100,100),
('E004','#1985-01-01#',100,100,100,100,100,100),
('E005','#1985-01-01#',100,100,100,100,100,100),
('E201','#1985-01-01#',100,100,100,100,100,100),
('E202','#1985-01-01#',100,100,100,100,100,100),
('1006','#1985-01-01#',100,100,100,100,100,100),
('1007','#1985-01-01#',100,100,100,100,100,100),
('1008','#1985-01-01#',100,100,100,100,100,100),]
x = pd.DataFrame(sourceData,columns=cols)
return x
def sampleData2():
cols = ['PolicyNum','firsttransactiondate','subsequentbonustotalcumulative','subsequentpremiumtotalcumulative','totalautocumulative','totalautoposteffectivedatecumulative','totalpartialcumulative','totalpartialposteffectivedatecumulative']
sourceData = [ ('E001','#1985-01-01#',100,300,100,100,100,400),
('E002','#1985-01-01#',200,200,200,200,200,200),
('E005','#1989-01-01#',100,100,100,100,100,100),
('E105','#1989-01-01#',100,100,100,100,100,100),
('1106','#1985-01-01#',100,100,100,100,900,100),
('1006','#1985-01-01#',100,100,100,100,900,100),
('1007','#1985-01-01#',100,100,100,100,100,100),]
x = pd.DataFrame(sourceData,columns=cols)
return x
compareDataFrames(sampleData1(),sampleData2(),'PolicyNum')
这不是我认为最好的方式,但这就是我最终要做的
熊猫迷们可以挖一挖
相关问题 更多 >
编程相关推荐