import pandas as pd
# Example observations: two numeric columns plus a class label for every row.
data = pd.DataFrame(
    {
        'cols1': [4, 5, 5, 4, 321, 32, 5],
        'clol2': [45, 66, 6, 6, 1, 432, 3],
        'class': list('ABCCABB'),
    }
)

# How many rows should be drawn for each class label.
freq = pd.DataFrame(
    {
        'class': list('ABC'),
        'nostoextract': [2] * 3,
    }
)
def bootstrap(data, freq, random_state=None):
    """Draw a per-class random sample, without replacement, from ``data``.

    For every class label found in ``data['class']`` the function keeps
    ``min(rows available for that class, requested count)`` randomly chosen
    rows, where the requested count is read from ``freq``.

    Args:
        data: DataFrame with a ``'class'`` column holding the class label of
            each row.
        freq: DataFrame with a ``'class'`` column and a ``'nostoextract'``
            column giving the number of rows to extract for that class.
        random_state: Optional seed / random state forwarded to
            ``DataFrame.sample`` so that the draw can be reproduced.

    Returns:
        DataFrame with the same columns as ``data``. Rows are ordered by
        class; the order inside a class is random. The index holds the
        labels of the rows in ``data`` the samples came from; use
        ``samples.index = range(len(samples))`` afterwards if a fresh
        ascending index is preferred.

    Raises:
        KeyError: if ``data`` contains a class that has no entry in ``freq``.
    """
    wanted = freq.set_index('class')['nostoextract']

    # Nothing to sample from: return an empty frame with the same columns.
    if data.empty:
        return data.copy()

    present = data['class'].dropna().unique()
    missing = [cls for cls in present if cls not in wanted.index]
    if missing:
        raise KeyError(
            "no 'nostoextract' entry in freq for class(es): {}".format(missing)
        )

    # Shuffle all rows once; the first k rows of each class in the shuffled
    # frame are then a uniform random k-subset of that class. This avoids
    # groupby.apply, whose handling of the grouping column differs between
    # pandas versions.
    shuffled = data.sample(frac=1, random_state=random_state)
    position = shuffled.groupby('class').cumcount()
    limit = shuffled['class'].map(wanted)

    # Rows whose class is NaN get a NaN limit and are therefore never kept.
    keep = (position < limit).to_numpy()
    samples = shuffled[keep]

    # Stable sort: group the output by class while keeping the random order
    # inside each class.
    return samples.sort_values('class', kind='mergesort')
# Demo: draw the per-class sample from the example frames and show it.
print(bootstrap(data=data, freq=freq))
请参阅这个回答(this answer):
您可以将“城市”、“类型”和“年份”列合并为一个新列:
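准备 MainTable(原文的示例代码已丢失;以下仅为示意,列名请以实际数据为准):
MainTable["combination"] = MainTable["City"].astype(str) + "_" + MainTable["Type"].astype(str) + "_" + MainTable["Year"].astype(str)
准备 SampleTable(用同样的方式生成 "combination" 列):
SampleTable["combination"] = SampleTable["City"].astype(str) + "_" + SampleTable["Type"].astype(str) + "_" + SampleTable["Year"].astype(str)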
然后按照链接答案中的方法进行采样,只是用 SampleTable["combination"].value_counts() 代替 freq["class"]。

相关问题 更多 >
编程相关推荐