def concat_duplicate_columns(df):
    """Merge same-named columns by concatenating their values as strings.

    For every column label that occurs more than once in ``df``, the values
    of all columns carrying that label are converted with ``astype(str)``
    and joined row by row, in left-to-right column order, into a single
    column.  Columns whose label is unique are carried over unchanged.

    Args:
        df: DataFrame whose column labels may contain duplicates.  The
            labels must be mutually sortable (e.g. all strings).

    Returns:
        A new DataFrame with one column per distinct label, ordered by
        sorted label.  ``df`` itself is not modified.
    """
    # Record the positions of each label.  Positional access is the only
    # unambiguous way to address a column whose label is duplicated, and it
    # stays correct when the duplicates are not adjacent.
    positions = {}
    for pos, label in enumerate(df.columns):
        positions.setdefault(label, []).append(pos)

    ordered = sorted(positions)

    # Start from the first occurrence of every label; copy so that the
    # assignments below never write into the caller's frame.
    result = df.iloc[:, [positions[label][0] for label in ordered]].copy()

    for label in ordered:
        first, *rest = positions[label]
        if not rest:
            continue  # unique label: keep the column (and its dtype) as is
        combined = df.iloc[:, first].astype(str)
        for pos in rest:
            combined = combined + df.iloc[:, pos].astype(str)
        result[label] = combined
    return result
def duplicated_varnames(df):
    """Return a dict of all variable names that are duplicated in a dataframe.

    Args:
        df: DataFrame (or any iterable whose iteration yields the variable
            names) to inspect.  Names must be hashable.

    Returns:
        A dict mapping each name that occurs more than once to its number
        of occurrences, in order of first occurrence.  Empty when there
        are no duplicates.
    """
    from collections import Counter

    # Iterating a DataFrame yields its column labels; one counting pass
    # replaces rescanning the whole name list for every name.
    counts = Counter(list(df))
    return {name: count for name, count in counts.items() if count > 1}
如果有帮助的话:我在尝试连接(concat)两个数据帧时也遇到了这个错误(截至撰写本文时,除了源代码之外,这是我在 Google 上能找到的唯一相关结果)。
我不知道这个答案能否解决提问者的问题(因为他/她没有提供足够的信息),但就我而言,错误的原因是:我试图把列为 ['A', 'B', 'B', 'C'] 的数据帧 df1(注意其中重复的列名)与列为 ['A', 'B'] 的数据帧 df2 进行 concat。可以理解,这种重复的列名让 pandas 无法正确处理。把 df1 的列改为 ['A', 'B', 'C'](即删除一个重复的列)之后,一切正常。
我编写了一个小函数 concat_duplicate_columns 来合并同名的重复列。关于排序需要注意:如果原始数据帧的列没有按名称排序,输出数据帧的列将会按名称排序。
我最近也遇到了这条错误信息。和上面的用户 @jason 与 @user3805082 一样,我发现在我试图 concat 的数百个数据帧中(每个数据帧都有几十个含义晦涩的变量名),有几个数据帧含有重复的列名。手动查找重复项并不现实。如果其他人也遇到同样的问题,我写的 duplicated_varnames 函数也许会有帮助。
然后,您可以遍历该dict来报告有多少个重复项,删除重复的变量,或者以某种系统的方式重命名它们。
相关问题 更多 >
编程相关推荐