使用Pandas MultiIndex在matplotlib柱状图中分组标签
我有一个带有多重索引的 pandas 数据框:
group subgroup obs_1 obs_2
GroupA Elem1 4 0
Elem2 34 2
Elem3 0 10
GroupB Elem4 5 21
等等。正如在这个问题中提到的,这在 matplotlib 中是可以做到的,但我更希望(如果可能的话)利用我已经知道的层级结构(多重索引给我的帮助)。目前的情况是,索引显示为一个元组。
这样的事情可能吗?
3 个回答
1
如何创建一个有两个层级的分组条形图
你可以为每个组创建一个子图,然后把它们拼在一起,设置 wspace=0
。每个子图的宽度需要根据子组的数量进行调整,这可以通过在 gridspec_kw
字典中使用 width_ratios
参数来实现,这样所有的列就能保持相同的宽度。
接下来,你可以进行各种格式的调整。在下面的例子中,我选择在背景中绘制水平的网格线,并通过使用小刻度线在组之间添加分隔线。
import numpy as np # v 1.19.2
import pandas as pd # v 1.1.3
import matplotlib.pyplot as plt # v 3.3.2
# Create sample DataFrame with MultiIndex
df = pd.DataFrame(dict(group = ['GroupA', 'GroupA', 'GroupA', 'GroupB'],
subgroup = ['Elem1', 'Elem2', 'Elem3', 'Elem4'],
obs_1 = [4, 34, 0, 5],
obs_2 = [0, 2, 10, 21]))
df.set_index(['group', 'subgroup'], inplace=True)
# Create figure with a subplot for each group with a relative width that
# is proportional to the number of subgroups
groups = df.index.levels[0]
nplots = groups.size
plots_width_ratios = [df.xs(group).index.size for group in groups]
fig, axes = plt.subplots(nrows=1, ncols=nplots, sharey=True, figsize=(6, 4),
gridspec_kw = dict(width_ratios=plots_width_ratios, wspace=0))
# Loop through array of axes to create grouped bar chart for each group
alpha = 0.3 # used for grid lines, bottom spine and separation lines between groups
for group, ax in zip(groups, axes):
# Create bar chart with horizontal grid lines and no spines except bottom one
df.xs(group).plot.bar(ax=ax, legend=None, zorder=2)
ax.grid(axis='y', zorder=1, color='black', alpha=alpha)
for spine in ['top', 'left', 'right']:
ax.spines[spine].set_visible(False)
ax.spines['bottom'].set_alpha(alpha)
# Set and place x labels for groups
ax.set_xlabel(group)
ax.xaxis.set_label_coords(x=0.5, y=-0.15)
# Format major tick labels for subgroups
ax.set_xticklabels(ax.get_xticklabels(), rotation=0, ha='center')
ax.tick_params(axis='both', which='major', length=0, pad=10)
# Set and format minor tick marks for separation lines between groups: note
# that except for the first subplot, only the right tick mark is drawn to avoid
# duplicate overlapping lines so that when an alpha different from 1 is chosen
# (like in this example) all the lines look the same
if ax.is_first_col():
ax.set_xticks([*ax.get_xlim()], minor=True)
else:
ax.set_xticks([ax.get_xlim()[1]], minor=True)
ax.tick_params(which='minor', length=45, width=0.8, color=[0, 0, 0, alpha])
# Add legend using the labels and handles from the last subplot
fig.legend(*ax.get_legend_handles_labels(), frameon=False,
bbox_to_anchor=(0.92, 0.5), loc="center left")
title = 'Grouped bar chart of a hierarchical dataset with 2 levels'
fig.suptitle(title, y=1.01, size=14);
参考: gyx-hh 的这个回答
3
我觉得目前还没有一种很好且标准的方法来绘制多重索引的数据框。我发现@Stein提供的这个解决方案看起来很不错。我把他的例子调整了一下,适应你的数据:
import pandas as pd
import matplotlib.pyplot as plt
from itertools import groupby
import numpy as np
%matplotlib inline
group = ('Group_A', 'Group_B')
subgroup = ('elem1', 'elem2', 'elem3', 'elem4')
obs = ('obs_1', 'obs_2')
index = pd.MultiIndex.from_tuples([('Group_A','elem1'),('Group_A','elem2'),('Group_A','elem3'),('Group_B','elem4')],
names=['group', 'subgroup'])
values = np.array([[4,0],[43,2],[0,10],[5,21]])
df = pd.DataFrame(index=index)
df['obs_1'] = values[:,0]
df['obs_2'] = values[:,1]
def add_line(ax, xpos, ypos):
line = plt.Line2D([xpos, xpos], [ypos + .1, ypos],
transform=ax.transAxes, color='gray')
line.set_clip_on(False)
ax.add_line(line)
def label_len(my_index,level):
labels = my_index.get_level_values(level)
return [(k, sum(1 for i in g)) for k,g in groupby(labels)]
def label_group_bar_table(ax, df):
ypos = -.1
scale = 1./df.index.size
for level in range(df.index.nlevels)[::-1]:
pos = 0
for label, rpos in label_len(df.index,level):
lxpos = (pos + .5 * rpos)*scale
ax.text(lxpos, ypos, label, ha='center', transform=ax.transAxes)
add_line(ax, pos*scale, ypos)
pos += rpos
add_line(ax, pos*scale , ypos)
ypos -= .1
ax = df.plot(kind='bar',stacked=False)
#Below 2 lines remove default labels
ax.set_xticklabels('')
ax.set_xlabel('')
label_group_bar_table(ax, df)
这样就会生成:
5
如果你的 MultiIndex
只有两个层级,我觉得下面的方法会更简单:
plt.figure()
ax = plt.gca()
DF.plot(kind='bar', ax=ax)
plt.grid(True, 'both')
minor_XT = ax.get_xaxis().get_majorticklocs()
DF['XT_V'] = minor_XT
major_XT = DF.groupby(by=DF.index.get_level_values(0)).first()['XT_V'].tolist()
DF.__delitem__('XT_V')
ax.set_xticks(minor_XT, minor=True)
ax.set_xticklabels(DF.index.get_level_values(1), minor=True)
ax.tick_params(which='major', pad=15)
_ = plt.xticks(major_XT, (DF.index.get_level_values(0)).unique(), rotation=0)
还有一个稍微复杂一点,但适用性更广的解决方案(不管你有多少个层级都可以用):
def cvt_MIdx_tcklab(df):
Midx_ar = np.array(df.index.tolist())
Blank_ar = Midx_ar.copy()
col_idx = np.arange(Midx_ar.shape[0])
for i in range(Midx_ar.shape[1]):
val,idx = np.unique(Midx_ar[:, i], return_index=True)
Blank_ar[idx, i] = val
idx=~np.in1d(col_idx, idx)
Blank_ar[idx, i]=''
return map('\n'.join, np.fliplr(Blank_ar))
plt.figure()
ax = plt.gca()
DF.plot(kind='bar', ax=ax)
ax.set_xticklabels(cvt_MIdx_tcklab(DF), rotation=0)