如何根据另一数据框列中的值为matplotlib直方图上色
我用以下代码创建了一个数据框:
import pandas as pd
import matplotlib.pyplot as plt
# Example data for the question: three equally long columns (32 rows each).
df_dict = {
    # Prediction values; plotted on the x-axis and binned by the histogram.
    "test_predictions": [
        0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4,
        0.4, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.6,
        0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8,
        0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9,
    ],
    # 0/1 labels plotted against the predictions.
    "y_true": [
        0, 0, 0, 1, 1, 1, 0, 0,
        1, 1, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 1, 0, 1, 1, 1,
        0, 0, 0, 1, 1, 0, 0, 1,
    ],
    # Per-row value that should determine the colour inside the histogram.
    "distance": [
        -0.1, -0.09, -0.08, -0.08, -0.07, -0.05, -0.05, -0.05,
        -0.05, -0.05, -0.04, -0.04, -0.04, -0.03, -0.02, -0.01,
        0.01, 0.01, 0.01, 0.02, 0.03, 0.03, 0.04, 0.05,
        0.05, 0.06, 0.06, 0.07, 0.08, 0.08, 0.09, 0.1,
    ],
}
df = pd.DataFrame(data=df_dict)
然后,我用以下代码画了一个图,这个图包含了两条直线和一个直方图:
fig, ax1 = plt.subplots()

# Dotted reference diagonal and the "NN3" line share the left-hand y-axis.
ax1.plot((0, 1), (0, 1), linestyle=":", color="red", label="Perfect Model")
ax1.plot(df["test_predictions"], df["y_true"], color="blue", label="NN3")

# A twin axis puts the histogram of the predictions on the right-hand y-axis.
ax2 = ax1.twinx()
hist_style = dict(bins=10, alpha=0.7, color="darkgreen", label="Histogram")
ax2.hist(df["test_predictions"], **hist_style)
我想根据 `df['distance']` 中的数值给直方图上色,并且还想使用一个颜色映射(colormap)。这样的话,直方图的同一个区间(bin)里可能会出现多种颜色。希望能得到一些帮助,非常感谢!
补充:
我之前尝试过在 `ax2.hist(df['test_predictions'], bins=10, alpha=0.7, color='darkgreen', label='Histogram')` 这一行之前加入下面这段代码:
# Asker's first attempt: draw one bar per data-frame row, coloured by that
# row's distance value.
# NOTE: this needs `import numpy as np`; the imports shown with the question
# only bring in pandas and matplotlib.pyplot.
bins = np.linspace(df['test_predictions'].min(), df['test_predictions'].max(), 10)
for index, row in df.iterrows():
    # Position of this row's prediction among the bin edges (1-based).
    bin_index = np.digitize(row['test_predictions'], bins)
    # Scale the distance by the largest distance before the viridis lookup.
    color = plt.cm.viridis(row['distance']/df['distance'].max())
    # Each bar has height 1 and no `bottom` offset, so bars that fall on the
    # same edge are drawn over each other rather than stacked.
    ax2.bar(bins[bin_index-1], 1, width=np.diff(bins)[0], color = color, alpha = 0.7)
不过,我有点担心使用循环的问题,因为我的数据框可能会有超过10000行,而且这样做的时候我得到的结果也不是我想要的,出来的效果像是单独的条形,而不是像直方图那样堆叠在一起。
1 个回答
1
帖子中的示例数据
你可以使用 `imshow` 来显示每个柱子对应的距离值。
下面的代码首先新增一列,用来存放每一行所属区间(bin)的编号;然后按区间分组,取出每个区间内的距离值,并把它们作为 `imshow()` 的输入。
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
import pandas as pd
import numpy as np
# Sample data from the question: three columns with 32 rows each.
df_dict = {
    # Values that get binned along the x-axis.
    "test_predictions": [
        0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4,
        0.4, 0.4, 0.4, 0.5, 0.5, 0.6, 0.6, 0.6,
        0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8,
        0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9,
    ],
    # 0/1 labels carried over from the question.
    "y_true": [
        0, 0, 0, 1, 1, 1, 0, 0,
        1, 1, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 1, 0, 1, 1, 1,
        0, 0, 0, 1, 1, 0, 0, 1,
    ],
    # Values that are mapped to colours.
    "distance": [
        -0.1, -0.09, -0.08, -0.08, -0.07, -0.05, -0.05, -0.05,
        -0.05, -0.05, -0.04, -0.04, -0.04, -0.03, -0.02, -0.01,
        0.01, 0.01, 0.01, 0.02, 0.03, 0.03, 0.04, 0.05,
        0.05, 0.06, 0.06, 0.07, 0.08, 0.08, 0.09, 0.1,
    ],
}
df = pd.DataFrame(data=df_dict)
# One colormap and one normalisation are shared by every bin image and by the
# colorbar, so a given distance always maps to the same colour.
cmap = plt.get_cmap('RdYlBu')
norm = plt.Normalize(vmin=df['distance'].min(), vmax=df['distance'].max())

num_bins = 10
# The last edge is pushed slightly past the maximum so that np.digitize puts
# the largest prediction into the final bin instead of one past it.
bins = np.linspace(df['test_predictions'].min(), df['test_predictions'].max() + 0.001, num_bins + 1)
df['bin'] = np.digitize(df['test_predictions'], bins)  # 1-based bin id per row

fig, ax1 = plt.subplots()
for bin_id, bin_df in df.groupby('bin'):
    # Draw each bin as a one-column image: one cell per row in the bin,
    # coloured by distance, spanning the bin's x-range and a height equal to
    # the number of rows in the bin.
    ax1.imshow(bin_df['distance'].values.reshape(-1, 1), interpolation='nearest', cmap=cmap, norm=norm,
               extent=[bins[bin_id - 1], bins[bin_id], 0, len(bin_df)], aspect='auto')
ax1.use_sticky_edges = False  # remove stickiness due to imshow
ax1.autoscale_view()
ax1.set_ylim(ymin=0)
plt.colorbar(ScalarMappable(norm=norm, cmap=cmap), label='Distance', ax=ax1)
plt.tight_layout()
plt.show()
'tips' 数据集
这是另一个例子,使用的是 Seaborn 的 'tips' 数据集。
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
import seaborn as sns # to get the 'tips' dataset
import numpy as np
df = sns.load_dataset('tips')
# Sort by tip so the cells inside each bin appear in tip order.
df.sort_values(by='tip', ascending=True, inplace=True, ignore_index=True)

# One colormap and one normalisation are shared by every bin image and by the
# colorbar, so a given tip always maps to the same colour.
cmap = plt.get_cmap('RdYlBu_r')
norm = plt.Normalize(vmin=df['tip'].min(), vmax=df['tip'].max())

num_bins = 10
# The last edge is pushed slightly past the maximum so that np.digitize puts
# the largest total bill into the final bin instead of one past it.
bins = np.linspace(df['total_bill'].min(), df['total_bill'].max() + 0.001, num_bins + 1)
df['bin'] = np.digitize(df['total_bill'], bins)  # 1-based bin id per row

fig, ax1 = plt.subplots()
for bin_id, bin_df in df.groupby('bin'):
    # Draw each bin as a one-column image: one cell per row in the bin,
    # coloured by tip, spanning the bin's x-range and a height equal to the
    # number of rows in the bin.
    ax1.imshow(bin_df['tip'].values.reshape(-1, 1), interpolation='nearest', cmap=cmap, norm=norm,
               extent=[bins[bin_id - 1], bins[bin_id], 0, len(bin_df)], aspect='auto')
ax1.use_sticky_edges = False  # remove stickiness due to imshow
ax1.autoscale_view()
ax1.set_ylim(ymin=0)
ax1.set_xlabel('Total Bill')
ax1.set_ylabel('Count')
plt.colorbar(ScalarMappable(norm=norm, cmap=cmap), label='Tip', ax=ax1)
plt.tight_layout()
plt.show()