绘制scipy的偏态正态分布时数据幅度意外
我正在尝试为我的样本数据估计一个偏态正态分布,并将其与数据一起绘制。如果我把数据画成直方图,我得到的图是这样的:
然后我尝试将一个偏态正态分布拟合到我的数据上,并将其与直方图一起绘制。然而,拟合后的数据曲线形状是对的,但明显比我预期的要低很多。
如果我把直方图从这个图中去掉,我得到的曲线看起来就像我预期的那样。只是缩小了很多:
我该如何在一个图中同时表示这两种绘制方式呢?我相信我遗漏了一些简单的东西。我通常不在Python中编程。
代码:
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
data = [ 10, 10, 11, 10, 11, 11, 10, 10, 15, 15, 14, 18, 11, 10, 11, 13, 13, 10, 13, 16
, 16, 15, 11, 16, 12, 11, 17, 13, 11, 14, 12, 11, 10, 12, 11, 12, 10, 12, 10, 12
, 11, 11, 11, 12, 15, 11, 12, 12, 10, 12, 10, 10, 11, 11, 14, 10, 11, 10, 17, 10
, 15, 10, 11, 11, 10, 9, 12, 11, 13, 12, 12, 11, 11, 16, 15, 21, 11, 11, 11, 13
, 11, 12, 10, 21, 10, 13, 10, 10, 13, 13, 10, 18, 13, 13, 11, 14, 10, 14, 13, 11
, 10, 12, 15, 9, 10, 9, 16, 14, 15, 11, 10, 11, 10, 11, 12, 12, 12, 12, 10, 10
, 10, 11, 13, 11, 19, 11, 15, 13, 13, 11, 10, 13, 10, 10, 10, 12, 10, 10, 18, 12
, 12, 13, 11, 17, 10, 11, 10, 14, 12, 12, 14, 10, 15, 10, 10, 12, 12, 11, 10, 25
, 11, 13, 10, 11, 12, 12, 12, 17, 12, 11, 10, 11, 24, 10, 10, 10, 13, 10, 11, 12
, 10, 12, 12, 11, 24, 11, 15, 11, 13, 13, 12, 11, 10, 11, 10, 12, 10]
X = np.linspace(min(data), max(data), num=200)
fig,ax = plt.subplots()
#ax.hist(data, bins=25)
ax.plot(X, stats.skewnorm.pdf(X, *stats.skewnorm.fit(data)))
fig.savefig("test.png")
1 个回答
1
因为你的数据是离散的,所以在整数位置之间设置明确的区间边界,会让直方图看起来更合适。使用默认的区间边界时,有些区间可能会是空的,或者不同的区间会对应不同数量的输入值。
标准化直方图
一个概率密度函数(pdf)需要标准化,使得它的总面积为 1
。你可以在绘制直方图时加上 ax.hist(..., density=True)
,这样直方图也会被标准化。
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import numpy as np
from scipy import stats
data = [10, 10, 11, 10, 11, 11, 10, 10, 15, 15, 14, 18, 11, 10, 11, 13, 13, 10, 13, 16, 16, 15, 11, 16, 12, 11, 17, 13, 11, 14, 12, 11, 10, 12, 11, 12, 10, 12, 10, 12, 11, 11, 11, 12, 15, 11, 12, 12, 10, 12, 10, 10, 11, 11, 14, 10, 11, 10, 17, 10, 15, 10, 11, 11, 10, 9, 12, 11, 13, 12, 12, 11, 11, 16, 15, 21, 11, 11, 11, 13, 11, 12, 10, 21, 10, 13, 10, 10, 13, 13, 10, 18, 13, 13, 11, 14, 10, 14, 13, 11, 10, 12, 15, 9, 10, 9, 16, 14, 15, 11, 10, 11, 10, 11, 12, 12, 12, 12, 10, 10, 10, 11, 13, 11, 19, 11, 15, 13, 13, 11, 10, 13, 10, 10, 10, 12, 10, 10, 18, 12, 12, 13, 11, 17, 10, 11, 10, 14, 12, 12, 14, 10, 15, 10, 10, 12, 12, 11, 10, 25, 11, 13, 10, 11, 12, 12, 12, 17, 12, 11, 10, 11, 24, 10, 10, 10, 13, 10, 11, 12, 10, 12, 12, 11, 24, 11, 15, 11, 13, 13, 12, 11, 10, 11, 10, 12, 10]
X = np.linspace(min(data), max(data), num=200)
bins = np.arange(min(data) - 0.5, max(data) + 1, 1)
fig, ax = plt.subplots()
ax.hist(data, bins=bins, density=True)
fit_params = stats.skewnorm.fit(data)
ax.plot(X, stats.skewnorm.pdf(X, *fit_params))
ax.fill_between(X, stats.skewnorm.pdf(X, *fit_params), color='red', alpha=0.3)
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
plt.show()
缩放概率密度函数
除了标准化直方图,你还可以将概率密度函数与直方图的面积相乘。面积是所有柱子的面积之和。由于柱子的高度加起来等于观察值的总数,并且使用的区间宽度为 1
,所以面积就是 len(data)
。
为了更好地显示数据的离散性,可以将柱子画得更窄。在 ax.hist()
中的 rwidth=
是一个缩放因子。
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
data = [10, 10, 11, 10, 11, 11, 10, 10, 15, 15, 14, 18, 11, 10, 11, 13, 13, 10, 13, 16, 16, 15, 11, 16, 12, 11, 17, 13, 11, 14, 12, 11, 10, 12, 11, 12, 10, 12, 10, 12, 11, 11, 11, 12, 15, 11, 12, 12, 10, 12, 10, 10, 11, 11, 14, 10, 11, 10, 17, 10, 15, 10, 11, 11, 10, 9, 12, 11, 13, 12, 12, 11, 11, 16, 15, 21, 11, 11, 11, 13, 11, 12, 10, 21, 10, 13, 10, 10, 13, 13, 10, 18, 13, 13, 11, 14, 10, 14, 13, 11, 10, 12, 15, 9, 10, 9, 16, 14, 15, 11, 10, 11, 10, 11, 12, 12, 12, 12, 10, 10, 10, 11, 13, 11, 19, 11, 15, 13, 13, 11, 10, 13, 10, 10, 10, 12, 10, 10, 18, 12, 12, 13, 11, 17, 10, 11, 10, 14, 12, 12, 14, 10, 15, 10, 10, 12, 12, 11, 10, 25, 11, 13, 10, 11, 12, 12, 12, 17, 12, 11, 10, 11, 24, 10, 10, 10, 13, 10, 11, 12, 10, 12, 12, 11, 24, 11, 15, 11, 13, 13, 12, 11, 10, 11, 10, 12, 10]
bins = np.arange(min(data) - 0.5, max(data) + 1, 1)
X = np.linspace(bins[0], bins[-1], num=200)
fig, ax = plt.subplots()
ax.hist(data, bins=bins, density=False, rwidth=0.3)
fit_params = stats.skewnorm.fit(data)
ax.plot(X, len(data) * stats.skewnorm.pdf(X, *fit_params), color='crimson')
ax.fill_between(X, len(data) * stats.skewnorm.pdf(X, *fit_params), color='crimson', alpha=0.3)
ax.set_xticks(range(min(data), max(data) + 1))
ax.margins(x=0)
plt.show()
拟合负二项分布
负二项分布是一种离散分布。它的形状大概是这样的:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import nbinom
data = [10, 10, 11, 10, 11, 11, 10, 10, 15, 15, 14, 18, 11, 10, 11, 13, 13, 10, 13, 16, 16, 15, 11, 16, 12, 11, 17, 13, 11, 14, 12, 11, 10, 12, 11, 12, 10, 12, 10, 12, 11, 11, 11, 12, 15, 11, 12, 12, 10, 12, 10, 10, 11, 11, 14, 10, 11, 10, 17, 10, 15, 10, 11, 11, 10, 9, 12, 11, 13, 12, 12, 11, 11, 16, 15, 21, 11, 11, 11, 13, 11, 12, 10, 21, 10, 13, 10, 10, 13, 13, 10, 18, 13, 13, 11, 14, 10, 14, 13, 11, 10, 12, 15, 9, 10, 9, 16, 14, 15, 11, 10, 11, 10, 11, 12, 12, 12, 12, 10, 10, 10, 11, 13, 11, 19, 11, 15, 13, 13, 11, 10, 13, 10, 10, 10, 12, 10, 10, 18, 12, 12, 13, 11, 17, 10, 11, 10, 14, 12, 12, 14, 10, 15, 10, 10, 12, 12, 11, 10, 25, 11, 13, 10, 11, 12, 12, 12, 17, 12, 11, 10, 11, 24, 10, 10, 10, 13, 10, 11, 12, 10, 12, 12, 11, 24, 11, 15, 11, 13, 13, 12, 11, 10, 11, 10, 12, 10]
loc = min(data) # suppose the distribution starts at the lowest observed value
mean = np.mean(data)
var = np.var(data)
p = (mean - loc) / var
n = (mean - loc) ** 2 / (var - (mean - loc))
fig, ax = plt.subplots()
bins = np.arange(min(data) - 0.5, max(data) + 1, 1)
ax.hist(data, bins=bins, rwidth=0.9, density=True)
X = np.arange(min(data) - 1, max(data) + 2)
ax.plot(X, nbinom.pmf(X, loc=loc, n=n, p=p), color='crimson', marker='o', ls=':')
ax.fill_between(X, nbinom.pmf(X, loc=loc, n=n, p=p), color='crimson', alpha=0.3)
ax.set_xticks(np.arange(min(data), max(data) + 1))
ax.margins(x=0)
plt.show()