我是Python和数据分析的新手,我正在使用ARIMA模型研究时间序列问题。假设我的数据是:
Month,Value
1949-01,112
1949-02,118
1949-03,132
1949-04,129
1949-05,121
1949-06,135
1949-07,148
1949-08,148
1949-09,136
基于上述数据,我需要预测未来几年的数值。我已经按照这里(here)介绍的方法完成了所有步骤,但最终的预测结果是对数(log)形式的,我想把它转换回乘客数量的原始形式。
我的代码
from datetime import datetime
from matplotlib.pylab import rcParams
from pyspark.sql.functions import window
from statsmodels.tsa.stattools import adfuller
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from unicodedata import decomposition
# Default figure size (width, height) applied to the plots created below.
rcParams['figure.figsize'] = 10, 6
# Load the series; the code below reads the 'Month' and 'Value' columns.
dataset = pd.read_csv("/home/rajnish.kumar/eclipse-workspace/TimeSeriesPrediction/Data/trial_series.csv")
# Parse the 'Month' strings into datetime values.
dataset['Month'] = pd.to_datetime(dataset['Month'], infer_datetime_format=True)
# Index the frame by month so that plots and rolling windows run along time.
indexedDataset = dataset.set_index(['Month'])
print(indexedDataset.tail())
# Plot the raw series.
plt.xlabel("Date")
plt.ylabel("value")
plt.plot(indexedDataset)
plt.show()
# Rolling mean / standard deviation over a 12-observation window.
rolemean = indexedDataset.rolling(window=12).mean()
rolstd = indexedDataset.rolling(window=12).std()
print(rolemean,rolstd)
# Overlay the original series with its rolling statistics.
orign = plt.plot(indexedDataset,color='blue',label='Original')
meanplot = plt.plot(rolemean,color='red',label='Roling Mean')
std = plt.plot(rolstd,color='black',label='Rolling Std')
plt.legend(loc='best')
plt.title("Rolling Mean and Standard Deviation")
# block=False: show the figure without halting the script.
plt.show(block=False)
# Augmented Dickey-Fuller test on the raw 'Value' column.
print("Result of Dickey-Fuller Test:")
dftest = adfuller(indexedDataset['Value'], autolag='AIC')
# The first four entries of the adfuller result are labelled below; entry 4
# is a mapping of significance level -> critical value.
dfoutput = pd.Series(
    dftest[0:4],
    index=['Test Statistics', 'p-value', '#Lags Used', 'Number Of Observations Used'],
)
# The loop body must be indented; the pasted script had lost this indentation.
for key, value in dftest[4].items():
    dfoutput['Critical Value (%s)' % key] = value
print(dfoutput)
# Work on the natural log of the series; every later model and forecast in
# this script is therefore expressed on the log scale.
indexedDataset_logScale = np.log(indexedDataset)
plt.plot(indexedDataset_logScale)
# 12-observation rolling mean / std of the log series.
movingaverage = indexedDataset_logScale.rolling(window=12).mean()
# NOTE(review): movingSTD is not read again by the top-level code visible here.
movingSTD = indexedDataset_logScale.rolling(window=12).std()
# NOTE(review): the log series is plotted a second time here - looks redundant.
plt.plot(indexedDataset_logScale)
plt.plot(movingaverage,color='red')
# Detrend by subtracting the rolling mean from the log series.
dataSetLogScaleMinusMovingAverage = indexedDataset_logScale - movingaverage
print(dataSetLogScaleMinusMovingAverage.head(12))
# Remove the NaN rows left by the rolling window (modifies the frame in place).
dataSetLogScaleMinusMovingAverage.dropna(inplace=True)
print(dataSetLogScaleMinusMovingAverage.head(10))
def test_stationarity(timeseries):
    """Plot rolling statistics and print an augmented Dickey-Fuller test.

    Draws the series together with its 12-observation rolling mean and
    rolling standard deviation, then prints the Dickey-Fuller test
    statistic, p-value, lags used, number of observations and the
    critical values.

    Parameters
    ----------
    timeseries : pandas.DataFrame or pandas.Series
        A DataFrame with a 'Value' column, or the series itself.

    Returns
    -------
    None
    """
    # adfuller needs one-dimensional data: pick the 'Value' column from a
    # DataFrame, but also accept a bare Series (which has no such column).
    if isinstance(timeseries, pd.DataFrame):
        values = timeseries['Value']
    else:
        values = timeseries

    movingAverage = timeseries.rolling(window=12).mean()
    movingSTD = timeseries.rolling(window=12).std()

    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(movingAverage, color='red', label='Roling Mean')
    plt.plot(movingSTD, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title("Rolling Mean and Standard Deviation")
    # Non-blocking so the script continues on to the test output.
    plt.show(block=False)

    dftest = adfuller(values, autolag='AIC')
    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistics', 'p-value', '#Lags Used', 'Number Of Observations Used'],
    )
    # Entry 4 of the result maps significance level -> critical value.
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
# Stationarity check 1: log series minus its rolling mean.
test_stationarity(dataSetLogScaleMinusMovingAverage)
# Exponentially weighted mean of the log series (half-life of 12 observations).
exponentialDecayWeightedAverage = indexedDataset_logScale.ewm(halflife=12,min_periods=0,adjust= True).mean()
plt.plot(indexedDataset_logScale)
plt.plot(exponentialDecayWeightedAverage,color='red')
# Stationarity check 2: log series minus the exponentially weighted mean.
datasetLogScaleMinusMovingExponentialDecayAverage = indexedDataset_logScale - exponentialDecayWeightedAverage
test_stationarity(datasetLogScaleMinusMovingExponentialDecayAverage)
# First difference of the log series: each value minus the previous one.
datasetLogDiffShifting = indexedDataset_logScale - indexedDataset_logScale.shift()
plt.plot(datasetLogDiffShifting)
# The first row has no predecessor and is NaN; drop it in place.
datasetLogDiffShifting.dropna(inplace=True)
# Stationarity check 3: the differenced log series.
test_stationarity(datasetLogDiffShifting)
from statsmodels.tsa.seasonal import seasonal_decompose
# Split the log series into trend, seasonal and residual components.
# NOTE: this rebinds `decomposition`, which the import section of this file
# also imports from unicodedata.
decomposition = seasonal_decompose(indexedDataset_logScale)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
# Four stacked panels (4 rows x 1 column): original, trend, seasonal, residual.
plt.subplot(411)
plt.plot(indexedDataset_logScale,label='Original')
plt.legend(loc='best')
plt.subplot(412)
plt.plot(trend,label='Trend')
plt.legend(loc='best')
plt.subplot(413)
plt.plot(seasonal,label='Seasonality')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual,label='Residuals')
plt.legend(loc='best')
plt.tight_layout()
# `decomposedLogData` is the same object as `residual`, so the in-place
# dropna below also removes the NaN rows from `residual`.
decomposedLogData = residual
decomposedLogData.dropna(inplace=True)
# Stationarity check on the residual component.
# NOTE(review): test_stationarity reads timeseries['Value']; confirm the
# residual returned by your statsmodels version still has that column.
test_stationarity(decomposedLogData)
# Autocorrelation (ACF) and partial autocorrelation (PACF) of the
# differenced log series, drawn side by side.
from statsmodels.tsa.stattools import acf, pacf

lag_acf = acf(datasetLogDiffShifting, nlags=20)
lag_pacf = pacf(datasetLogDiffShifting, nlags=20, method='ols')

# Dashed guide lines at 0 and at +/- 1.96 / sqrt(number of observations).
_band = 1.96 / np.sqrt(len(datasetLogDiffShifting))

# One panel per function: (subplot position, values, title).
for _position, _lag_values, _heading in (
    (121, lag_acf, 'Autocorrelation Function'),
    (122, lag_pacf, 'PArtial Autocorrelation Function'),
):
    plt.subplot(_position)
    plt.plot(_lag_values)
    for _level in (0, -_band, _band):
        plt.axhline(y=_level, linestyle='--', color='gray')
    plt.title(_heading)

plt.tight_layout()
from statsmodels.tsa.arima_model import ARIMA

# All three models are fitted on the log series with d=1, so their fitted
# values are on the differenced-log scale and are compared against
# datasetLogDiffShifting. Each plot title shows the residual sum of squares.

# AR MODEL: autoregressive terms only -> order (p, d, q) = (2, 1, 0).
# (This was (2, 1, 2), which made it a duplicate of the ARIMA fit below.)
model = ARIMA(indexedDataset_logScale, order=(2, 1, 0))
results_ar = model.fit(disp=1)
plt.plot(datasetLogDiffShifting)
plt.plot(results_ar.fittedvalues, color='red')
plt.title('RSS: %.4f' % sum((results_ar.fittedvalues - datasetLogDiffShifting["Value"])**2))
print('Plotting AR Model')

# MA MODEL: moving-average terms only -> order (0, 1, 2).
Model = ARIMA(indexedDataset_logScale, order=(0, 1, 2))
results_ma = Model.fit(disp=1)
plt.plot(datasetLogDiffShifting)
plt.plot(results_ma.fittedvalues, color='red')
plt.title('RSS: %.4f' % sum((results_ma.fittedvalues - datasetLogDiffShifting["Value"])**2))
print('Plotting MA Model')

# ARIMA: combined model -> order (2, 1, 2). `results_arima` is the fitted
# model used for the back-transform and the forecast further down.
MoDel = ARIMA(indexedDataset_logScale, order=(2, 1, 2))
results_arima = MoDel.fit(disp=1)
plt.plot(datasetLogDiffShifting)
plt.plot(results_arima.fittedvalues, color='red')
plt.title('RSS: %.4f' % sum((results_arima.fittedvalues - datasetLogDiffShifting["Value"])**2))
print('Plotting ARIMA Model')
# Fitted values of the ARIMA model are differences of the log series.
predictions_ARIMA_diff = pd.Series(results_arima.fittedvalues, copy=True)
print(predictions_ARIMA_diff.head())

# Convert to cumulative sum: running total of the fitted log-differences.
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
print(predictions_ARIMA_diff_cumsum.head())

# Start every date at the first observed log value, then add the cumulative
# differences to rebuild the fitted series on the log scale.
# .iloc[0] replaces the deprecated .ix[0] (positional access to row 0).
predictions_ARIMA_log = pd.Series(
    indexedDataset_logScale['Value'].iloc[0],
    index=indexedDataset_logScale.index,
)
# fill_value=0 leaves dates without a fitted difference at the base value.
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum, fill_value=0)
print(predictions_ARIMA_log.head())

# Undo the log transform to get back to the original units.
predictions_ARIMA = np.exp(predictions_ARIMA_log)
plt.plot(indexedDataset)
plt.plot(predictions_ARIMA)
# predict
# Plot the model's predictions for positions 1..264 (still on the log scale).
results_arima.plot_predict(1,264)
print("-------------------------------------")
# forecast() is a method of the fitted model (results_arima), not of the
# pandas Series predictions_ARIMA. It returns a tuple of three arrays:
# point forecasts, standard errors and confidence-interval bounds.
forecast_log, forecast_stderr_log, forecast_conf_int_log = results_arima.forecast(steps=12)
# The model was fitted on np.log(values), so exponentiate the forecasts and
# the interval bounds to express them in the original units.
forecast_values = np.exp(forecast_log)
forecast_conf_int = np.exp(forecast_conf_int_log)
print(forecast_values)
print(forecast_conf_int)
# NOTE: the original line here was `print predictions_ARIMA.forecast(steps=12)`;
# running it produced the following error:
Traceback (most recent call last):
File "/home/rajnish.kumar/eclipse-workspace/TimeSeriesPrediction/TimeSerise/__init__.py", line 227, in <module>
predictions_ARIMA.forecast(steps=12)
File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py", line 4376, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'forecast'
当我运行 print results_arima.forecast(steps=12) 时,得到如下结果(仍然是对数形式):
(array([ 6.09553392, 6.1528141 , 6.22442983, 6.29241129, 6.34164751,
6.36359397, 6.35784715, 6.33139323, 6.29597547, 6.2644771 ,
6.24738318, 6.25025166]), array([ 0.08384711, 0.10749464, 0.11568698, 0.11702779, 0.11703501,
0.11744022, 0.11762254, 0.11778717, 0.12024167, 0.12736047,
0.13870965, 0.15118799]), array([[ 5.9311966 , 6.25987125],
[ 5.94212847, 6.36349972],
[ 5.99768751, 6.45117214],
[ 6.06304103, 6.52178154],
[ 6.11226311, 6.5710319 ],
[ 6.13341538, 6.59377256],
[ 6.12731121, 6.58838309],
[ 6.10053461, 6.56225184],
[ 6.06030613, 6.5316448 ],
[ 6.01485518, 6.51409903],
[ 5.97551726, 6.5192491 ],
[ 5.95392864, 6.54657468]]))
目前没有回答
相关问题 更多 >
编程相关推荐