建立ARIMA 模型的一般过程如下:
- 1: 模块导入,加载数据,并可视化时间序列数据
- 2:平稳性检验
- 3:序列平稳化
- 4:白噪声检验
- 5: 时间序列定阶
- 6:构建ARIMA模型及预测
# from model.arimaModel import * # 导入自定义的方法 (custom helpers, unused)
import warnings

import matplotlib.pyplot as plt
import numpy as np  # added: np.* is used by the plotting and evaluation code below
import pandas as pd
import pmdarima as pm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf  # ACF/PACF plots for order selection
from statsmodels.stats.diagnostic import acorr_ljungbox  # Ljung-Box white-noise test
from statsmodels.tsa.stattools import adfuller  # ADF stationarity test

warnings.filterwarnings("ignore")  # silence library deprecation warnings
# Load the two monthly-precipitation sheets; keep year/month as strings so they
# can be concatenated into a 'YYYY-MM' date key below.
df_taiyuan = pd.read_excel('月.xls',sheet_name ='太原',dtype={'年份':str, '月份': str})
df_shanxi = pd.read_excel('月.xls',sheet_name ='杭州',dtype={'年份':str, '月份': str})
df_taiyuan.info()
# Strip stray whitespace so the concatenated date strings parse cleanly.
df_taiyuan["年份"] = df_taiyuan["年份"].str.strip()
df_taiyuan["月份"] = df_taiyuan["月份"].str.strip()
df_shanxi["年份"] = df_shanxi["年份"].str.strip()
df_shanxi["月份"] = df_shanxi["月份"].str.strip()
# Assemble a 'YYYY-MM' string column to be converted to datetime later.
df_taiyuan["Date"] = df_taiyuan["年份"]+'-'+df_taiyuan["月份"]
df_shanxi["Date"] = df_shanxi["年份"]+'-'+df_shanxi["月份"]
RangeIndex: 240 entries, 0 to 239 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 年份 240 non-null object 1 月份 240 non-null object 2 市 240 non-null object 3 降水量/mm 240 non-null float64 dtypes: float64(1), object(3) memory usage: 7.6+ KB
df_taiyuan  # display the loaded frame (notebook cell output)
240 rows × 5 columns
# 对购买日期进行转换 df_taiyuan["Date"] = pd.to_datetime(df_taiyuan["Date"],format='%Y-%m',errors = 'coerce')#加errors防止报错 df_shanxi["Date"] = pd.to_datetime(df_shanxi["Date"],format='%Y-%m',errors = 'coerce')#加errors防止报错
df_taiyuan.info()  # confirm the Date column is now datetime64[ns]
RangeIndex: 240 entries, 0 to 239 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 年份 240 non-null object 1 月份 240 non-null object 2 市 240 non-null object 3 降水量/mm 240 non-null float64 4 Date 240 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(1), object(3) memory usage: 9.5+ KB
# Keep only the precipitation column and make Date the index (time-series form).
df_taiyuan = df_taiyuan[['Date','降水量/mm']]
df_shanxi = df_shanxi[['Date','降水量/mm']]
df_taiyuan.set_index('Date',inplace=True)
df_shanxi.set_index('Date',inplace=True)
df_shanxi.describe()  # summary statistics (NOTE(review): sheet is '杭州' despite the variable name — verify)
df_taiyuan.describe()  # summary statistics for the Taiyuan precipitation series
import matplotlib as mpl
# Use SimHei so Chinese labels render correctly.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['font.serif'] = ['SimHei']
# mpl.rcParams['axes.unicode_minus'] = False  # fix minus sign rendering as a square when saving figures
# import seaborn as sns
# sns.set_style("darkgrid",{"font.sans-serif":['KaiTi', 'Arial']})  # font setup for seaborn plots
# Side-by-side plots of the two raw series (np requires the numpy import at the top of the file).
fig = plt.figure(figsize=(20,8), dpi=300)
ax1 = fig.add_subplot(121)
ax1.plot(df_taiyuan.index,df_taiyuan['降水量/mm'], color='red', linewidth=3, alpha=0.4, label='降水量/mm')
#ax1.grid(c='r')
xmax=np.max(df_taiyuan.index)
xmin=np.min(df_taiyuan.index)
plt.xlim(xmin,xmax)
# 10 evenly spaced y ticks spanning the data range.
ax1.set_yticks(np.linspace(np.min(df_taiyuan['降水量/mm']), np.max(df_taiyuan['降水量/mm']), 10))
plt.xlabel('月份')
ax1.set_ylabel('降水量/mm')  # left y-axis label
plt.legend(loc=1)  # legend in the upper right
plt.title('太原降水量vs时间')  # subplot title
ax2 = fig.add_subplot(122)
ax2.plot(df_shanxi.index,df_shanxi['降水量/mm'], color='blue', linewidth=3, alpha=0.4, label='降水量/mm')
#plt.grid(c='r')
xmax=np.max(df_shanxi.index)
xmin=np.min(df_shanxi.index)
plt.xlim(xmin,xmax)
ax2.set_yticks(np.linspace(np.min(df_shanxi['降水量/mm']), np.max(df_shanxi['降水量/mm']), 10))
plt.xlabel('月份')
ax2.set_ylabel('降水量/mm')  # left y-axis label
plt.legend(loc=1)  # legend in the upper right
# NOTE(review): title says 山西 but df_shanxi was read from the '杭州' sheet — confirm which is intended.
plt.title('山西降水量vs时间')
plt.show()
用matplotlib输出原始时间序列图如上所示。该时间序列有比较明显的季节性趋势,即基本上每一年数据都呈现先上升再下降。
1.3季节性分析 请注意,以下的分析以太原地区的df_taiyuan为例子。同时,可以使用seasonal_decompose()进行分析,将时间序列分解成长期趋势项(Trend)、季节性周期项(Seansonal)和残差项(Resid)这三部分。该方法的原理见这里:https://link.csdn.net/?target=https%3A%2F%2Fwww.jianshu.com%2Fp%2Fe2cc90e1c32d
from statsmodels.tsa.seasonal import seasonal_decompose
# Decompose the series into trend + seasonal + residual components.
# 'period' (the seasonal cycle length, 12 months here) replaces the 'freq'
# keyword, which is deprecated and later removed in statsmodels >= 0.12 —
# the FutureWarning in the original run output says exactly this.
taiyuan_decomposition = seasonal_decompose(df_taiyuan['降水量/mm'], period=12)
taiyuan_decomposition.plot()
plt.show()
D:\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: the 'freq' keyword is deprecated, use 'period' instead D:\Anaconda3\envs\tensorflow\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) D:\Anaconda3\envs\tensorflow\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
由下子图,确实每年的数据呈现先升后降的特征。
2:平稳性检验 2.1 Augmented Dickey-Fuller Test(ADF检验)[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-9D4KpdaG-1636857660193)(attachment:image.png)]
详细的介绍可以看这篇博客
https://blog.csdn.net/FrankieHello/article/details/86766625/
from statsmodels.tsa.stattools import adfuller # ADF检验
# Adapted from other published examples.
def stableCheck(timeseries):
    """Visual + statistical stationarity check.

    Plots the series with its 12-period rolling mean and std, then runs an
    Augmented Dickey-Fuller test and prints a labelled report. Returns None.
    """
    rolling = timeseries.rolling(window=12)
    # One figure with the raw series and both rolling statistics.
    plt.figure(figsize=(12, 8))
    for series, colour, tag in (
        (timeseries, 'blue', 'Original'),
        (rolling.mean(), 'red', 'Rolling Mean'),
        (rolling.std(), 'black', 'Rolling Std'),
    ):
        plt.plot(series, color=colour, label=tag)
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()
    # ADF test: H0 = a unit root exists (series is non-stationary).
    print('Results of Dickey-Fuller Test:')
    stat, pvalue, nlags, nobs, critical_values, *_ = adfuller(timeseries, autolag='AIC')
    report = pd.Series(
        [stat, pvalue, nlags, nobs],
        index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    )
    for level, threshold in critical_values.items():
        report['Critical Value (%s)' % level] = threshold
    print('ADF检验结果:')
    print(report)
stableCheck_result1 = stableCheck(df_taiyuan['降水量/mm'])  # ADF check on the raw series (function returns None)
Results of Dickey-Fuller Test: ADF检验结果: Test Statistic -3.382526 p-value 0.011569 #Lags Used 11.000000 Number of Observations Used 228.000000 Critical Value (1%) -3.459361 Critical Value (5%) -2.874302 Critical Value (10%) -2.573571 dtype: float64
检验结果如上。ADF检验的H0假设就是存在单位根,如果得到的显著性检验统计量小于三个置信度(10%,5%,1%)对应的临界值,则对应有(90%,95%,99%)的把握来拒绝原假设。
-
有以下两种方式来判断:
-
1 这里将Test Statistic与1%、5%、10%不同程度拒绝原假设的统计值进行比较。当ADF Test result 同时小于 1%、5%、10%对应的临界值时,即说明非常好地拒绝该假设。
-
2 本数据中,Test Statistic约为-3.382526,大于Critical Value (1%)(-3.459361),但小于Critical Value (5%)(-2.874302),因此只能在95%(而非99%)的置信水平上拒绝原假设。
-
3 观察p-value,是否非常接近于0,接近0才能拒绝原假设。这里p-value约为0.011569。
# 差分处理非平稳序列,先进行一阶差分 time_series_diff1 = df_taiyuan['降水量/mm'].diff(1).dropna() # 在一阶差分基础上进行季节性差分差分 time_series_diff2 = time_series_diff1.diff(12).dropna() stableCheck_result2 = stableCheck(time_series_diff2)
D:Anaconda3envstensorflowlibsite-packagesmatplotlibbackendsbackend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0.0, flags=flags) D:Anaconda3envstensorflowlibsite-packagesmatplotlibbackendsbackend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font. font.set_text(s, 0, flags=flags)
Results of Dickey-Fuller Test: ADF检验结果: Test Statistic -6.683002e+00 p-value 4.294983e-09 #Lags Used 1.300000e+01 Number of Observations Used 2.130000e+02 Critical Value (1%) -3.461429e+00 Critical Value (5%) -2.875207e+00 Critical Value (10%) -2.574054e+00 dtype: float64
-
对于非平稳时间序列,可以通过d阶差分运算使变成平稳序列。从统计学角度来讲,只有平稳性的时间序列才能避免“伪回归”的存在,才有经济意义。
-
进行了一阶差分,再进行了一次季节性差分,通过了ADF检验。结果如下,p-value很接近0,且Test Statistic远小于Critical Value (1%)的值,可以有99%的概率认为时间序列平稳。
-
因此,我们可以确定季节性ARIMA模型的d=1,D=1,m=12。
- 接下来,对d阶差分后的平稳序列进行白噪声检验,若该平稳序列为非白噪声序列,则可以进行第五步。
详细的介绍可以看这篇博客
https://www.cnblogs.com/travelcat/p/11400307.html
Ljung-Box检验即LB检验,是时间序列分析中检验序列自相关性的方法。LB检验的Q统计量为:
用来检验m阶滞后范围内序列的自相关性是否显著,或序列是否为白噪声,Q统计量服从自由度为m的卡方分布。
4.3 python实现def whiteNoiseCheck(data): result = acorr_ljungbox(data, lags=1) temp = result[1] print('白噪声检验结果:', result) # 如果temp小于0.05,则可以以95%的概率拒绝原假设,认为该序列为非白噪声序列;否则,为白噪声序列,认为没有分析意义 print(temp) return result
ifwhiteNoise = whiteNoiseCheck(time_series_diff2)  # Ljung-Box test on the differenced series
白噪声检验结果: (array([57.11722495]), array([4.10593727e-14])) [4.10593727e-14]
检验结果如下:返回的是统计量和p-value,其中,延迟1阶的p-value约为4.10593727e-14<0.05,可以以95%的概率拒绝原假设,认为该序列为非白噪声序列。
5:时间序列定阶def draw_acf(data): # 利用ACF判断模型阶数 plot_acf(data) plt.title("序列自相关图(ACF)") plt.show() def draw_pacf(data): # 利用PACF判断模型阶数 plot_pacf(data) plt.title("序列偏自相关图(PACF)") plt.show() def draw_acf_pacf(data): from matplotlib.ticker import MultipleLocator f = plt.figure(facecolor='white') # 构建第一个图 ax1 = f.add_subplot(211) # 把x轴的刻度间隔设置为1,并存在变量里 x_major_locator = MultipleLocator(1) plot_acf(data, ax=ax1) # 构建第二个图 ax2 = f.add_subplot(212) plot_pacf(data, ax=ax2) plt.subplots_adjust(hspace=0.5) # 把x轴的主刻度设置为1的倍数 ax1.xaxis.set_major_locator(x_major_locator) ax2.xaxis.set_major_locator(x_major_locator) plt.show()
draw_acf_pacf(time_series_diff2)  # ACF/PACF of the stationary (differenced) series
具体怎么对季节性ARIMA模型进行定阶数可以看https://link.csdn.net/?target=https%3A%2F%2Fwww.jianshu.com%2Fp%2F413c094e46f6。
-
(1)确定d和D的阶数:当对原序列进行了d阶差分和lag为m的D阶季节性差分后序列为平稳序列,则可以确定d,D,m的值。
-
(2)确定p,q和P,Q的阶数:1)首先对平稳化后的时间序列绘制ACF和PACF图;2)然后,可以通过观察季节性lag处的拖尾/截尾情况来确定P,Q的值;3)观察短期非季节性lag处的拖尾/截尾情况来确定p,q的值。
-
之前已经在步骤三处确定了d,D,m。对于剩下的参数,将依据如下的ACF和PACF图确定。下图的横坐标lag是以月份为单位。
-
非季节性部分:对于p,在lag =1后,ACF图拖尾,PACF图截尾。同样地,对于q,在lag = 1,ACF图截尾,PACF图拖尾。
-
季节性部分:P , Q的确定和非季节性一样,不过需要记得滞后的间隔为12。
-
AR模型:自相关系数拖尾,偏自相关系数截尾;
-
MA模型:自相关系数截尾,偏自相关函数拖尾;
-
ARMA模型:自相关函数和偏自相关函数均拖尾。
- 这里使用了pmdarima.auto_arima()方法。这个方法可以帮助我们自动确定ARIMA(p,d,q)(P,D,Q)m的参数,也就是可以直接跳过上述的步骤,直接输入数据,设置auto_arima()中的参数则可。
- 通过观察ACF、PACF图的拖尾截尾现象来定阶,但是这样可能不准确。实际上,往往需要结合图像拟合多个模型,通过模型的AIC、BIC值以及残差分析结果来选择合适的模型。
import pmdarima as pm
将数据分为训练集data_train和测试集data_test 。
# 80/20 train/test split on the Taiyuan series, then automatic SARIMA order search.
time_series = df_taiyuan['降水量/mm']
split_point = int(len(time_series) * 0.8)
# train = first 80%, test = remaining 20%
data_train, data_test = time_series[0:split_point], time_series[split_point:len(time_series)]
# Fit on the training data only.
built_arimamodel = pm.auto_arima(data_train,
                                 start_p=0,   # minimum p
                                 start_q=0,   # minimum q
                                 test='adf',  # use the ADF test to pick the differencing order d
                                 max_p=5,     # maximum p
                                 max_q=5,     # maximum q
                                 m=12,        # seasonal period length; m=1 disables seasonality
                                 d=None,      # let the function estimate d
                                 seasonal=True,
                                 start_P=0,
                                 D=1,         # seasonal differencing fixed at 1 (established earlier)
                                 trace=True,
                                 error_action='ignore',
                                 suppress_warnings=True,
                                 stepwise=False  # False = exhaustive grid search instead of stepwise
                                 )
print(built_arimamodel.summary())
ARIMA(0,0,0)(0,1,0)[12] intercept : AIC=1660.467, Time=0.19 sec ARIMA(0,0,0)(0,1,1)[12] intercept : AIC=1601.913, Time=0.13 sec ARIMA(0,0,0)(0,1,2)[12] intercept : AIC=inf, Time=0.61 sec ARIMA(0,0,0)(1,1,0)[12] intercept : AIC=1634.528, Time=0.12 sec ARIMA(0,0,0)(1,1,1)[12] intercept : AIC=inf, Time=0.28 sec ARIMA(0,0,0)(1,1,2)[12] intercept : AIC=inf, Time=0.76 sec ARIMA(0,0,0)(2,1,0)[12] intercept : AIC=1627.601, Time=0.29 sec ARIMA(0,0,0)(2,1,1)[12] intercept : AIC=inf, Time=0.60 sec ARIMA(0,0,0)(2,1,2)[12] intercept : AIC=inf, Time=1.02 sec ARIMA(0,0,1)(0,1,0)[12] intercept : AIC=1661.180, Time=0.06 sec ARIMA(0,0,1)(0,1,1)[12] intercept : AIC=1603.795, Time=0.17 sec ARIMA(0,0,1)(0,1,2)[12] intercept : AIC=inf, Time=0.27 sec ARIMA(0,0,1)(1,1,0)[12] intercept : AIC=1635.873, Time=0.15 sec ARIMA(0,0,1)(1,1,1)[12] intercept : AIC=inf, Time=0.43 sec ARIMA(0,0,1)(1,1,2)[12] intercept : AIC=inf, Time=0.98 sec ARIMA(0,0,1)(2,1,0)[12] intercept : AIC=1629.197, Time=0.35 sec ARIMA(0,0,1)(2,1,1)[12] intercept : AIC=inf, Time=0.38 sec ARIMA(0,0,1)(2,1,2)[12] intercept : AIC=inf, Time=1.24 sec ARIMA(0,0,2)(0,1,0)[12] intercept : AIC=1663.146, Time=0.09 sec ARIMA(0,0,2)(0,1,1)[12] intercept : AIC=1604.923, Time=0.40 sec ARIMA(0,0,2)(0,1,2)[12] intercept : AIC=inf, Time=0.96 sec ARIMA(0,0,2)(1,1,0)[12] intercept : AIC=1636.040, Time=0.19 sec ARIMA(0,0,2)(1,1,1)[12] intercept : AIC=inf, Time=0.28 sec ARIMA(0,0,2)(1,1,2)[12] intercept : AIC=inf, Time=1.21 sec ARIMA(0,0,2)(2,1,0)[12] intercept : AIC=1630.228, Time=0.42 sec ARIMA(0,0,2)(2,1,1)[12] intercept : AIC=inf, Time=0.44 sec ARIMA(0,0,3)(0,1,0)[12] intercept : AIC=1665.144, Time=0.11 sec ARIMA(0,0,3)(0,1,1)[12] intercept : AIC=1606.865, Time=0.49 sec ARIMA(0,0,3)(0,1,2)[12] intercept : AIC=inf, Time=0.48 sec ARIMA(0,0,3)(1,1,0)[12] intercept : AIC=1637.649, Time=0.26 sec ARIMA(0,0,3)(1,1,1)[12] intercept : AIC=inf, Time=0.31 sec ARIMA(0,0,3)(2,1,0)[12] intercept : AIC=1632.051, Time=0.51 sec ARIMA(0,0,4)(0,1,0)[12] 
intercept : AIC=1667.075, Time=0.23 sec ARIMA(0,0,4)(0,1,1)[12] intercept : AIC=1608.812, Time=0.72 sec ARIMA(0,0,4)(1,1,0)[12] intercept : AIC=1639.603, Time=0.28 sec ARIMA(0,0,5)(0,1,0)[12] intercept : AIC=1668.651, Time=0.20 sec ARIMA(1,0,0)(0,1,0)[12] intercept : AIC=1661.224, Time=0.04 sec ARIMA(1,0,0)(0,1,1)[12] intercept : AIC=1603.813, Time=0.23 sec ARIMA(1,0,0)(0,1,2)[12] intercept : AIC=inf, Time=0.63 sec ARIMA(1,0,0)(1,1,0)[12] intercept : AIC=1635.999, Time=0.16 sec ARIMA(1,0,0)(1,1,1)[12] intercept : AIC=inf, Time=0.18 sec ARIMA(1,0,0)(1,1,2)[12] intercept : AIC=inf, Time=0.95 sec ARIMA(1,0,0)(2,1,0)[12] intercept : AIC=1629.252, Time=0.37 sec ARIMA(1,0,0)(2,1,1)[12] intercept : AIC=inf, Time=0.40 sec ARIMA(1,0,0)(2,1,2)[12] intercept : AIC=inf, Time=1.18 sec ARIMA(1,0,1)(0,1,0)[12] intercept : AIC=1663.147, Time=0.08 sec ARIMA(1,0,1)(0,1,1)[12] intercept : AIC=1605.849, Time=0.40 sec ARIMA(1,0,1)(0,1,2)[12] intercept : AIC=inf, Time=0.38 sec ARIMA(1,0,1)(1,1,0)[12] intercept : AIC=1636.543, Time=0.27 sec ARIMA(1,0,1)(1,1,1)[12] intercept : AIC=inf, Time=0.36 sec ARIMA(1,0,1)(1,1,2)[12] intercept : AIC=inf, Time=1.22 sec ARIMA(1,0,1)(2,1,0)[12] intercept : AIC=1630.502, Time=0.58 sec ARIMA(1,0,1)(2,1,1)[12] intercept : AIC=inf, Time=1.20 sec ARIMA(1,0,2)(0,1,0)[12] intercept : AIC=1665.180, Time=0.08 sec ARIMA(1,0,2)(0,1,1)[12] intercept : AIC=1606.862, Time=0.47 sec ARIMA(1,0,2)(0,1,2)[12] intercept : AIC=inf, Time=1.20 sec ARIMA(1,0,2)(1,1,0)[12] intercept : AIC=1637.802, Time=0.40 sec ARIMA(1,0,2)(1,1,1)[12] intercept : AIC=inf, Time=0.57 sec ARIMA(1,0,2)(2,1,0)[12] intercept : AIC=1632.094, Time=0.69 sec ARIMA(1,0,3)(0,1,0)[12] intercept : AIC=1663.793, Time=0.23 sec ARIMA(1,0,3)(0,1,1)[12] intercept : AIC=inf, Time=0.73 sec ARIMA(1,0,3)(1,1,0)[12] intercept : AIC=inf, Time=0.67 sec ARIMA(1,0,4)(0,1,0)[12] intercept : AIC=1668.308, Time=0.28 sec ARIMA(2,0,0)(0,1,0)[12] intercept : AIC=1663.152, Time=0.06 sec ARIMA(2,0,0)(0,1,1)[12] intercept : 
AIC=1604.940, Time=0.34 sec ARIMA(2,0,0)(0,1,2)[12] intercept : AIC=inf, Time=0.35 sec ARIMA(2,0,0)(1,1,0)[12] intercept : AIC=1636.341, Time=0.23 sec ARIMA(2,0,0)(1,1,1)[12] intercept : AIC=inf, Time=0.24 sec ARIMA(2,0,0)(1,1,2)[12] intercept : AIC=inf, Time=1.15 sec ARIMA(2,0,0)(2,1,0)[12] intercept : AIC=1630.378, Time=0.53 sec ARIMA(2,0,0)(2,1,1)[12] intercept : AIC=inf, Time=0.54 sec ARIMA(2,0,1)(0,1,0)[12] intercept : AIC=1665.145, Time=0.17 sec ARIMA(2,0,1)(0,1,1)[12] intercept : AIC=1606.831, Time=0.60 sec ARIMA(2,0,1)(0,1,2)[12] intercept : AIC=inf, Time=1.00 sec ARIMA(2,0,1)(1,1,0)[12] intercept : AIC=1637.650, Time=0.39 sec ARIMA(2,0,1)(1,1,1)[12] intercept : AIC=inf, Time=0.49 sec ARIMA(2,0,1)(2,1,0)[12] intercept : AIC=1632.038, Time=0.84 sec ARIMA(2,0,2)(0,1,0)[12] intercept : AIC=inf, Time=0.35 sec ARIMA(2,0,2)(0,1,1)[12] intercept : AIC=inf, Time=0.83 sec ARIMA(2,0,2)(1,1,0)[12] intercept : AIC=1634.264, Time=0.68 sec ARIMA(2,0,3)(0,1,0)[12] intercept : AIC=1669.128, Time=0.18 sec ARIMA(3,0,0)(0,1,0)[12] intercept : AIC=1665.134, Time=0.07 sec ARIMA(3,0,0)(0,1,1)[12] intercept : AIC=1606.847, Time=0.46 sec ARIMA(3,0,0)(0,1,2)[12] intercept : AIC=inf, Time=0.41 sec ARIMA(3,0,0)(1,1,0)[12] intercept : AIC=1637.441, Time=0.30 sec ARIMA(3,0,0)(1,1,1)[12] intercept : AIC=inf, Time=0.30 sec ARIMA(3,0,0)(2,1,0)[12] intercept : AIC=1631.999, Time=0.62 sec ARIMA(3,0,1)(0,1,0)[12] intercept : AIC=1667.134, Time=0.09 sec ARIMA(3,0,1)(0,1,1)[12] intercept : AIC=1608.822, Time=0.96 sec ARIMA(3,0,1)(1,1,0)[12] intercept : AIC=1639.388, Time=0.50 sec ARIMA(3,0,2)(0,1,0)[12] intercept : AIC=inf, Time=0.41 sec ARIMA(4,0,0)(0,1,0)[12] intercept : AIC=1667.112, Time=0.08 sec ARIMA(4,0,0)(0,1,1)[12] intercept : AIC=1608.715, Time=0.59 sec ARIMA(4,0,0)(1,1,0)[12] intercept : AIC=1639.341, Time=0.37 sec ARIMA(4,0,1)(0,1,0)[12] intercept : AIC=1668.911, Time=0.23 sec ARIMA(5,0,0)(0,1,0)[12] intercept : AIC=1669.005, Time=0.11 sec Best model: ARIMA(0,0,0)(0,1,1)[12] 
intercept Total fit time: 44.314 seconds SARIMAX Results ================================================================================== Dep. Variable: y No. Observations: 192 Model: SARIMAX(0, 1, [1], 12) Log Likelihood -797.957 Date: Sun, 14 Nov 2021 AIC 1601.913 Time: 01:24:40 BIC 1611.492 Sample: 0 HQIC 1605.797 - 192 Covariance Type: opg ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ intercept 0.4770 0.398 1.198 0.231 -0.303 1.257 ma.S.L12 -0.8625 0.046 -18.866 0.000 -0.952 -0.773 sigma2 379.2626 29.167 13.003 0.000 322.096 436.430 =================================================================================== Ljung-Box (L1) (Q): 0.09 Jarque-Bera (JB): 49.15 Prob(Q): 0.76 Prob(JB): 0.00 Heteroskedasticity (H): 1.42 Skew: 0.55 Prob(H) (two-sided): 0.17 Kurtosis: 5.32 =================================================================================== Warnings: [1] Covariance matrix calculated using the outer product of gradients (complex-step).6.2 模型识别
# Residual diagnostics (standardized residuals, histogram, Q-Q plot, correlogram).
built_arimamodel.plot_diagnostics()
# NOTE(review): this opens a new empty figure AFTER the diagnostics are drawn,
# so the figsize does not apply to them — likely unintended.
plt.figure(figsize=(20,20))
plt.show()
6.3 需求预测
# Rolling one-step-ahead forecast over the test set, updating the model with
# each newly observed value before forecasting the next step.
pred_list = []
# Series.iteritems() was removed in pandas 2.0; Series.items() is the
# long-standing equivalent and works on old versions too.
for index, row in data_test.items():
    # print(index, row)  # index = timestamp, row = observed value
    pred_list += [built_arimamodel.predict(n_periods=1)]
    # Feed the actual observation back so the next forecast conditions on it.
    built_arimamodel.update(row)
# One-step forecast beyond the end of the observed series.
predict_f1 = built_arimamodel.predict(n_periods=1)
print('未来一期的预测需求为:', predict_f1[0])
未来一期的预测需求为: 10.201966098632884 未来一期的预测需求为: 14.99989920567672 未来一期的预测需求为: 42.26895773858568 未来一期的预测需求为: 40.21468047811304 未来一期的预测需求为: 68.85271218590057 未来一期的预测需求为: 139.7592549982466 未来一期的预测需求为: 104.44564264905952 未来一期的预测需求为: 62.246242349666126 未来一期的预测需求为: 54.145840142964836 未来一期的预测需求为: 13.671028354685868 未来一期的预测需求为: 8.06936900981979 未来一期的预测需求为: 9.22272411800279 未来一期的预测需求为: 10.689786166366131 未来一期的预测需求为: 15.212157485076254 未来一期的预测需求为: 43.54056246836933 未来一期的预测需求为: 41.74871042543931 未来一期的预测需求为: 72.11023415842766 未来一期的预测需求为: 150.67364455852288 未来一期的预测需求为: 103.52649625565765 未来一期的预测需求为: 59.48886309148327 未来一期的预测需求为: 57.120699610158184 未来一期的预测需求为: 14.02798554515653 未来一期的预测需求为: 8.996770333795709 未来一期的预测需求为: 9.168063152136359 未来一期的预测需求为: 11.219590648904596 未来一期的预测需求为: 15.986039401233102 未来一期的预测需求为: 42.796451111774665 未来一期的预测需求为: 41.090246552574605 未来一期的预测需求为: 72.66350772032835 未来一期的预测需求为: 150.2526828410559 未来一期的预测需求为: 108.10944211457121 未来一期的预测需求为: 55.01301452159474 未来一期的预测需求为: 66.51392120325039 未来一期的预测需求为: 12.861931991845184 未来一期的预测需求为: 8.077133764924923 未来一期的预测需求为: 9.35135519755752 未来一期的预测需求为: 10.159858286271415 未来一期的预测需求为: 15.877413383751623 未来一期的预测需求为: 41.047721899734015 未来一期的预测需求为: 42.382601802288434 未来一期的预测需求为: 70.04074197968913 未来一期的预测需求为: 150.7447296776018 未来一期的预测需求为: 106.62147469854312 未来一期的预测需求为: 56.950800263708764 未来一期的预测需求为: 57.245633831422175 未来一期的预测需求为: 12.877487322408932 未来一期的预测需求为: 7.712518745698711 未来一期的预测需求为: 8.5378493776609836.4 结果评估
对预测结果进行评价,指标解释看这。https://blog.csdn.net/FrankieHello/article/details/82024526/
# ARIMA model evaluation helper.
def forecast_accuracy(forecast, actual):
    """Return a dict of forecast-error metrics for two equal-length 1-D arrays.

    Keys: mape, me, mae, mpe, rmse, corr, minmax (all scalar floats).
    Division-based metrics (mape/mpe) assume *actual* contains no zeros.
    """
    err = forecast - actual
    abs_err = np.abs(err)
    # Element-wise min/max of the paired values, used by the min-max metric.
    paired = np.hstack([forecast[:, None], actual[:, None]])
    low = np.amin(paired, axis=1)
    high = np.amax(paired, axis=1)
    return {
        'mape': np.mean(abs_err / np.abs(actual)),    # mean absolute percentage error
        'me': np.mean(err),                           # mean error (bias)
        'mae': np.mean(abs_err),                      # mean absolute error
        'mpe': np.mean(err / actual),                 # mean percentage error
        'rmse': np.sqrt(sum(err ** 2) / actual.size), # root mean squared error
        'corr': np.corrcoef(forecast, actual)[0, 1],  # Pearson correlation
        'minmax': 1 - np.mean(low / high),            # min-max error
    }
# Compare the rolling predictions against the held-out test data and plot both.
test_predict = data_test.copy()
for x in range(len(test_predict)):
    test_predict.iloc[x] = pred_list[x]
# Evaluate the forecasts.
eval_result = forecast_accuracy(test_predict.values, data_test.values)
# The source extraction dropped the backslash: restore '\n' so the dict
# prints on its own line instead of after a stray 'n'.
print('模型评价结果\n', eval_result)
# Plot actual vs predicted.
plt.figure(figsize=(12,8))
plt.grid()
plt.plot(df_taiyuan['降水量/mm'], 'p-', lw=1, label='True Data', marker='s')
plt.plot(test_predict, 'r-', lw=1, label='Prediction', marker='o')
plt.title('RMSE:{}'.format(eval_result['rmse']))
plt.legend(loc='best')
plt.grid()  # draw grid lines
plt.show()
模型评价结果 {'mape': 0.711626695820776, 'me': 2.3749678132592074, 'mae': 16.21231564623559, 'mpe': 0.5716067098565859, 'rmse': 26.39232189767079, 'corr': 0.8294116106810141, 'minmax': 0.3088124622843993}
红色的点为预测结果,蓝色的点为原来的数据。
RMSE约为26.4。均方根误差(Root Mean Square Error,RMSE)是预测值与真实值偏差的平方与观测次数n比值的平方根指标,衡量观测值与真实值之间的偏差。一般是越小越好,但是实际上需要进行结果对比才能判断。
ARIMA模型对数据的要求还是挺高的。其中,数据量太少不行,当只有一年左右的跨度,很难发现一些周期规律。另外就是对平稳性的要求。
import statsmodels.api as sm
# Fit the selected seasonal model SARIMA(0,0,0)(0,1,1)[12] on the full series.
model=sm.tsa.statespace.SARIMAX(df_taiyuan['降水量/mm'],order=(0, 0, 0),seasonal_order=(0,1,1,12))
results=model.fit()
df_taiyuan  # display: single precipitation column indexed by Date
240 rows × 1 columns
# Dynamic in-sample prediction over the last 48 months (rows 192-240),
# stored as a new column and plotted against the actual values.
df_taiyuan['forecast']=results.predict(start=192,end=240,dynamic=True)
df_taiyuan[['降水量/mm','forecast']].plot(figsize=(12,8))
#设置最大显示列数 pd.set_option('display.max_columns', 20) #设置最大显示行数 pd.set_option('display.max_rows', 240) df_taiyuan['forecast'].describe()
count 48.000000 mean 44.183393 std 37.086907 min 5.115774 25% 13.094405 50% 29.949404 75% 73.386118 max 114.663257 Name: forecast, dtype: float64
df_taiyuan  # display: now includes both the actual and the forecast column
# Build 12 future monthly timestamps starting one month after the last index entry
# (element 0 duplicates the last observed month and is skipped below).
from pandas.tseries.offsets import DateOffset
future_dates=[df_taiyuan.index[-1]+ DateOffset(months=x)for x in range(0,13)]
# Empty placeholder frame indexed by the 12 future months.
# `pd.Dataframe` is a typo that raises AttributeError — the constructor is
# `pd.DataFrame` (capital F).
future_datest_df = pd.DataFrame(index=future_dates[1:], columns=df_taiyuan.columns)
future_datest_df  # display the (all-NaN) future-index frame
# Append the empty future rows to the observed data so predictions can be stored there.
future_df=pd.concat([df_taiyuan,future_datest_df])
# Out-of-sample forecast for the 12 appended future months, plotted with the history.
future_df['forecast'] = results.predict(start = 240, end = 252, dynamic= True)
future_df[['降水量/mm', 'forecast']].plot(figsize=(12, 8))
#设置最大显示行数 pd.set_option('display.max_rows', 252) future_df
数据集:
链接:https://pan.baidu.com/s/1g324O57bu6GMse0HHuOyxg
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)