一文搞懂Python时间序列预测（步骤，模板，python代码）

本文介绍: 预测包括，数值拟合，线性回归，多元回归，时间序列，神经网络等等对于单变量的时间序列预测：模型有AR,MA,ARMA,ARIMA，综合来说用ARIMA即可表示全部。

对于单变量的时间序列预测：模型有AR,MA,ARMA,ARIMA，综合来说用ARIMA即可表示全部。

以预测美国未来10年GDP的变换情况为列：

import pandas as pd,numpy as np
from matplotlib import pyplot as plt

#加载数据，sheet_name指定excel表的数据页面，header指定指标column属性，loc去除杂数据，可选：parse_dates=['']，index_col='',use_cols=['']
df=pd.read_excel('./data/time1.xls',sheet_name='数据',header=1).loc[1:,:]

#DateFrame索引重置
df=df.set_index('DATE')#df.set_index('DATE',inplace=True)

#查看前5行
print(df.head(5))

#查看列索引
print(df.columns)#print(df.keys())

#查看表的维度
print(df.shape)

#查看行索引
print(df.index)

#np.array()  array1.reshape(,)  df.values.astype(int).tolist() np.vstack((a1,a2))  np.hstack((a1,a2))  round()  iloc

#时间索引拆分
# dates=pd.date_range(start='1991-01-01',end='2007-08-01',freq='MS')#日期取值和格式转换，MS代表每月第一天
# years=[d.strftime('%Y-%m') for d in dates][0:200:25]
# years.append('2007-09')

plt.rcParams['axes.unicode_minus'] = False  # 解决保存图像是负号'-'显示为方块的问题
plt.rc('font',family='SimHei')
plt.style.use('ggplot')

df.plot(secondary_y=['CS','INV','P_GDP','GOV_NET'])#单个指标时序图 df['CS'].plot()
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Time Series Plot')
#plt.grid()
plt.show()

from statsmodels.tsa.stattools import adfuller# ADF检验

for i in df.columns:
    data=df[i]
    print(f'{i}的单位根检验')
    result = adfuller(data)#默认情况下，regression参数为'c'，表示使用包含截距项的回归模型。
    print('ADF Statistic: %f' % result[0])#ADF统计量
    print('p-value: %f' % result[1])#p值
    print('Critical Values:')#在置信水平下的临界值
    for key, value in result[4].items():
        print('t%s: %.3f' % (key, value))
    print()

#差分时序图
diff_data = df.diff(periods=1).dropna()# 创建一阶差分的时间序列，加上dropna()后续不需要执行[1:]
#print(diff_data)

diff_data.plot()
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Time Series Plot')
#plt.grid()
plt.show()

for i in diff_data.columns:
    data=diff_data[i]#Series索引选取，一阶差分第一个数据为NA
    
    print(f'{i}的单位根检验')
    result = adfuller(data)#结果对应位置的数据需要自己判断是什么含义
    print('ADF Statistic: %f' % result[0])#ADF统计量
    print('p-value: %f' % result[1])#p值
    print('Critical Values:')#在置信水平下的临界值
    for key, value in result[4].items():
        print('t%s: %.3f' % (key, value))
    print()

from statsmodels.stats.diagnostic import acorr_ljungbox

#这里为一阶差分后的平稳序列进行白噪声检验，lags为1，否则lags为0，这里拿上述的GDP指标进行
lags = [1,4,8,16,32]

print('差分序列的白噪声检验结果为：'+'n',acorr_ljungbox(diff_data['GDP'], lags))  # 返回统计量和p值,这里的lags对应差分阶数

print('原始数据序列的白噪声检验结果为：'+'n',acorr_ljungbox(df['GDP'], lags))

#生成自相关图，偏自相关图
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf

fig, ax = plt.subplots(figsize=(5, 4))
plot_acf(diff_data['GDP'],ax=ax)#可以换成df的数据，这里用一阶差分数据得到平稳序列
    
ax.set_title("Autocorrelation Plot")
lags=list(range(24))
ax.set_xticks(lags)#这里把x坐标刻度变更精细，加网格图更方便,xticklabels替换标签
ax.set_xlabel("Lag")
ax.set_ylabel("Autocorrelation")
ax.grid(alpha=0.5)
plt.legend(['GDP'])#这里区别直接放入df.columns[i]，如果是多字符如‘CS’，这样会被认为是一个序列，拆成C和S的图例
plt.show()
    
fig, ax = plt.subplots(figsize=(5, 4))
lags=list(range(24))
ax.set_xticks(lags)#这里把x坐标刻度变更精细，加网格图更方便,xticklabels替换标签
plot_pacf(diff_data['GDP'], ax=ax,method='ywm')#ywm替换默认的yw，去除警告
ax.set_xlabel('Lags')
ax.set_ylabel('Partial Autocorrelation')
ax.grid(alpha=0.5)
plt.legend(['GDP'])
plt.show()

import warnings
warnings.filterwarnings("ignore")
import statsmodels.api as sm


#diff_data['GDP'].values.astype(float),这里发现Serise的dtype为object，模型用的应该为float或者int类型，需要注意原数据的数据类型是否一致

Min=float('inf')
for i in range(0,6):#AIC，BIC最小找到p，q阶数来定阶，从0开始定阶是否可行？？
    for j in range(0,6):
        result=sm.tsa.ARIMA(df['GDP'].values.astype(float),order=(i,1,j)).fit()
        print([i,j,result.aic,result.bic])
        if result.bic<Min:
            Min=result.bic
            best_pq=[i,j,result.aic,result.bic]
print(f'最优定阶为{best_pq}')

Result=sm.tsa.ARIMA(df['GDP'].values.astype(float),order=(best_pq[0],1,best_pq[1])).fit()
print(Result.summary())  #显示模型的所有信息

print(len(Result.resid))
#print(Result.resid)这里观察到残差的第一项为原数据的1239.5，即差分数据不管第一项，这里需要调整残差的观测

#这里就可以观察到原始模型的结果LB统计量和这里的白噪声检验是一致的，p>0.05,即认为残差为白噪声序列，原序列信息提取充分。
lags = [1,4,8,16,32]
print('差分序列的白噪声检验结果为：'+'n',acorr_ljungbox(Result.resid[1:], lags))

## 查看模型的拟合残差分布
fig = plt.figure(figsize=(12,5))
ax = fig.add_subplot(1,2,1)
plt.plot(Result.resid[1:])
plt.title("ARIMA(2,1,1)残差曲线")

## 检查残差是否符合正太分布
ax = fig.add_subplot(1,2,2)
sm.qqplot(Result.resid[1:], line='q', ax=ax)
plt.title("ARIMA(2,1,0)残差Q-Q图")
plt.tight_layout()
plt.show()

fig = plt.figure(figsize=(12,5))
Residual=pd.DataFrame(Result.resid[1:])
Residual.plot(kind='kde', title='密度')
plt.legend('')
plt.show()

#预测，绘制原序列和预测序列值对比图

Predict=Result.predict(start=1, end=len(df['GDP'])-1+1+10); #不加参数默认0到n-1，要加预测个数在end后面N-1+预测n即可
#如果是一阶差分的序列预测，第一个数据已经差分消去了，应该start从第二个观测数据开始，即n=1；如果是0阶，则不需要按默认0到n-1

print(list(zip(range(193,203),Predict[-10:])))#打印预测值

plt.figure()
plt.plot(range(193),df['GDP'].values)#'o-k'
plt.plot(range(193+10),Predict)#'P--'
plt.legend(('原始观测值','预测值'))
plt.xticks(list(range(0,203,10)),rotation=90)
plt.show()

plt.figure()
plt.plot(range(193),df['GDP'].values)#'o-k'
plt.plot(range(192,193+10),Predict[-11:])#'P--'#接着原数据最后一个，进行拟合预测表示
plt.legend(('原始观测值','预测值'))
plt.xticks(list(range(0,203,10)),rotation=90)
plt.show()

import pmdarima as pm
# ## 自动搜索合适的参数
model = pm.auto_arima(df['GDP'].values,
                      start_p=1, start_q=1, # p,q的开始值
                      max_p=12, max_q=12, # 最大的p和q
                      d = 0,            # 寻找ARMA模型参数
                      m=1,              # 序列的周期
                      seasonal=False,   # 没有季节性趋势
                      trace=True,error_action='ignore',  
                      suppress_warnings=True, stepwise=True)
 
print(model.summary())