
I would like to model an RNN with LSTM cells in order to predict multiple output time series based on multiple input time series. To be specific, I have 4 output time series, y1[t], y2[t], y3[t], y4[t], each of which has a length of 3000 (t = 0, ..., 2999). I also have 3 input time series, x1[t], x2[t], x3[t], each of length 3000 (t = 0, ..., 2999). The goal is to predict y1[t], ..., y4[t] using all of the input time series up to the current time point, i.e.:

y1[t] = f1(x1[k], x2[k], x3[k], k = 0, ..., t) 
y2[t] = f2(x1[k], x2[k], x3[k], k = 0, ..., t) 
y3[t] = f3(x1[k], x2[k], x3[k], k = 0, ..., t) 
y4[t] = f4(x1[k], x2[k], x3[k], k = 0, ..., t) 

For the model to have a long-term memory, I created a stateful RNN model, following keras-stateful-lstm. The main differences between my case and keras-stateful-lstm are that I have:

  • more than 1 output time series
  • more than 1 input time series
  • the goal is the prediction of continuous time series

My code is running. However, even with simple data, the model's prediction results are not good, so I would like to ask whether there is something wrong with my approach.

Here is my code with a toy example.

In the toy example, the input time series are simple cosine and sine waves:

import numpy as np 
def random_sample(len_timeseries=3000): 
    Nchoice = 600 
    x1 = np.cos(np.arange(0,len_timeseries)/float(1.0 + np.random.choice(Nchoice))) 
    x2 = np.cos(np.arange(0,len_timeseries)/float(1.0 + np.random.choice(Nchoice))) 
    x3 = np.sin(np.arange(0,len_timeseries)/float(1.0 + np.random.choice(Nchoice))) 
    x4 = np.sin(np.arange(0,len_timeseries)/float(1.0 + np.random.choice(Nchoice))) 
    y1 = np.random.random(len_timeseries) 
    y2 = np.random.random(len_timeseries) 
    y3 = np.random.random(len_timeseries) 
    for t in range(3,len_timeseries): 
     ## the output time series depend on input as follows: 
     y1[t] = x1[t-2] 
     y2[t] = x2[t-1]*x3[t-2] 
     y3[t] = x4[t-3] 
    y = np.array([y1,y2,y3]).T 
    X = np.array([x1,x2,x3,x4]).T 
    return y, X 
def generate_data(Nsequence = 1000): 
    X_train = [] 
    y_train = [] 
    for isequence in range(Nsequence): 
     y, X = random_sample() 
     X_train.append(X) 
     y_train.append(y) 
    return np.array(X_train),np.array(y_train) 

Note that y1 at time point t is simply the value of x1 at t - 2. Also note that y3 at time point t is simply the value of x4 three steps before.
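
As a quick sanity check (a small sketch, not part of the original code), this lag structure can be verified on one generated sample:

y, X = random_sample() 
print(np.allclose(y[3:, 0], X[1:-2, 0])) # y1[t] == x1[t-2] for t >= 3 -> True 
print(np.allclose(y[3:, 2], X[:-3, 3]))  # y3[t] == x4[t-3] for t >= 3 -> True 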

Using these functions, I generated 100 sets of the time series y1, y2, y3, x1, x2, x3, x4. Half of them go to the training data and the other half go to the test data.

Nsequence = 100 
prop = 0.5 
Ntrain = int(Nsequence*prop) 
X, y = generate_data(Nsequence) 
X_train = X[:Ntrain,:,:] 
X_test = X[Ntrain:,:,:] 
y_train = y[:Ntrain,:,:] 
y_test = y[Ntrain:,:,:] 

X and y are both 3-dimensional, and each contains:

#X.shape = (N sequence, length of time series, N input features) 
#y.shape = (N sequence, length of time series, N targets) 
print(X.shape, y.shape) 
> (100, 3000, 4) (100, 3000, 3) 

Examples of the time series y1, ..., y3 and x1, ..., x4 are shown below:

[figures: example input and output time series]

I standardized these data:

def standardize(X_train,stat=None): 
    ## X_train is 3 dimensional, e.g. (Nsample, len_timeseries, Nfeature) 
    ## standardization is done with respect to the 3rd dimension 
    if stat is None: 
     featmean = np.array([np.nanmean(X_train[:,:,itrain]) for itrain in range(X_train.shape[2])]).reshape(1,1,X_train.shape[2]) 
     featstd = np.array([np.nanstd(X_train[:,:,itrain]) for itrain in range(X_train.shape[2])]).reshape(1,1,X_train.shape[2]) 
     stat = {"featmean":featmean,"featstd":featstd} 
    else: 
     featmean = stat["featmean"] 
     featstd = stat["featstd"] 
    X_train_s = (X_train - featmean)/featstd 
    return X_train_s, stat 

X_train_s, X_stat = standardize(X_train,stat=None) 
X_test_s, _ = standardize(X_test,stat=X_stat) 
y_train_s, y_stat = standardize(y_train,stat=None) 
y_test_s, _ = standardize(y_test,stat=y_stat) 
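
As a quick check (a small addition, not in the original post), the standardized training data should now have roughly zero mean and unit standard deviation per feature:

print(X_train_s.mean(axis=(0, 1))) # close to [0, 0, 0, 0] 
print(X_train_s.std(axis=(0, 1)))  # close to [1, 1, 1, 1] 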

I create a stateful model with 10 hidden LSTM neurons:

from keras.models import Sequential 
from keras.layers.core import Dense, Activation, Dropout 
from keras.layers.recurrent import LSTM 
def create_stateful_model(hidden_neurons): 
    # create and fit the LSTM network 

    model = Sequential() 
    model.add(LSTM(hidden_neurons, 
        batch_input_shape=(1, 1, X_train.shape[2]), 
        return_sequences=False, 
        stateful=True)) 
    model.add(Dropout(0.5)) 
    model.add(Dense(y_train.shape[2])) 
    model.add(Activation("linear")) 
    model.compile(loss='mean_squared_error', optimizer="rmsprop",metrics=['mean_squared_error']) 
    return model 
model = create_stateful_model(10) 

See the code below, which is used to train and validate the stateful RNN model:

def get_R2(y_pred,y_test): 
     ## y_pred, y_test: (Nsample, len_timeseries, Noutput) 
     ## the R2 (coefficient of determination) is computed for each output 
     overall_mean = np.nanmean(y_test) 
     SSres = np.nanmean((y_pred - y_test)**2 ,axis=0).mean(axis=0) 
     SStot = np.nanmean((y_test - overall_mean)**2 ,axis=0).mean(axis=0) 
     R2 = 1 - SSres/SStot 
     print "<R2 testing> target 1:",R2[0],"target 2:",R2[1],"target 3:",R2[2] 
     return R2 


def reshape_batch_input(X_t,y_t=None): 
    X_t = np.array(X_t).reshape(1,1,len(X_t)) ## (1,1,4) dimension 
    if y_t is not None: 
     y_t = np.array([y_t]) ## (1,3) 
    return X_t,y_t 
def fit_stateful(model,X_train,y_train,X_test,y_test,nb_epoch=8): 
    ''' 
    reference: http://philipperemy.github.io/keras-stateful-lstm/ 

    X_train: (N_time_series, len_time_series, N_features) = (10,000, 3,600 (max), 2), 
    y_train: (N_time_series, len_time_series, N_output) = (10,000, 3,600 (max), 4) 

    ''' 
    max_len = X_train.shape[1] 

    print "X_train.shape(Nsequence =",X_train.shape[0],"len_timeseries =",X_train.shape[1],"Nfeats =",X_train.shape[2],")" 
    print "y_train.shape(Nsequence =",y_train.shape[0],"len_timeseries =",y_train.shape[1],"Ntargets =",y_train.shape[2],")" 
    print('Train...') 
    for epoch in range(nb_epoch): 
     print('___________________________________') 
     print "epoch", epoch+1, "out of ",nb_epoch 
     ## ---------- ## 
     ## training ## 
     ## ---------- ## 
     mean_tr_acc = [] 
     mean_tr_loss = [] 
     for s in range(X_train.shape[0]): 
      for t in range(max_len): 
       X_st = X_train[s][t] 
       y_st = y_train[s][t] 
       if np.any(np.isnan(y_st)): 
        break 
       X_st,y_st = reshape_batch_input(X_st,y_st) 
       tr_loss, tr_acc = model.train_on_batch(X_st,y_st) 
       mean_tr_acc.append(tr_acc) 
       mean_tr_loss.append(tr_loss) 
      model.reset_states() 

     ##print('accuracy training = {}'.format(np.mean(mean_tr_acc))) 
     print('<loss (mse) training> {}'.format(np.mean(mean_tr_loss))) 
     ## ---------- ## 
     ## testing ## 
     ## ---------- ## 
     y_pred = predict_stateful(model,X_test) 
     eva = get_R2(y_pred,y_test) 
    return model, eva, y_pred 

def predict_stateful(model,X_test): 
    y_pred = [] 
    max_len = X_test.shape[1] 
    for s in range(X_test.shape[0]): 
     y_s_pred = [] 
     for t in range(max_len): 
      X_st = X_test[s][t] 
      if np.any(np.isnan(X_st)): 
       ## the rest of y is NA 
       y_s_pred.extend([np.NaN]*(max_len-len(y_s_pred))) 
       break 
      X_st,_ = reshape_batch_input(X_st) 
      y_st_pred = model.predict_on_batch(X_st) 
      y_s_pred.append(y_st_pred[0].tolist()) 

     y_pred.append(y_s_pred) 
     model.reset_states() 

    y_pred = np.array(y_pred) 
    return y_pred 




model, train_metric, y_pred = fit_stateful(model, 
         X_train_s, y_train_s, 
         X_test_s, y_test_s, nb_epoch=15) 

The output is the following:

X_train.shape(Nsequence = 15 len_timeseries = 3000 Nfeats = 4) 
y_train.shape(Nsequence = 15 len_timeseries = 3000 Ntargets = 3) 
Train... 
___________________________________ 
epoch 1 out of 15 
<loss (mse) training> 0.414115458727 
<R2 testing> target 1: 0.664464304688 target 2: -0.574523052322 target 3: 0.526447813052 
___________________________________ 
epoch 2 out of 15 
<loss (mse) training> 0.394549429417 
<R2 testing> target 1: 0.361516087033 target 2: -0.724583671831 target 3: 0.795566178787 
___________________________________ 
epoch 3 out of 15 
<loss (mse) training> 0.403199136257 
<R2 testing> target 1: 0.09610702779 target 2: -0.468219774909 target 3: 0.69419269042 
___________________________________ 
epoch 4 out of 15 
<loss (mse) training> 0.406423777342 
<R2 testing> target 1: 0.469149270848 target 2: -0.725592048946 target 3: 0.732963522766 
___________________________________ 
epoch 5 out of 15 
<loss (mse) training> 0.408153116703 
<R2 testing> target 1: 0.400821776652 target 2: -0.329415365214 target 3: 0.2578432553 
___________________________________ 
epoch 6 out of 15 
<loss (mse) training> 0.421062678099 
<R2 testing> target 1: -0.100464591586 target 2: -0.232403824523 target 3: 0.570606489959 
___________________________________ 
epoch 7 out of 15 
<loss (mse) training> 0.417774856091 
<R2 testing> target 1: 0.320094445321 target 2: -0.606375769083 target 3: 0.349876223119 
___________________________________ 
epoch 8 out of 15 
<loss (mse) training> 0.427440851927 
<R2 testing> target 1: 0.489543715713 target 2: -0.445328806611 target 3: 0.236463139804 
___________________________________ 
epoch 9 out of 15 
<loss (mse) training> 0.422931671143 
<R2 testing> target 1: -0.31006468223 target 2: -0.322621276474 target 3: 0.122573123871 
___________________________________ 
epoch 10 out of 15 
<loss (mse) training> 0.43609803915 
<R2 testing> target 1: 0.459111316554 target 2: -0.313382405804 target 3: 0.636854743292 
___________________________________ 
epoch 11 out of 15 
<loss (mse) training> 0.433844655752 
<R2 testing> target 1: -0.0161015052703 target 2: -0.237462995323 target 3: 0.271788109459 
___________________________________ 
epoch 12 out of 15 
<loss (mse) training> 0.437297314405 
<R2 testing> target 1: -0.493665758658 target 2: -0.234236263092 target 3: 0.047264439493 
___________________________________ 
epoch 13 out of 15 
<loss (mse) training> 0.470605045557 
<R2 testing> target 1: 0.144443089961 target 2: -0.874982 target 3: -0.00432615142135 
___________________________________ 
epoch 14 out of 15 
<loss (mse) training> 0.444566756487 
<R2 testing> target 1: -0.053982119103 target 2: -0.0676577449316 target 3: -0.12678037186 
___________________________________ 
epoch 15 out of 15 
<loss (mse) training> 0.482106208801 
<R2 testing> target 1: 0.208482181828 target 2: -0.402982670798 target 3: 0.366757778713 

As you can see, the training loss does not decrease! Since target time series 1 and 3 have very simple relationships with the input time series (y1[t] = x1[t-2], y3[t] = x4[t-3]), I would expect perfect prediction performance. However, the test R2 at each epoch shows that this is not the case; the R2 at the final epoch is only about 0.2 and 0.36. Clearly, the algorithm is not converging. I am very puzzled by this result. Please let me know what I am missing and why the algorithm does not converge.


Usually when this kind of thing happens, there is a problem with the hyperparameters. Have you considered doing some hyperparameter optimization through the 'hyperopt' package, or the 'hyperas' wrapper? – StatsSorceress
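
For reference, a minimal hyperopt sketch along those lines (purely illustrative, not from the original thread; it reuses the create_stateful_model and fit_stateful functions defined above and only tunes the number of hidden units) might look like this:

from hyperopt import fmin, tpe, hp, Trials 

def objective(params): 
    m = create_stateful_model(int(params['hidden_neurons'])) 
    # short run; fit_stateful returns (model, R2 per target, y_pred) 
    _, r2, _ = fit_stateful(m, X_train_s, y_train_s, X_test_s, y_test_s, nb_epoch=2) 
    return -np.nanmean(r2) # hyperopt minimizes, so negate the mean test R2 

space = {'hidden_neurons': hp.quniform('hidden_neurons', 5, 50, 5)} 
best = fmin(objective, space, algo=tpe.suggest, max_evals=10, trials=Trials()) 
print(best) 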

Answer


Preliminary note. If the time series were short (say, T = 30), a stateful LSTM would not be needed and a classic LSTM would work well. In the OP's question, the time series have length T = 3000, so learning with a classic LSTM can be very slow. Learning is improved by cutting the time series into pieces and using a stateful LSTM.

Stateful mode with N = batch_size. Stateful models are tricky with Keras, because you need to be careful about how you cut the time series and select the batch size. In the OP's question, the sample size is N = 100. Since training the model with batches of one hundred is acceptable (it is not a large number), we will select batch_size = 100.

Selecting batch_size = N simplifies training, because you do not need to reset states within an epoch (so there is no need to write a callback on_batch_begin).
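
For completeness: if you did need batch_size < N, a state-resetting callback would be required. A minimal sketch (an illustration only, assuming the cutting scheme described below, where nb_cuts consecutive batches continue the same group of series) could look like this; it is not needed in the rest of this answer.

from keras.callbacks import Callback 

class ResetStatesCallback(Callback): 
    def __init__(self, nb_cuts): 
        self.counter = 0 
        self.nb_cuts = nb_cuts # number of consecutive chunks cut from the same group of series 

    def on_batch_begin(self, batch, logs=None): 
        # reset the LSTM states each time a new group of series starts 
        if self.counter % self.nb_cuts == 0: 
            self.model.reset_states() 
        self.counter += 1 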

The remaining question is how to cut the time series. Cutting is a bit tricky, so I wrote a wrapper function that works in all cases.

def stateful_cut(arr, batch_size, T_after_cut): 
    if len(arr.shape) != 3: 
     # N: Independent sample size, 
     # T: Time length, 
     # m: Dimension 
     print("ERROR: please format arr as a (N, T, m) array.") 

    N = arr.shape[0] 
    T = arr.shape[1] 

    # We need T_after_cut * nb_cuts = T 
    nb_cuts = int(T/T_after_cut) 
    if nb_cuts * T_after_cut != T: 
     print("ERROR: T_after_cut must divide T") 

    # We need batch_size * nb_reset = N 
    # If nb_reset = 1, we only reset after the whole epoch, so no need to reset 
    nb_reset = int(N/batch_size) 
    if nb_reset * batch_size != N: 
     print("ERROR: batch_size must divide N") 

    # Cutting (technical) 
    cut1 = np.split(arr, nb_reset, axis=0) 
    cut2 = [np.split(x, nb_cuts, axis=1) for x in cut1] 
    cut3 = [np.concatenate(x) for x in cut2] 
    cut4 = np.concatenate(cut3) 
    return(cut4) 
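
For instance (a quick illustration, not in the original answer), cutting N = 4 series of length T = 6 with batch_size = 2 and T_after_cut = 3 yields 8 chunks of length 3, ordered so that consecutive batches of 2 chunks continue the same 2 series:

import numpy as np 

arr = np.arange(4*6*1).reshape(4, 6, 1) # (N, T, m) = (4, 6, 1) 
cut = stateful_cut(arr, batch_size=2, T_after_cut=3) 
print(cut.shape) # (8, 3, 1): N * nb_cuts chunks of length T_after_cut 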

From now on, training the model becomes easy. Since the OP's example is very simple, we do not need additional preprocessing or regularization. I describe how to proceed step by step (for the impatient, the full self-contained code is available at the end of this answer).

First we load the data and reshape it with the wrapper function.

import numpy as np 
from keras.models import Sequential 
from keras.layers import Dense, LSTM, TimeDistributed 
import matplotlib.pyplot as plt 
import matplotlib.patches as mpatches 

## 
# Data 
## 
N = X_train.shape[0] # size of samples 
T = X_train.shape[1] # length of each time series 
batch_size = N # number of time series considered together: batch_size | N 
T_after_cut = 100 # length of each cut part of the time series: T_after_cut | T 
dim_in = X_train.shape[2] # dimension of input time series 
dim_out = y_train.shape[2] # dimension of output time series 

inputs, outputs, inputs_test, outputs_test = \ 
    [stateful_cut(arr, batch_size, T_after_cut) for arr in \ 
    [X_train, y_train, X_test, y_test]] 

Then we compile a model with 4 inputs, 3 outputs, and 1 hidden layer containing 10 units.

## 
# Model 
## 
nb_units = 10 

model = Sequential() 
model.add(LSTM(batch_input_shape=(batch_size, None, dim_in), 
       return_sequences=True, units=nb_units, stateful=True)) 
model.add(TimeDistributed(Dense(activation='linear', units=dim_out))) 
model.compile(loss = 'mse', optimizer = 'rmsprop') 

We train the model without resetting the states. We can do this only because we have selected batch_size = N.

## 
# Training 
## 
epochs = 100 

nb_reset = int(N/batch_size) 
if nb_reset > 1: 
    print("ERROR: We need to reset states when batch_size < N") 

# When nb_reset = 1, we do not need to reinitialize states 
history = model.fit(inputs, outputs, epochs = epochs, 
        batch_size = batch_size, shuffle=False, 
        validation_data=(inputs_test, outputs_test)) 

We get the following evolution of the training/test loss:

[figure: training and test loss decreasing as a function of epochs, as expected]

Now we define a "mime model", which is stateless but contains our stateful weights. [Why do this? Predicting with the stateful model through model.predict requires a complete batch in Keras, but we may not have a complete batch to predict with...]

## Mime model which is stateless but containing stateful weights 
model_stateless = Sequential() 
model_stateless.add(LSTM(input_shape=(None, dim_in), 
       return_sequences=True, units=nb_units)) 
model_stateless.add(TimeDistributed(Dense(activation='linear', units=dim_out))) 
model_stateless.compile(loss = 'mse', optimizer = 'rmsprop') 
model_stateless.set_weights(model.get_weights()) 

Finally, we can show our incredible predictions for our long time series y1, y2 and y3 (blue for the true output; orange for the predicted output):

For y1: [figure: prediction for y1]

For y2: [figure: prediction for y2]

For y3: [figure: prediction for y3]

Conclusion: it works almost perfectly, except for the first 2-3 time points, where the series is unpredictable by definition. We do not observe any jump when passing from one batch to the next.

More: when N is large, we would like to select batch_size | N with batch_size < N. I have written the full code for this in https://github.com/ahstat/deep-learning/blob/master/rnn/4_lagging_and_stateful.py (parts C and D). That github path also shows the efficiency of a classic LSTM for short time series (part A) and its inefficiency for long time series (part B). I may write a blog post about it next month.

Fully self-contained code

################ 
# Code from OP # 
################ 
import numpy as np 
def random_sample(len_timeseries=3000): 
    Nchoice = 600 
    x1 = np.cos(np.arange(0,len_timeseries)/float(1.0 + np.random.choice(Nchoice))) 
    x2 = np.cos(np.arange(0,len_timeseries)/float(1.0 + np.random.choice(Nchoice))) 
    x3 = np.sin(np.arange(0,len_timeseries)/float(1.0 + np.random.choice(Nchoice))) 
    x4 = np.sin(np.arange(0,len_timeseries)/float(1.0 + np.random.choice(Nchoice))) 
    y1 = np.random.random(len_timeseries) 
    y2 = np.random.random(len_timeseries) 
    y3 = np.random.random(len_timeseries) 
    for t in range(3,len_timeseries): 
     ## the output time series depend on input as follows: 
     y1[t] = x1[t-2] 
     y2[t] = x2[t-1]*x3[t-2] 
     y3[t] = x4[t-3] 
    y = np.array([y1,y2,y3]).T 
    X = np.array([x1,x2,x3,x4]).T 
    return y, X 
def generate_data(Nsequence = 1000): 
    X_train = [] 
    y_train = [] 
    for isequence in range(Nsequence): 
     y, X = random_sample() 
     X_train.append(X) 
     y_train.append(y) 
    return np.array(X_train),np.array(y_train) 

Nsequence = 100 
prop = 0.5 
Ntrain = int(Nsequence*prop) 
X, y = generate_data(Nsequence) 
X_train = X[:Ntrain,:,:] 
X_test = X[Ntrain:,:,:] 
y_train = y[:Ntrain,:,:] 
y_test = y[Ntrain:,:,:] 

#X.shape = (N sequence, length of time series, N input features) 
#y.shape = (N sequence, length of time series, N targets) 
print(X.shape, y.shape) 
# (100, 3000, 4) (100, 3000, 3) 

#################### 
# Cutting function # 
#################### 
def stateful_cut(arr, batch_size, T_after_cut): 
    if len(arr.shape) != 3: 
     # N: Independent sample size, 
     # T: Time length, 
     # m: Dimension 
     print("ERROR: please format arr as a (N, T, m) array.") 

    N = arr.shape[0] 
    T = arr.shape[1] 

    # We need T_after_cut * nb_cuts = T 
    nb_cuts = int(T/T_after_cut) 
    if nb_cuts * T_after_cut != T: 
     print("ERROR: T_after_cut must divide T") 

    # We need batch_size * nb_reset = N 
    # If nb_reset = 1, we only reset after the whole epoch, so no need to reset 
    nb_reset = int(N/batch_size) 
    if nb_reset * batch_size != N: 
     print("ERROR: batch_size must divide N") 

    # Cutting (technical) 
    cut1 = np.split(arr, nb_reset, axis=0) 
    cut2 = [np.split(x, nb_cuts, axis=1) for x in cut1] 
    cut3 = [np.concatenate(x) for x in cut2] 
    cut4 = np.concatenate(cut3) 
    return(cut4) 

############# 
# Main code # 
############# 
from keras.models import Sequential 
from keras.layers import Dense, LSTM, TimeDistributed 
import matplotlib.pyplot as plt 
import matplotlib.patches as mpatches 

## 
# Data 
## 
N = X_train.shape[0] # size of samples 
T = X_train.shape[1] # length of each time series 
batch_size = N # number of time series considered together: batch_size | N 
T_after_cut = 100 # length of each cut part of the time series: T_after_cut | T 
dim_in = X_train.shape[2] # dimension of input time series 
dim_out = y_train.shape[2] # dimension of output time series 

inputs, outputs, inputs_test, outputs_test = \ 
    [stateful_cut(arr, batch_size, T_after_cut) for arr in \ 
    [X_train, y_train, X_test, y_test]] 

## 
# Model 
## 
nb_units = 10 

model = Sequential() 
model.add(LSTM(batch_input_shape=(batch_size, None, dim_in), 
       return_sequences=True, units=nb_units, stateful=True)) 
model.add(TimeDistributed(Dense(activation='linear', units=dim_out))) 
model.compile(loss = 'mse', optimizer = 'rmsprop') 

## 
# Training 
## 
epochs = 100 

nb_reset = int(N/batch_size) 
if nb_reset > 1: 
    print("ERROR: We need to reset states when batch_size < N") 

# When nb_reset = 1, we do not need to reinitialize states 
history = model.fit(inputs, outputs, epochs = epochs, 
        batch_size = batch_size, shuffle=False, 
        validation_data=(inputs_test, outputs_test)) 

def plotting(history): 
    plt.plot(history.history['loss'], color = "red") 
    plt.plot(history.history['val_loss'], color = "blue") 
    red_patch = mpatches.Patch(color='red', label='Training') 
    blue_patch = mpatches.Patch(color='blue', label='Test') 
    plt.legend(handles=[red_patch, blue_patch]) 
    plt.xlabel('Epochs') 
    plt.ylabel('MSE loss') 
    plt.show() 

plt.figure(figsize=(10,8)) 
plotting(history) # Evolution of training/test loss 

## 
# Visual checking for a time series 
## 
## Mime model which is stateless but containing stateful weights 
model_stateless = Sequential() 
model_stateless.add(LSTM(input_shape=(None, dim_in), 
       return_sequences=True, units=nb_units)) 
model_stateless.add(TimeDistributed(Dense(activation='linear', units=dim_out))) 
model_stateless.compile(loss = 'mse', optimizer = 'rmsprop') 
model_stateless.set_weights(model.get_weights()) 

## Prediction of a new set 
i = 0 # time series selected (between 0 and N-1) 
x = X_train[i] 
y = y_train[i] 
y_hat = model_stateless.predict(np.array([x]))[0] 

for dim in range(3): # dim = 0 for y1 ; dim = 1 for y2 ; dim = 2 for y3. 
    plt.figure(figsize=(10,8)) 
    plt.plot(range(T), y[:,dim]) 
    plt.plot(range(T), y_hat[:,dim]) 
    plt.show() 

## Conclusion: works almost perfectly.