library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5     ✓ purrr   0.3.4
✓ tibble  3.1.6     ✓ dplyr   1.0.7
✓ tidyr   1.1.4     ✓ stringr 1.4.0
✓ readr   2.1.0     ✓ forcats 0.5.1
── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(magrittr)
Attaching package: ‘magrittr’
The following object is masked from ‘package:purrr’:
    set_names
The following object is masked from ‘package:tidyr’:
    extract
library(keras)
library(tidymodels) # Or load only 'rsample' and 'recipes' on their own
Registered S3 method overwritten by 'tune':
  method                   from   
  required_pkgs.model_spec parsnip
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.3 ──
✓ broom        0.7.9      ✓ rsample      0.1.0 
✓ dials        0.0.10     ✓ tune         0.1.6 
✓ infer        1.0.0      ✓ workflows    0.2.3 
✓ modeldata    0.1.1      ✓ workflowsets 0.1.0 
✓ parsnip      0.1.7      ✓ yardstick    0.0.8 
✓ recipes      0.1.16     
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
x scales::discard()        masks purrr::discard()
x magrittr::extract()      masks tidyr::extract()
x dplyr::filter()          masks stats::filter()
x recipes::fixed()         masks stringr::fixed()
x yardstick::get_weights() masks keras::get_weights()
x dplyr::lag()             masks stats::lag()
x magrittr::set_names()    masks purrr::set_names()
x yardstick::spec()        masks readr::spec()
x recipes::step()          masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
3 Intro: Ways to create sequences…
Task:
library(tidyquant) # My favorite package to get stock data
Loading required package: lubridate
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
    date, intersect, setdiff, union
Loading required package: PerformanceAnalytics
Loading required package: xts
Loading required package: zoo
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
    as.Date, as.Date.numeric
Attaching package: ‘xts’
The following objects are masked from ‘package:dplyr’:
    first, last
Attaching package: ‘PerformanceAnalytics’
The following object is masked from ‘package:graphics’:
    legend
Loading required package: quantmod
Loading required package: TTR
Attaching package: ‘TTR’
The following object is masked from ‘package:dials’:
    momentum
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
══ Need to Learn tidyquant? ═════════════════════════════════════════════════════════════════════════════════════════════════════
Business Science offers a 1-hour course - Learning Lab #9: Performance Analysis & Portfolio Optimization with tidyquant!
</> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(timetk) 
tickers = c("GME") # We can also try AAPL etc
            
data_stocks <- tq_get(tickers,
               from = "2000-01-01",
               to = "2021-11-16",
               get = "stock.prices" # What we want to get.... here prices
               )
Warning: `type_convert()` only converts columns of type 'character'.
- `df` has no columns of type 'character'
data_stocks %>% glimpse()
Rows: 4,975
Columns: 8
$ symbol   <chr> "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME"…
$ date     <date> 2002-02-13, 2002-02-14, 2002-02-15, 2002-02-19, 2002-02-20, 2002-02-21, 2002-02-22, 2002-02-25, 2002-02-26, 2…
$ open     <dbl> 9.625, 10.175, 10.000, 9.900, 9.600, 9.840, 9.925, 9.650, 9.700, 9.675, 9.600, 9.525, 9.725, 9.800, 9.620, 9.6…
$ high     <dbl> 10.060, 10.195, 10.025, 9.900, 9.875, 9.925, 9.925, 9.825, 9.850, 9.680, 9.725, 9.775, 9.905, 9.825, 9.715, 10…
$ low      <dbl> 9.525, 9.925, 9.850, 9.375, 9.525, 9.750, 9.600, 9.540, 9.545, 9.500, 9.550, 9.490, 9.710, 9.580, 9.600, 9.675…
$ close    <dbl> 10.050, 10.000, 9.950, 9.550, 9.875, 9.850, 9.675, 9.750, 9.750, 9.575, 9.550, 9.685, 9.850, 9.625, 9.675, 10.…
$ volume   <dbl> 19054000, 2755400, 2097400, 1852600, 1723200, 1744200, 881400, 863400, 690400, 1022800, 687800, 478400, 795200…
$ adjusted <dbl> 6.766666, 6.733002, 6.699337, 6.430017, 6.648838, 6.632006, 6.514179, 6.564677, 6.564677, 6.446849, 6.430017, …
data_stocks %>% head()
data_stocks %>% 
  plot_time_series(date, adjusted)
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
# ggplot alternative
# data_stocks %>%
#   ggplot(aes(x = date, y = adjusted)) +
#   geom_line() +
#   labs(x = 'Date', y = "Adjusted Price") 
# Limit data
data <- data_stocks %>%
  rename(index = date, value = adjusted) %>%
  select(index, value) %>%
  arrange(index) 
# Recompute value as the percentage change (daily return)
data %<>%
  distinct(index, .keep_all = TRUE) %>%
  tidyr::fill(value, .direction = "downup") %>%
  mutate(value = (value - lag(value,1)) / lag(value,1) ) %>%
  drop_na()
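To make the percentage-change step concrete, here is a tiny worked example with made-up prices (my own illustration, not part of the original pipeline):
# pct change = (p_t - p_{t-1}) / p_{t-1}; the leading NA is removed by drop_na()
tibble(value = c(100, 110, 99)) %>%
  mutate(value = (value - lag(value, 1)) / lag(value, 1))
# -> NA, 0.10, -0.10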
data %>%
  ggplot(aes(x = index, y = value)) +
  geom_line() +
  labs(x = 'Date', y = "Price change in pct") 
data %>%
  plot_acf_diagnostics(index, value)
# We use a time-based split here to preserve the order of the sequences
data_split <- data %>% initial_time_split(prop = 0.75)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()
# See what we got
data_train %>% pull(index) %>% min()
[1] "2002-02-14"
data_train %>% pull(index) %>% max()
[1] "2016-12-06"
data_test %>% pull(index) %>% min()
[1] "2016-12-07"
data_test %>% pull(index) %>% max()
[1] "2021-11-15"
data_train %>% mutate(split = 'training') %>%
  bind_rows(data_test %>% mutate(split = 'testing')) %>%
  ggplot(aes(x = index, y = value, col = split)) +
  geom_line() 
We normalize the outcome with step_normalize() (step_range(), which rescales into a fixed interval, would be an alternative) and keep the rows ordered by index:
data_recipe <- recipe(value ~ ., data = data_train) %>%
  step_normalize(value) %>%
  step_arrange(index) %>%
  prep()
data_recipe
Data Recipe
Inputs:
      role #variables
   outcome          1
 predictor          1
Training data contained 3730 data points and no missing data.
Operations:
Centering and scaling for value [trained]
Row arrangement [trained]
# Preserve the values for later (to reconstruct original values)
prep_history <- tibble(
  mean = data_recipe$steps[[1]]$means,
  sds = data_recipe$steps[[1]]$sds
)
prep_history
# Number of lags
n_lag = 1
# Train data
x_train <- data_recipe %>% juice() 
y_train <- data_recipe %>%  juice() %>%
  mutate(value = value %>% lead(n_lag)) %>%
  tidyr::fill(value, .direction = "downup") 
# And the same for the test data
x_test <- data_recipe %>% bake(data_test) 
y_test <- data_recipe %>%  bake(data_test) %>%  
  mutate(value = value %>% lead(n_lag)) %>%
  tidyr::fill(value, .direction = "downup") 
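To see how the lead() shift frames the task as supervised learning, a quick toy example (illustrative only):
# y at time t is x at time t + n_lag; the trailing NA is filled "downup"
tibble(x = 1:5) %>%
  mutate(y = lead(x, 1)) %>%
  tidyr::fill(y, .direction = "downup")
# x: 1 2 3 4 5; y: 2 3 4 5 5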
# Transforming the x sequence to a 3d tensor (necessary for LSTMs)
x_train_arr <- x_train %>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1, 1))
x_test_arr <- x_test %>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1, 1))
y_train_arr <- y_train %>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1))
y_test_arr <- y_test %>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1))
x_train_arr %>% dim()
[1] 3730    1    1
x_train_arr %>% glimpse()
 num [1:3730, 1, 1] -0.203 -0.204 -1.469 1.198 -0.116 ...
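As a quick sanity check of the (samples, timesteps, features) convention, a toy reshape (illustrative only):
# Six values become 6 samples of 1 timestep and 1 feature
toy <- 1:6 %>% array_reshape(dim = c(6, 1, 1))
toy %>% dim()  # 6 1 1
toy[3, 1, 1]   # 3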
model <- keras_model_sequential() %>%
  # LSTM layer
  layer_lstm(units = 32, 
             dropout=0.2, 
             recurrent_dropout=0.2,
             input_shape = dim(x_train_arr)[-1],
             return_sequences = TRUE) %>%
  # LSTM layer
  layer_lstm(units = 32, 
             dropout=0.2, 
             recurrent_dropout=0.2,
             return_sequences = FALSE) %>%
  # A dense layer in between
  layer_dense(units = 32, activation = 'relu') %>%
  # Final prediction layer
  layer_dense(units = 1, activation = 'linear')
Loaded Tensorflow version 2.7.0
2021-11-17 10:42:50.041624: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
# Compile model
model %>% 
  compile(loss = "mse", 
          metrics = 'mse', 
          optimizer = optimizer_adam())
model %>% summary()
Model: "sequential"
_________________________________________________________________________________________________________________________________
 Layer (type)                                             Output Shape                                       Param #             
=================================================================================================================================
 lstm_1 (LSTM)                                            (None, 1, 32)                                      4352                
                                                                                                                                 
 lstm (LSTM)                                              (None, 32)                                         8320                
                                                                                                                                 
 dense_1 (Dense)                                          (None, 32)                                         1056                
                                                                                                                                 
 dense (Dense)                                            (None, 1)                                          33                  
                                                                                                                                 
=================================================================================================================================
Total params: 13,761
Trainable params: 13,761
Non-trainable params: 0
_________________________________________________________________________________________________________________________________
Note that we set shuffle = FALSE to preserve the order of the sequences.
hist_model <- model %>% fit(x          = x_train_arr, 
                            y          = y_train_arr, 
                            epochs     = 10,
                            verbose    = TRUE, 
                            batch_size = 64,
                            validation_split = 0.25, 
                            shuffle    = FALSE)
Epoch 1/10
44/44 [==============================] - 5s 30ms/step - loss: 1.0756 - mse: 1.0756 - val_loss: 0.7748 - val_mse: 0.7748
Epoch 2/10
44/44 [==============================] - 0s 9ms/step - loss: 1.0748 - mse: 1.0748 - val_loss: 0.7757 - val_mse: 0.7757
Epoch 3/10
44/44 [==============================] - 0s 8ms/step - loss: 1.0742 - mse: 1.0742 - val_loss: 0.7770 - val_mse: 0.7770
Epoch 4/10
44/44 [==============================] - 0s 7ms/step - loss: 1.0745 - mse: 1.0745 - val_loss: 0.7770 - val_mse: 0.7770
Epoch 5/10
44/44 [==============================] - 0s 9ms/step - loss: 1.0740 - mse: 1.0740 - val_loss: 0.7778 - val_mse: 0.7778
Epoch 6/10
44/44 [==============================] - 0s 9ms/step - loss: 1.0746 - mse: 1.0746 - val_loss: 0.7777 - val_mse: 0.7777
Epoch 7/10
44/44 [==============================] - 0s 7ms/step - loss: 1.0744 - mse: 1.0744 - val_loss: 0.7781 - val_mse: 0.7781
Epoch 8/10
44/44 [==============================] - 0s 9ms/step - loss: 1.0733 - mse: 1.0733 - val_loss: 0.7786 - val_mse: 0.7786
Epoch 9/10
44/44 [==============================] - 0s 8ms/step - loss: 1.0730 - mse: 1.0730 - val_loss: 0.7800 - val_mse: 0.7800
Epoch 10/10
44/44 [==============================] - 0s 10ms/step - loss: 1.0750 - mse: 1.0750 - val_loss: 0.7789 - val_mse: 0.7789
hist_model %>% plot()
`geom_smooth()` using formula 'y ~ x'
model %>% evaluate(x_test_arr, y_test_arr)
39/39 [==============================] - 0s 2ms/step - loss: 9.8967 - mse: 9.8967
   loss     mse 
9.89671 9.89671 
model_pred <- model %>% predict(x_test_arr) %>% as.numeric()
reverse_norm <- function(x, mean, sds) {
  x_re <- (x * sds) + mean
  return(x_re)
}
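A quick round-trip check that reverse_norm() undoes the normalization (the input value 0.05 is made up):
x_norm <- (0.05 - prep_history$mean) / prep_history$sds # normalize a made-up value
reverse_norm(x_norm, mean = prep_history$mean, sds = prep_history$sds) # -> 0.05 again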
eval <- tibble(
  index = data_test %>% pull(index),
  truth = data_test %>% pull(value),
  pred = model_pred %>% reverse_norm(x = ., mean = prep_history$mean, sds = prep_history$sds)
) 
eval %>% 
  pivot_longer(-index) %>%
  ggplot(aes(x = index, y = value, col = name)) +
  geom_line()
Well… so-so.
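To put a number on "so-so", one could score the eval tibble with yardstick (already attached via tidymodels); a minimal sketch:
# RMSE and R^2 of the reverse-normalized predictions against the observed pct changes
eval %>% yardstick::rmse(truth = truth, estimate = pred)
eval %>% yardstick::rsq(truth = truth, estimate = pred)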
data_example <- 1:100 # A simple toy sequence
data_example
  [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 [32]  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62
 [63]  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93
 [94]  94  95  96  97  98  99 100
n_timesteps <- 10  # Define that we would like to have 10 timesteps per input sequence
batch_size <- 6 # Batch size (somewhat arbitrary)
n_features <- 1 # Number of features. Since we only predict the outcome based on its own sequence, it will be 1
We use timeseries_generator, which will feed the LSTM (or other architecture) with on-the-fly generated sequences.
train_gen <- 
  timeseries_generator(
    data = data_example, # The data we will use to create the sequences
    targets = data_example, # The outcome data; here the same, since we just want to predict the subsequent period
    length = n_timesteps, # How many previous steps in the sequence should be used for the prediction
    sampling_rate = 1, # Should we use every observation within a sequence or skip some?
    stride = 1, # Period between the starts of successive sequences
    shuffle = FALSE, # Should the sequences be shuffled? In time-series prediction we want to preserve their order, so always FALSE
    batch_size = batch_size # Size of the generated batches. Use this batch size also later in the LSTM
    )
train_gen
<keras.preprocessing.sequence.TimeseriesGenerator>
batch_0 <- train_gen[0]
batch_0
[[1]]
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,]    1    2    3    4    5    6    7    8    9    10
[2,]    2    3    4    5    6    7    8    9   10    11
[3,]    3    4    5    6    7    8    9   10   11    12
[4,]    4    5    6    7    8    9   10   11   12    13
[5,]    5    6    7    8    9   10   11   12   13    14
[6,]    6    7    8    9   10   11   12   13   14    15
[[2]]
[1] 11 12 13 14 15 16
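To see what sampling_rate and stride actually change, here is a variant generator (illustrative parameters of my own, not used for training):
train_gen_strided <- timeseries_generator(
  data = data_example,
  targets = data_example,
  length = n_timesteps,
  sampling_rate = 1,
  stride = 2, # Start successive windows two steps apart
  shuffle = FALSE,
  batch_size = batch_size
)
train_gen_strided[0] # First-batch windows now start at 1, 3, 5, ... instead of 1, 2, 3, ...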
# create the model
model <- keras_model_sequential()  %>%
  # Add the layer. We will make it as simple as possible here with just one LSTM and an output layer.
  layer_lstm(
    units = 32, 
    batch_input_shape  = c(batch_size, n_timesteps, n_features), # the first layer in a model needs to know the shape of the input data
    #dropout = 0.1,
    #recurrent_dropout = 0.1,
    return_sequences = FALSE, # by default, an LSTM just returns the final state
    stateful = TRUE) %>% 
  # Final output layer
  layer_dense(units = 1)
model %>% compile(loss = 'mse', optimizer = optimizer_adam(), metrics = 'mse')
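A sketch of how this model could be trained from the generator. Assumptions: a recent keras version where fit() accepts generators directly (older versions used fit_generator()), and that with stateful = TRUE every batch must contain exactly batch_size samples (states are typically reset between epochs):
# steps_per_epoch: number of full batches the generator can yield per epoch;
# the generator itself already preserves the temporal order
model %>% fit(
  train_gen,
  steps_per_epoch = floor((length(data_example) - n_timesteps) / batch_size),
  epochs = 5
)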
length(data_example)
[1] 100
Could we alternatively set the stride to time_p + 1?
# Define a function that outputs n_timesteps_out timesteps for y
gen_timeseries_output <- function(data, n_timesteps_out){
  
  target <- matrix(nrow = length(data), ncol = n_timesteps_out)
  
  data <- data %>% as.numeric()
  
  # Note: the last n_timesteps_out rows will contain NAs, since the sequence ends
  for (i in 1:length(data)) {
    target[i, ] <- data[(i + 1):(i + n_timesteps_out)]
  }
  
  return(target)
}
n_timesteps_out <- 5
outcome_sequence <- data_example %>%
  gen_timeseries_output(n_timesteps_out)
outcome_sequence %>% head(20)
      [,1] [,2] [,3] [,4] [,5]
 [1,]    2    3    4    5    6
 [2,]    3    4    5    6    7
 [3,]    4    5    6    7    8
 [4,]    5    6    7    8    9
 [5,]    6    7    8    9   10
 [6,]    7    8    9   10   11
 [7,]    8    9   10   11   12
 [8,]    9   10   11   12   13
 [9,]   10   11   12   13   14
[10,]   11   12   13   14   15
[11,]   12   13   14   15   16
[12,]   13   14   15   16   17
[13,]   14   15   16   17   18
[14,]   15   16   17   18   19
[15,]   16   17   18   19   20
[16,]   17   18   19   20   21
[17,]   18   19   20   21   22
[18,]   19   20   21   22   23
[19,]   20   21   22   23   24
[20,]   21   22   23   24   25
train_gen_seq <- 
  timeseries_generator(
    data = data_example, # Pass the underlying sequence here, not the stock tibble
    targets = outcome_sequence,
    length = 5,
    sampling_rate = 1,
    stride = 1,
    shuffle = FALSE,
    batch_size = 16
  )
batch_0_seq <- train_gen_seq[0]
batch_0_seq
n_x <- 1 # number of features
time_x <- 4 # 4 days
time_y <- 1 # ... to predict one day ahead
# Transforming the x sequence to a 3d tensor (necessary for LSTMs);
# byrow = TRUE so each row is one consecutive window of time_x days
x_train_arr_n1 <- data_recipe %>% juice() %>% pull(value) %>% matrix(ncol = time_x, byrow = TRUE) %>% array_reshape(dim = c(nrow(.), ncol(.), 1))
x_test_arr_n1 <- x_test %>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1, 1))
y_train_arr2 <- y_train %>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1))
y_test_arr2 <- y_test %>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1))
x <- data_recipe %>% juice()  %>% pull(value) %>% matrix(ncol = time_x)
dim(x)[2]
length(x)[2]
x_train_arr %>% dim()
x_train_arr %>% glimpse()
tsteps_x = 5
tsteps_y = 5
# Each row holds one consecutive window: tsteps_x inputs followed by tsteps_y outcomes
# (byrow = TRUE; assumes the series length is divisible by tsteps_x + tsteps_y)
train_arr <- x_train %>% pull(value) %>% as.numeric() %>% matrix(ncol = (tsteps_x + tsteps_y), byrow = TRUE)
x_train_arr <- train_arr[, 1:tsteps_x] %>% array_reshape(dim = c(nrow(.), tsteps_x, 1))
#x_train %<>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1, 1))
#x_test %<>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1, 1))
#y_train %<>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1))
#y_test %<>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1))
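For completeness, a hedged sketch (my own extension of the scratch code above, not from the original notes) of the matching multi-step outcome and a model head that predicts tsteps_y values at once:
# The last tsteps_y columns of each window serve as the multi-step outcome
y_train_seq <- train_arr[, (tsteps_x + 1):(tsteps_x + tsteps_y)]
dim(y_train_seq) # samples x tsteps_y

# The final dense layer now emits tsteps_y values per sample
model_seq <- keras_model_sequential() %>%
  layer_lstm(units = 32, input_shape = c(tsteps_x, 1)) %>%
  layer_dense(units = tsteps_y, activation = 'linear')
model_seq %>% compile(loss = 'mse', optimizer = optimizer_adam())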