library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5 ✓ purrr 0.3.4
✓ tibble 3.1.6 ✓ dplyr 1.0.7
✓ tidyr 1.1.4 ✓ stringr 1.4.0
✓ readr 2.1.0 ✓ forcats 0.5.1
── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(magrittr)
Attaching package: ‘magrittr’
The following object is masked from ‘package:purrr’:
set_names
The following object is masked from ‘package:tidyr’:
extract
library(keras)
library(tidymodels) # Or only load 'rsample' and 'recipes' on their own
Registered S3 method overwritten by 'tune':
method from
required_pkgs.model_spec parsnip
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.3 ──
✓ broom 0.7.9 ✓ rsample 0.1.0
✓ dials 0.0.10 ✓ tune 0.1.6
✓ infer 1.0.0 ✓ workflows 0.2.3
✓ modeldata 0.1.1 ✓ workflowsets 0.1.0
✓ parsnip 0.1.7 ✓ yardstick 0.0.8
✓ recipes 0.1.16
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
x scales::discard() masks purrr::discard()
x magrittr::extract() masks tidyr::extract()
x dplyr::filter() masks stats::filter()
x recipes::fixed() masks stringr::fixed()
x yardstick::get_weights() masks keras::get_weights()
x dplyr::lag() masks stats::lag()
x magrittr::set_names() masks purrr::set_names()
x yardstick::spec() masks readr::spec()
x recipes::step() masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
3 Intro: Ways to create sequences…
Task:
library(tidyquant) # My favorite package to get stock data
Loading required package: lubridate
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
Loading required package: PerformanceAnalytics
Loading required package: xts
Loading required package: zoo
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
Attaching package: ‘xts’
The following objects are masked from ‘package:dplyr’:
first, last
Attaching package: ‘PerformanceAnalytics’
The following object is masked from ‘package:graphics’:
legend
Loading required package: quantmod
Loading required package: TTR
Attaching package: ‘TTR’
The following object is masked from ‘package:dials’:
momentum
Registered S3 method overwritten by 'quantmod':
method from
as.zoo.data.frame zoo
library(timetk)
tickers = c("GME") # We can also try AAPL etc
data_stocks <- tq_get(tickers,
from = "2000-01-01",
to = "2021-11-16",
get = "stock.prices" # What we want to get.... here prices
)
Warning: `type_convert()` only converts columns of type 'character'.
- `df` has no columns of type 'character'
data_stocks %>% glimpse()
Rows: 4,975
Columns: 8
$ symbol <chr> "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME", "GME"…
$ date <date> 2002-02-13, 2002-02-14, 2002-02-15, 2002-02-19, 2002-02-20, 2002-02-21, 2002-02-22, 2002-02-25, 2002-02-26, 2…
$ open <dbl> 9.625, 10.175, 10.000, 9.900, 9.600, 9.840, 9.925, 9.650, 9.700, 9.675, 9.600, 9.525, 9.725, 9.800, 9.620, 9.6…
$ high <dbl> 10.060, 10.195, 10.025, 9.900, 9.875, 9.925, 9.925, 9.825, 9.850, 9.680, 9.725, 9.775, 9.905, 9.825, 9.715, 10…
$ low <dbl> 9.525, 9.925, 9.850, 9.375, 9.525, 9.750, 9.600, 9.540, 9.545, 9.500, 9.550, 9.490, 9.710, 9.580, 9.600, 9.675…
$ close <dbl> 10.050, 10.000, 9.950, 9.550, 9.875, 9.850, 9.675, 9.750, 9.750, 9.575, 9.550, 9.685, 9.850, 9.625, 9.675, 10.…
$ volume <dbl> 19054000, 2755400, 2097400, 1852600, 1723200, 1744200, 881400, 863400, 690400, 1022800, 687800, 478400, 795200…
$ adjusted <dbl> 6.766666, 6.733002, 6.699337, 6.430017, 6.648838, 6.632006, 6.514179, 6.564677, 6.564677, 6.446849, 6.430017, …
data_stocks %>% head()
data_stocks %>%
plot_time_series(date, adjusted)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
# # ggplot alternative
# data_stocks %>%
# ggplot(aes(x = date, y = adjusted)) +
# geom_line() +
# labs(x = 'Date', y = "Adjusted Price")
# Limit data
data <- data_stocks %>%
rename(index = date, value = adjusted) %>%
select(index, value) %>%
arrange(index)
# Re-express value as the day-to-day percentage change
data %<>%
distinct(index, .keep_all = TRUE) %>%
tidyr::fill(value, .direction = "downup") %>%
mutate(value = (value - lag(value,1)) / lag(value,1) ) %>%
drop_na()
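To make the transformation concrete, here is the percentage-change formula applied to a tiny made-up vector (just an illustration, not part of the pipeline):
# Tiny illustration of (value - lag(value)) / lag(value) on made-up numbers
tibble(value = c(10, 11, 9.9)) %>%
  mutate(pct = (value - lag(value, 1)) / lag(value, 1))
# pct: NA, 0.10, -0.10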
data %>%
ggplot(aes(x = index, y = value)) +
geom_line() +
labs(x = 'Date', y = "Price change in pct")
data %>%
plot_acf_diagnostics(index, value) # note: the date column was renamed to index above
# We use initial_time_split() here to keep the temporal order of the sequences
data_split <- data %>% initial_time_split(prop = 0.75)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()
# See what we got
data_train %>% pull(index) %>% min()
[1] "2002-02-14"
data_train %>% pull(index) %>% max()
[1] "2016-12-06"
data_test %>% pull(index) %>% min()
[1] "2016-12-07"
data_test %>% pull(index) %>% max()
[1] "2021-11-15"
data_train %>% mutate(split = 'training') %>%
bind_rows(data_test %>% mutate(split = 'testing')) %>%
ggplot(aes(x = index, y = value, col = split)) +
geom_line()
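As an aside (not part of the original analysis): if we wanted more than one split, rsample also ships time-series-aware resampling such as rolling_origin(). A sketch with made-up window sizes:
# Sketch: rolling-origin resamples instead of a single initial_time_split()
data %>%
  rolling_origin(initial = 2500, assess = 250, cumulative = FALSE)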
We preprocess by centering and scaling the value (step_range would be an alternative if we wanted to squeeze it into [0, 1]); the prepped recipe is sketched below.
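The chunk that actually builds data_recipe did not survive above; a minimal sketch consistent with the printed recipe (centering/scaling of value plus a row arrangement, trained on data_train) could be:
# Sketch only: reconstructed to match the recipe printout below
data_recipe <- recipe(value ~ index, data = data_train) %>%
  step_normalize(value) %>% # centering and scaling for value
  step_arrange(index) %>%   # row arrangement
  prep()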
data_recipe
Data Recipe
Inputs:
Training data contained 3730 data points and no missing data.
Operations:
Centering and scaling for value [trained]
Row arrangement [trained]
# Preserve the values for later (to reconstruct original values)
prep_history <- tibble(
mean = data_recipe$steps[[1]]$means,
sds = data_recipe$steps[[1]]$sds
)
prep_history
# Forecast horizon: how many steps ahead the target is shifted (applied via lead() below)
n_lag = 1
# Train data
x_train <- data_recipe %>% juice()
y_train <- data_recipe %>% juice() %>%
mutate(value = value %>% lead(n_lag)) %>%
tidyr::fill(value, .direction = "downup")
# And the same for the test data
x_test <- data_recipe %>% bake(data_test)
y_test <- data_recipe %>% bake(data_test) %>%
mutate(value = value %>% lead(n_lag)) %>%
tidyr::fill(value, .direction = "downup")
# Transforming the x sequence into a 3D tensor of shape (samples, timesteps, features), as LSTMs require
x_train_arr <- x_train %>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1, 1))
x_test_arr <- x_test %>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1, 1))
y_train_arr <- y_train %>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1))
y_test_arr <- y_test %>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1))
x_train_arr %>% dim()
[1] 3730 1 1
x_train_arr %>% glimpse()
num [1:3730, 1, 1] -0.203 -0.204 -1.469 1.198 -0.116 ...
model <- keras_model_sequential() %>%
# LSTM layer
layer_lstm(units = 32,
dropout=0.2,
recurrent_dropout=0.2,
input_shape = dim(x_train_arr)[-1],
return_sequences = TRUE) %>%
# LSTM layer
layer_lstm(units = 32,
dropout=0.2,
recurrent_dropout=0.2,
return_sequences = FALSE) %>%
# A dense layer in between
layer_dense(units = 32, activation = 'relu') %>%
# Final prediction layer
layer_dense(units = 1, activation = 'linear')
Loaded Tensorflow version 2.7.0
2021-11-17 10:42:50.041624: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
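A quick note on input_shape (my addition): dim(x_train_arr)[-1] drops the sample dimension, so the LSTM sees each sample as one timestep with one feature.
dim(x_train_arr)     # 3730 samples x 1 timestep x 1 feature
dim(x_train_arr)[-1] # c(1, 1): the shape passed as input_shape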
# Compile model
model %>%
compile(loss = "mse",
metrics = 'mse',
optimizer = optimizer_adam())
model %>% summary()
Model: "sequential"
_________________________________________________________________________________________________________________________________
Layer (type) Output Shape Param #
=================================================================================================================================
lstm_1 (LSTM) (None, 1, 32) 4352
lstm (LSTM) (None, 32) 8320
dense_1 (Dense) (None, 32) 1056
dense (Dense) (None, 1) 33
=================================================================================================================================
Total params: 13,761
Trainable params: 13,761
Non-trainable params: 0
_________________________________________________________________________________________________________________________________
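As a sanity check on the parameter counts above (my own arithmetic): an LSTM has four gate weight matrices, so its parameter count is 4 * (units * (units + n_inputs) + units).
# Checking the summary's parameter counts
lstm_params <- function(units, n_inputs) 4 * (units * (units + n_inputs) + units)
lstm_params(32, 1)  # 4352: first LSTM, fed 1 feature
lstm_params(32, 32) # 8320: second LSTM, fed 32 units
32 * 32 + 32        # 1056: dense layer
32 * 1 + 1          # 33: output layer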
Note that we set shuffle = FALSE to preserve the order of the sequences.
hist_model <- model %>% fit(x = x_train_arr,
y = y_train_arr,
epochs = 10,
verbose = TRUE,
batch_size = 64,
validation_split = 0.25,
shuffle = FALSE)
Epoch 1/10
44/44 [==============================] - 5s 30ms/step - loss: 1.0756 - mse: 1.0756 - val_loss: 0.7748 - val_mse: 0.7748
Epoch 2/10
44/44 [==============================] - 0s 9ms/step - loss: 1.0748 - mse: 1.0748 - val_loss: 0.7757 - val_mse: 0.7757
Epoch 3/10
44/44 [==============================] - 0s 8ms/step - loss: 1.0742 - mse: 1.0742 - val_loss: 0.7770 - val_mse: 0.7770
Epoch 4/10
44/44 [==============================] - 0s 7ms/step - loss: 1.0745 - mse: 1.0745 - val_loss: 0.7770 - val_mse: 0.7770
Epoch 5/10
44/44 [==============================] - 0s 9ms/step - loss: 1.0740 - mse: 1.0740 - val_loss: 0.7778 - val_mse: 0.7778
Epoch 6/10
44/44 [==============================] - 0s 9ms/step - loss: 1.0746 - mse: 1.0746 - val_loss: 0.7777 - val_mse: 0.7777
Epoch 7/10
44/44 [==============================] - 0s 7ms/step - loss: 1.0744 - mse: 1.0744 - val_loss: 0.7781 - val_mse: 0.7781
Epoch 8/10
44/44 [==============================] - 0s 9ms/step - loss: 1.0733 - mse: 1.0733 - val_loss: 0.7786 - val_mse: 0.7786
Epoch 9/10
44/44 [==============================] - 0s 8ms/step - loss: 1.0730 - mse: 1.0730 - val_loss: 0.7800 - val_mse: 0.7800
Epoch 10/10
44/44 [==============================] - 0s 10ms/step - loss: 1.0750 - mse: 1.0750 - val_loss: 0.7789 - val_mse: 0.7789
hist_model %>% plot()
`geom_smooth()` using formula 'y ~ x'
model %>% evaluate(x_test_arr, y_test_arr)
39/39 [==============================] - 0s 2ms/step - loss: 9.8967 - mse: 9.8967
loss mse
9.89671 9.89671
model_pred <- model %>% predict(x_test_arr) %>% as.numeric()
reverse_norm<- function(x, mean, sds) {
x_re <- (x * sds) + mean
return(x_re)
}
eval <- tibble(
index = data_test %>% pull(index),
truth = data_test %>% pull(value),
pred = model_pred %>% reverse_norm(x = ., mean = prep_history$mean, sds = prep_history$sds)
)
eval %>%
pivot_longer(-index) %>%
ggplot(aes(x = index, y = value, col = name)) +
geom_line()
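As a quick numeric complement to the plot (my addition, not in the original run), yardstick can report the RMSE of the back-transformed predictions:
# Hypothetical check: RMSE of pred vs. truth on the original scale
eval %>% yardstick::rmse(truth = truth, estimate = pred)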
Well… so-so.
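The chunk that creates data_example is not shown above; judging from the printout below, it is presumably just:
data_example <- 1:100 # assumed: a simple 1..100 toy series, matching the printout below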
data_example
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
[32] 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
[63] 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
[94] 94 95 96 97 98 99 100
n_timesteps <- 10 # Define that we would like to have 10 timesteps
batch_size <- 6 # Batch size (somewhat arbitrary)
n_features <- 1 # Number of features. Since we only predict the outcome based on its own sequence, it will be 1
We use the timeseries_generator, which will feed the LSTM (or another architecture) with on-the-fly generated sequences.
train_gen <-
timeseries_generator(
data = data_example, # The data we will use to create the sequences.
targets = data_example, # The outcome data, in this case the same series, since we just want to predict the subsequent period
length = n_timesteps, # How many previous steps in the sequence should be used for the prediction
sampling_rate = 1, # Period between individual timesteps within a sequence (1 = use every observation)
stride = 1, # Period between successive output sequences (1 = a new sequence starts at every step)
shuffle = FALSE, # Should the sequences be shuffled? In time-series prediction, we want to preserve the order, so always FALSE
batch_size = batch_size # Size of the batches generated. Use this batch size also later in the LSTM
)
train_gen
<keras.preprocessing.sequence.TimeseriesGenerator>
batch_0 <- train_gen[0]
batch_0
[[1]]
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 1 2 3 4 5 6 7 8 9 10
[2,] 2 3 4 5 6 7 8 9 10 11
[3,] 3 4 5 6 7 8 9 10 11 12
[4,] 4 5 6 7 8 9 10 11 12 13
[5,] 5 6 7 8 9 10 11 12 13 14
[6,] 6 7 8 9 10 11 12 13 14 15
[[2]]
[1] 11 12 13 14 15 16
# create the model
model <- keras_model_sequential() %>%
# Add the layer. We will make it as simple as possible here with just one LSTM and an output layer.
layer_lstm(
units = 32,
batch_input_shape = c(batch_size, n_timesteps, n_features), # the first layer in a model needs to know the shape of the input data
#dropout = 0.1,
#recurrent_dropout = 0.1,
return_sequences = FALSE, # by default, an LSTM just returns the final state
stateful = TRUE) %>%
# Final output layer
layer_dense(units = 1)
model %>% compile(loss = 'mse', optimizer = optimizer_adam(), metrics = 'mse')
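The notebook stops after compiling this stateful model; a hedged sketch of how one might train it directly on the generator (assuming fit() accepts the keras Sequence object, as recent keras versions do):
# Sketch (not in the original run): training on the generator
hist_gen <- model %>% fit(
  train_gen,
  epochs = 10,
  shuffle = FALSE # keep the batch order for the stateful LSTM
)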
length(data_example)
[1] 100
What if we want to predict several future steps at once instead of a single step? Adjusting the generator's stride alone does not give us that; for each input sequence ending at time_p, the target should instead contain the values from time_p + 1 onward.
# Define a function that outputs n_timesteps_out timesteps for y
gen_timeseries_output <- function(data, n_timesteps_out){
target <- matrix(nrow = length(data), ncol = n_timesteps_out)
data <- data %>% as.numeric()
for (i in 1:length(data)) {
target[i,] <- data[(i+1):(i+n_timesteps_out)] # note: the last n_timesteps_out rows will contain NAs
}
return(target)
}
n_timesteps_out <- 5
outcome_sequence <- data_example %>%
gen_timeseries_output(n_timesteps_out)
outcome_sequence %>% head(20)
[,1] [,2] [,3] [,4] [,5]
[1,] 2 3 4 5 6
[2,] 3 4 5 6 7
[3,] 4 5 6 7 8
[4,] 5 6 7 8 9
[5,] 6 7 8 9 10
[6,] 7 8 9 10 11
[7,] 8 9 10 11 12
[8,] 9 10 11 12 13
[9,] 10 11 12 13 14
[10,] 11 12 13 14 15
[11,] 12 13 14 15 16
[12,] 13 14 15 16 17
[13,] 14 15 16 17 18
[14,] 15 16 17 18 19
[15,] 16 17 18 19 20
[16,] 17 18 19 20 21
[17,] 18 19 20 21 22
[18,] 19 20 21 22 23
[19,] 20 21 22 23 24
[20,] 21 22 23 24 25
train_gen_seq <-
timeseries_generator(
data = data, # careful: data here is still the stock-price tibble from above, not a plain numeric vector
targets = outcome_sequence,
length = 5,
sampling_rate = 1,
stride = 1,
shuffle = FALSE,
batch_size = 16
)
Error in py_get_attr_impl(x, name, silent) :
AttributeError: 'list' object has no attribute 'astype'
batch_0_seq = train_gen_seq[0]
batch_0_seq
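The error above most likely comes from handing the generator the stock-price tibble (data) instead of a plain numeric vector. For reference, a corrected sketch (my assumption, not from the original run) that also drops the NA tail of the target matrix:
# Hypothetical fix: use the numeric toy series and trim the NA rows of the targets
keep <- 1:(length(data_example) - n_timesteps_out)
train_gen_seq <- timeseries_generator(
  data = data_example[keep],
  targets = outcome_sequence[keep, ],
  length = 5,
  sampling_rate = 1,
  stride = 1,
  shuffle = FALSE,
  batch_size = 16
)
train_gen_seq[0]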
n_x <- 1 # number of features
time_x <- 4 # 4 days
time_y <- 1 # ... to predict one day ahead
# Transforming the x sequence into a 3D tensor (necessary for LSTMs)
x_train_arr_n1 <- data_recipe %>% juice() %>% pull(value) %>% matrix(ncol = time_x) %>% array_reshape(dim = c(nrow(.), ncol(.), 1)) # note: 3730 values do not divide evenly by time_x = 4, so matrix() will recycle and warn
x_test_arr_n1 <- x_test %>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1, 1))
y_train_arr2 <- y_train %>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1))
y_test_arr2 <- y_test %>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1))
x <- data_recipe %>% juice() %>% pull(value) %>% matrix(ncol = time_x)
dim(x)[2]
length(x)[2]
x_train_arr %>% dim()
x_train_arr %>% glimpse()
tsteps_x = 5
tsteps_y = 5
train_arr <- x_train %>% pull(value) %>% as.numeric() %>% matrix(ncol = (tsteps_x + tsteps_y))
x_train_arr <- train_arr[,1:tsteps_x] %>% array_reshape(dim = c(length(.), 1, 1))
#x_train %<>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1, 1))
#x_test %<>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1, 1))
#y_train %<>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1))
#y_test %<>% pull(value) %>% as.numeric() %>% array_reshape(dim = c(length(.), 1))
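The scratch code above circles around one recurring task: turning a single series into overlapping (samples, timesteps, features) windows. A small helper (my own sketch, not from the original) that does this explicitly:
# Sketch: build overlapping windows of length n_timesteps from a numeric vector,
# returning x of shape (samples, n_timesteps, 1) and the matching one-step-ahead targets y
make_lstm_windows <- function(x, n_timesteps) {
  n_samples <- length(x) - n_timesteps
  x_arr <- array(NA_real_, dim = c(n_samples, n_timesteps, 1))
  y_arr <- array(NA_real_, dim = c(n_samples, 1))
  for (i in seq_len(n_samples)) {
    x_arr[i, , 1] <- x[i:(i + n_timesteps - 1)]
    y_arr[i, 1] <- x[i + n_timesteps]
  }
  list(x = x_arr, y = y_arr)
}
# e.g.: windows <- x_train %>% pull(value) %>% make_lstm_windows(n_timesteps = 5)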