Project Description
Time series forecast for electricity prices. In this particular forecast, random forest, exponential smoothing method as proposed in paper by Hyndman et al (2008), linear model and ARIMA. The goal was to minimize MAE (mean square error) and RMSE (root mean square error). Entire code was writen in R.
The goal was to predict price for x to x+24 hours in future using data until x. Once predicted, choose y = x + 24 and predict price from y to y + 24 by training on data until y. In the end we look at test statistics and choose the best one. We only have time series of price point 1 and highly correlated price point 2 which represents different type of electricity. So two time series data only. Things taken into account were therefore past price movements and seasonality. For more information and additional files, feel free to contact me.
Code:
## Approach from the paper
### Below is the model and it's statistics
result.df
cat("MAE is:",MAE,"and RMSE is:",RMSE,"\n")### Below is the proceedure.
library("randomForest")
require("ranger")
require("forecast")#' Tests auto.arima and ets time serias model for prices
#' @param price.file.name Name of CSV file with prices
#' @param start.date First trading date (yyyy-mm-dd string in central European time zone)
#' @param last.date Last trading date (yyyy-mm-dd string in central European time zone)
#' @param lookback.window Window in days
#' @param price.type Name of price field in the input file (e.g. lambdaD)
#' @param use.log If true then log(price) will be used
#' @param market.time.zone String, default = 'CET'
#' @param print.progress If true prints processed dates
#' @return data.frame
#'
price_file_name = "./data/input/Germany15-16-price-ren-load-C.csv"
first_date = "2015-06-01 00:00:00 UTC"
last_date = "2016-12-31"
lookback_window = 8
market_time_zone <- 'CET'
print_progress = TRUE
output_folder = "../data/output/"
price_types = c("Price1_DA", "MAvsMD", "IBvsMD")
# test.models <- function(price.file.name, first.date, last.date, lookback.window,
# price.type, use.log, market.time.zone = 'CET', print.progress = TRUE) {
price.type <- price_types[1]
date.parse <- function(s) {
s2 <- gsub('([+-][0-9]{2}):([0-9]{2})$', '\\1\\2', s)as.POSIXct(strptime(s2, format = "%Y-%m-%d %H:%M:%S", tz = 'UTC'))
}prices <- read.csv(file=price_file_name,head=TRUE)
prices["time"] <- lapply(prices["time"], date.parse)
prices$Dlog_ren_fcst <- c(NA, diff(log(ts(prices$Renewable_Forecast, frequency = 24))))
prices$Dlog_ren_act <- c(NA, diff(log(ts(prices$Renewable_Actual, frequency = 24))))
prices$Dlog_load <- c(NA, diff(log(ts(prices$Load_Actual, frequency = 24))))first.date <- as.POSIXct(first_date, tz = market_time_zone)
last.date <- as.POSIXct(last_date, tz = market_time_zone)
trading.date <- first_dateresult.df <- data.frame()
# Get the future ETS model predictions for whole period. Same model as previously sent.
while (trading.date <= last.date) {
if (print_progress) {
print(sprintf('Processing %s %s, %s', price.type, format(date.parse(trading.date), tz = market_time_zone),
format(Sys.time(), "%b %d %H:%M:%S %Z(%z)", tz='CET')))
}
train.first.moment <- round.POSIXt(date.parse(trading.date) - lookback_window * 24 * 3600, units = 'days')
after.train.last.moment <- date.parse(trading.date)
test.first.moment <- trading.date
after.test.last.moment <- round.POSIXt(date.parse(trading.date) + 1 + 24 * 3600, units = 'days')
after.test.last.moment <- date.parse(trading.date) + 1 + 24 * 3600train <- subset(prices, prices$time >= as.POSIXct(c
(as.POSIXct(train.first.moment,tz = market_time_zone)),
tz = 'UTC') & prices$time <
as.POSIXct(c(as.POSIXct(after.train.last.moment, tz = market_time_zone)), tz = 'UTC'))
train_y <- ts(train[price.type], frequency = 24)test <- subset(prices, prices$time >= as.POSIXct(c(as.POSIXct(test.first.moment, tz = market_time_zone)),
tz = 'UTC') & prices$time <
as.POSIXct(c(as.POSIXct(after.test.last.moment, tz = market_time_zone)), tz = 'UTC'))test_y <- ts(test[price.type], frequency = 24, start = c(end(train_y)[1] + 1, 1))
ets.model <- ets(train_y)
ets.forecast <- forecast(ets.model, h = 24)for (i in 1:24) {
result.df <- rbind(result.df, data.frame(trading.date = date.parse(trading.date) + (i-1) * 3600,
price.actual = test_y[i],
ets.fcst = ets.forecast$mean[i]
))
}
trading.date <- format(round.POSIXt(date.parse(trading.date) + 24 * 3600, units = 'days') - 1 +1,format="%Y-%m-%d %H:%M:%S")
}
#attach the numbers to the dataset.
prices$ets_forecast <- NA
i <- 1
while (i <= nrow(result.df)){
myindex <- which(prices$time == result.df$trading.date[i])
prices$ets_forecast[myindex] <- result.df$ets.fcst[i]
i <- i + 1
}
### Now we begin with increasing dataset and including relevant variables.
## Write parameteres all over again.
price_file_name = "./data/input/Germany15-16-price-ren-load-C.csv"
first_date = "2015-02-01 00:00:00 UTC"
last_date = "2016-12-31"
lookback_window = 8
market_time_zone <- 'CET'
print_progress = TRUE
output_folder = "../data/output/"
price_types = c("Price1_DA", "MAvsMD", "IBvsMD")
## which hour do I start with?
start_hour <- first_date
start_hour <- date.parse(start_hour)
start_hour <- format(as.POSIXct(start_hour, format="%Y%m-%d %H:%M"), format="%H:%M")# Defining holidays. This is probably not necessary, but at the time when there are public holidays in most
# German regions there might be slighlty different prices. Might improve the fit a little bit.
holidays <- c("2016-01-01","2016-01-06","2016-05-01","2016-06-15","2016-10-03","2016-11-01", "2016-11-22",
"2016-12-25","2016-12-26", "2015-12-26","2015-12-25","2015-06-15","2015-10-03","2015-11-01", "2015-11-22")
prices$holidays <- 0
for (i in 1:nrow(prices)){
if (any(holidays == as.Date(prices$time[i]))){
prices$holidays[i] <- 1
}
}
# This is the equation for later model to be used. All relevant features are included.
train_equation <- Price1_DA ~ Dlog_ren_act + Dlog_load + days_23 + days_24 + days_25 + days_48 +# days_49 +
lag_mean + last_info + diff_6_info + days_24_P2 + last_info_P2 +last6_info_P2 + last_3 +
ets_forecast + day_of_weekMonday + day_of_weekSaturday + holidays +
day_of_weekSunday + last_minprice_types = c("Price1_DA", "MAvsMD", "IBvsMD")
price.type <- price_types[1]
prices["time"] <- lapply(prices["time"], date.parse)
prices$Dlog_ren_fcst <- c(NA, diff(log(ts(prices$Renewable_Forecast, frequency = 24))))
prices$Dlog_ren_act <- c(NA, diff(log(ts(prices$Renewable_Actual, frequency = 24))))
prices$Dlog_load <- c(NA, diff(log(ts(prices$Load_Actual, frequency = 24))))
prices$Dlog_load_fcst <- c(NA, diff(log(ts(prices$Load_Forecast, frequency = 24))))first.date <- format(as.POSIXct(first_date, tz = market_time_zone), format="%Y%m-%d %H:%M")
last.date <- format(as.POSIXct(last_date, tz = market_time_zone), format="%Y%m-%d %H:%M")#### Looking at prices 23,24,25,47,48,49 hours ago
### Always also looking at the price of highly correlated Price2 (also x hours ago).
twenty_three <- prices$Price1_DA[1:(length(prices$Price1_DA)-23)]
twenty_three <- c(rep(NA,23),twenty_three)
prices$days_23 <- twenty_three
twenty_four <- prices$Price1_DA[1:(length(prices$Price1_DA)-24)]
twenty_four <- c(rep(NA,24),twenty_four)
prices$days_24 <- twenty_four
twenty_four <- prices$Price2_ID[1:(length(prices$Price2_ID)-24)]
twenty_four <- c(rep(NA,24),twenty_four)
prices$days_24_P2 <- twenty_four
twenty_four <- prices$Price3_IB[1:(length(prices$Price3_IB)-24)]
twenty_four <- c(rep(NA,24),twenty_four)
prices$days_24_P3 <- twenty_fourtwenty_five <- prices$Price1_DA[1:(length(prices$Price1_DA)-25)]
twenty_five <- c(rep(NA,25),twenty_five)
prices$days_25 <- twenty_five
twenty_five <- prices$Price2_ID[1:(length(prices$Price2_ID)-25)]
twenty_five <- c(rep(NA,25),twenty_five)
prices$days_25_P2 <- twenty_fiveforty_seven <- prices$Price1_DA[1:(length(prices$Price1_DA)-47)]
forty_seven <- c(rep(NA,47),forty_seven)
prices$days_47 <- forty_seven
forty_seven <- prices$Price2_ID[1:(length(prices$Price2_ID)-47)]
forty_seven <- c(rep(NA,47),forty_seven)
prices$days_47_P2 <- forty_sevenforty_eight <- prices$Price1_DA[1:(length(prices$Price1_DA)-48)]
forty_eight <- c(rep(NA,48),forty_eight)
prices$days_48 <- forty_eight
forty_eight <- prices$Price2_ID[1:(length(prices$Price2_ID)-48)]
forty_eight <- c(rep(NA,48),forty_eight)
prices$days_48_P2 <- forty_eightforty_nine <- prices$Price1_DA[1:(length(prices$Price1_DA)-49)]
forty_nine <- c(rep(NA,49),forty_nine)
prices$days_49 <- forty_nine
forty_nine <- prices$Price2_ID[1:(length(prices$Price2_ID)-49)]
forty_nine <- c(rep(NA,49),forty_nine)
prices$days_49_P2 <- forty_ninen <- nrow(prices)
### ok now last 6 hours in check
### This function is important to always start at specific time; for example midnight. Sometimes time might change,
### perhaps because we change the clock twice a year.
right_start <- function(mytime,start_time){
mytime <- date.parse(mytime)
if (start_time != format(as.POSIXct(mytime, format="%Y-%m-%d %H:%M"), format="%H:%M")){
mytime <- date.parse(mytime) + 3600
if (start_time != format(as.POSIXct(mytime, format="%Y-%m-%d %H:%M"), format="%H:%M")){
mytime <- date.parse(mytime) - 7200
}
}
return(mytime)
}time=format(as.POSIXct(prices$time, format="%Y-%m-%d %H:%M"), format="%H:%M")
any(time == start_hour)## Adding more relevant features. Some of them were inspired by Ziel and Weron (2017)
last_moment <- which(time == start_hour)
last_info <- rep(NA,nrow(prices))
last6_info <- rep(NA,nrow(prices))
diff_6_days <- rep(NA,nrow(prices)) # 6 hours actually
lag_mean <- rep(NA,length(prices$Price1_DA))
last_max <- rep(NA,nrow(prices))
last_min <- rep(NA,nrow(prices))
last_drop <- rep(NA,nrow(prices)) # drop in last 6 hrs.
last_3 <- rep(NA,nrow(prices)) # last 3 hrs
current_info <- NA
current_info6 <- NA
current_info_diff <- NA
current_lag_mean <- NA
current_max <- NA
current_min <- NA
current_drop <- NA
current_3 <- NA
i <- 26
while (i <= n){
last_info[i] <- current_info
last6_info[i] <- current_info6
diff_6_days[i] <- current_info_diff
lag_mean[i] <- current_lag_mean
last_max[i] <- current_max
last_min[i] <- current_min
last_drop[i] <- current_drop
last_3[i] <- current_3
if (any(last_moment == i)){
current_info <- prices$Price1_DA[i]
old <- current_info6
current_info6 <- mean(prices$Price1_DA[(i-5):i])
current_info_diff <- current_info6 - old
old <- current_lag_mean
current_lag_mean <- mean(prices$Price1_DA[(i-24):(i-1)])
current_max <- max(prices$Price1_DA[(i-24):(i-1)])
current_min <- min(prices$Price1_DA[(i-24):(i-1)])
current_drop <- prices$Price1_DA[i]-prices$Price1_DA[i-5]
prices$days_23[i] <- prices$days_23[i-1] + prices$days_23[i-24] - prices$days_23[i-25] + (current_lag_mean-old)/24
current_3 <- mean(prices$Price1_DA[(i-2):i])
}
i <- i + 1
}
prices$last_info <- last_info
prices$last6_info <- last6_info
prices$diff_6_info <- diff_6_days
prices$last_max <- last_max
prices$last_min <- last_min
prices$lag_mean <- lag_mean
prices$last_drop <- last_drop
prices$last_3 <- last_3### For Price 2 as well;
last_moment <- which(time == start_hour)
last_info <- rep(NA,nrow(prices))
last6_info <- rep(NA,nrow(prices))
diff_6_days <- rep(NA,nrow(prices))
lag_mean <- rep(NA,length(prices$Price2_ID))
last_max <- rep(NA,nrow(prices))
last_min <- rep(NA,nrow(prices))
current_info <- NA
current_info6 <- NA
current_info_diff <- NA
current_lag_mean <- NA
current_max <- NA
current_min <- NA
i <- 24
while (i <= n){
last_info[i] <- current_info
last6_info[i] <- current_info6
diff_6_days[i] <- current_info_diff
lag_mean[i] <- current_lag_mean
last_max[i] <- current_max
last_min[i] <- current_min
if (any(last_moment == i)){
current_info <- prices$Price2_ID[i]
old <- current_info6
current_info6 <- mean(prices$Price2_ID[(i-5):i])
current_info_diff <- current_info6 - old
current_lag_mean <- mean(prices$Price2_ID[(i-24):(i-1)])
current_max <- max(prices$Price2_ID[(i-24):(i-1)])
current_min <- min(prices$Price2_ID[(i-24):(i-1)])
}
i <- i + 1
}
prices$last_info_P2 <- last_info
prices$last6_info_P2 <- last6_info
prices$diff_6_info_P2 <- diff_6_days
prices$last_max_P2 <- last_max
prices$last_min_P2 <- last_min
prices$lag_mean_P2 <- lag_mean### daily average and difference (daily change) from yesterday to one day before.
lag_mean <- rep(0,length(prices$Price1_DA))
lag_mean_P2 <- rep(0,length(prices$Price1_DA))i <- 48
while(i <= n){
lag_mean[i] <- prices$lag_mean[i-24]
lag_mean_P2[i] <- prices$lag_mean_P2[i-24]
i <- i + 1
}
prices$lag_mean_2days <- lag_mean
prices$Daily_change <- prices$lag_mean - prices$lag_mean_2days
prices$lag_mean_2days_P2 <- lag_mean_P2
prices$Daily_change_P2 <- prices$lag_mean_P2 - prices$lag_mean_2days_P2# creating hours variable
prices$hours <- format(as.POSIXct(prices$time, format="%Y%m-%d %H:%M"), format="%H")
prices$hours <- as.character(prices$hours)## Day of the week effect;
day_of_week <- weekdays(prices$time)
weekdays <- model.matrix(~day_of_week)
weekdays <- as.data.frame(weekdays)
weekdays$`(Intercept)` <- NULL
prices <- cbind(prices,weekdays)result.df <- data.frame()
trading.date <- first_date
### Now the training and forecasting starts; loop below
while (trading.date <= last.date) {
if (print_progress) {
print(sprintf('Processing %s %s, %s', price.type, format(date.parse(trading.date), format="%Y-%m-%d %H:%M:%S"),
format(Sys.time(), "%b %d %H:%M:%S %Z(%z)", tz='CET')))
}train.first.moment <- round.POSIXt(date.parse(trading.date) - lookback_window * 24 * 3600, units = 'days')
# 5-6 months of history would format(right_start(trading.date,start_hour),format="%Y-%m-%d %H:%M:%S")
after.train.last.moment <- date.parse(trading.date)
test.first.moment <- trading.date
after.test.last.moment <- round.POSIXt(date.parse(trading.date) + 24 * 3600, units = 'days')
after.test.last.moment <- date.parse(trading.date) + 24 * 3600
train <- subset(prices, prices$time > as.POSIXct(c
(as.POSIXct(train.first.moment,tz = market_time_zone)),
tz = 'UTC') & prices$time <=
as.POSIXct(c(as.POSIXct(after.train.last.moment, tz = market_time_zone)), tz = 'UTC'))myhours <- train$hours[1:12]
myindexes <- c()
for(k in myhours){
myindexes <- c(myindexes,which(train$hours==k))
}
all_indexes <- 1:nrow(train)
half_train <- train[myindexes,] # creating training for first 12
myhours <- train$hours[13:24]
myindexes <- c()
for(k in myhours){
myindexes <- c(myindexes,which(train$hours==k))
}
all_indexes <- 1:nrow(train)
full_train <- train[myindexes,] # second halftest <- subset(prices, prices$time > as.POSIXct(c(as.POSIXct(test.first.moment, tz = market_time_zone))) &
prices$time <= as.POSIXct(c(as.POSIXct(after.test.last.moment, tz = market_time_zone)), tz = 'UTC'))
test$Dlog_ren_act <- test$Dlog_ren_fcst
test$Dlog_load <- test$Dlog_load
half_test <- test[1:floor(nrow(test)/2),] # creating test for first 12
full_test <- test[(nrow(half_test)+1):nrow(test),] # second half
# After getting dataset we split it on first and second 12 hours. Then we forecast seperately first and second 12 hours.
model1 <- lm( formula = train_equation, data = half_train)
y1 <- predict(model1, half_test)
names(y1) <- NULL
### Increase num.trees to get higher accuracy in prediction.model1 <- ranger( formula = train_equation, data = half_train, num.trees = 100)
y1 <- predict(model1, half_test)
y1 <- y1$predictions### Increase num.trees to get higher accuracy in prediction.
model2 <- ranger( formula = train_equation, data = full_train, num.trees = 100)y2 <- predict(model2, full_test)
y2 <- y2$predictions
y <- c(y1,y2)
result.df <- rbind(result.df,data.frame(trading.date = test$time,price.actual = test$Price1_DA , OLS_fcast = y))trading.date <- format(date.parse(trading.date) + 24 * 3600,format="%Y-%m-%d %H:%M:%S")
trading.date <- format(right_start(trading.date,start_hour),format="%Y-%m-%d %H:%M:%S")
}### After we get the results, we look at MAE and RMSE:
MAE <- sum((1/(nrow(result.df))) * abs(result.df$price.actual-result.df$OLS_fcast))
RMSE <- sqrt(sum((result.df$price.actual-result.df$OLS_fcast)^2)/nrow(result.df))