From 266ab9afea8a2b58504787df1410f5f863086cb8 Mon Sep 17 00:00:00 2001
From: MMenchero <marianamenchero@gmail.com>
Date: Thu, 9 Nov 2023 20:16:11 -0600
Subject: [PATCH] WIP: Added support for exogenous variables and improved plot
 function.

---
 R/date_conversion.R              |  2 +-
 R/timegpt_anomaly_detection.R    | 16 ++++++---
 R/timegpt_cross_validation.R     | 20 +++++++++--
 R/timegpt_forecast.R             | 31 ++++++++++-------
 R/timegpt_historic.R             | 16 ++++++---
 R/timegpt_plot.R                 | 58 +++++++++++++++++++++++++-------
 man/timegpt_anomaly_detection.Rd |  3 --
 man/timegpt_historic.Rd          |  3 --
 8 files changed, 108 insertions(+), 41 deletions(-)

diff --git a/R/date_conversion.R b/R/date_conversion.R
index 4151a39..82abd25 100644
--- a/R/date_conversion.R
+++ b/R/date_conversion.R
@@ -23,7 +23,7 @@ date_conversion <- function(df){
   }else if(cls == "Date"){
     freq <- "D"
 
-  }else if(cls %in% c("POSIXct", "POSIXt")){
+  }else if(cls %in% c("POSIXt", "POSIXct", "POSIXlt")){
     freq <- "H"
 
   }else{
diff --git a/R/timegpt_anomaly_detection.R b/R/timegpt_anomaly_detection.R
index b271d69..5e9f4e3 100644
--- a/R/timegpt_anomaly_detection.R
+++ b/R/timegpt_anomaly_detection.R
@@ -5,14 +5,13 @@
 #' @param id_col Column that identifies each series.
 #' @param time_col Column that identifies each timestep.
 #' @param target_col Column that contains the target variable.
-#' @param X_df A tsibble or a data frame with future exogenous variables.
 #' @param level The confidence level (0-100) for the prediction interval used in anomaly detection. Default is 99.
 #' @param clean_ex_first Clean exogenous signal before making the forecasts using TimeGPT.
 #'
 #' @return A tsibble or a data frame with the anomalies detected in the historical period.
 #' @export
 #'
-timegpt_anomaly_detection <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_col="y", X_df=NULL, level=c(99), clean_ex_first=TRUE){
+timegpt_anomaly_detection <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_col="y", level=c(99), clean_ex_first=TRUE){
 
   # Validation ----
   token <- get("NIXTLAR_TOKEN", envir = nixtlaR_env)
@@ -38,8 +37,17 @@ timegpt_anomaly_detection <- function(df, freq=NULL, id_col=NULL, time_col="ds",
     clean_ex_first = clean_ex_first
   )
 
-  # Add exogenous regressors here
-  # ----------------------------*
+  if(any(!(names(df) %in% c("unique_id", "ds", "y")))){
+    exogenous <- df |>
+      dplyr::select(-y)
+
+    x <- list(
+      columns = names(exogenous),
+      data = lapply(1:nrow(exogenous), function(i) as.list(exogenous[i,]))
+    )
+
+    timegpt_data[['x']] <- x
+  }
 
   if(length(level) > 1){
     message("Multiple levels are not allowed for anomaly detection. Will use the largest.")
diff --git a/R/timegpt_cross_validation.R b/R/timegpt_cross_validation.R
index 2f8373b..54cbcca 100644
--- a/R/timegpt_cross_validation.R
+++ b/R/timegpt_cross_validation.R
@@ -50,8 +50,24 @@ timegpt_cross_validation <- function(df, h=8, freq=NULL, id_col=NULL, time_col="
     clean_ex_first = clean_ex_first
   )
 
-  # Add exogenous regressors here
-  # ----------------------------*
+  if(!is.null(X_df)){
+    names(X_df)[which(names(X_df) == time_col)] <- "ds"
+    if(!is.null(id_col)){
+      names(X_df)[which(names(X_df) == id_col)] <- "unique_id"
+    }
+
+    exogenous <-  df |>
+      dplyr::select(-y)
+
+    exogenous <- rbind(exogenous, X_df)
+
+    x <- list(
+      columns = names(exogenous),
+      data = lapply(1:nrow(exogenous), function(i) as.list(exogenous[i,]))
+    )
+
+    timegpt_data[['x']] <- x
+  }
 
   if(!is.null(level)){
     level <- as.list(level)
diff --git a/R/timegpt_forecast.R b/R/timegpt_forecast.R
index bdedad9..a574858 100644
--- a/R/timegpt_forecast.R
+++ b/R/timegpt_forecast.R
@@ -43,17 +43,24 @@ timegpt_forecast <- function(df, h=8, freq=NULL, id_col=NULL, time_col="ds", tar
     clean_ex_first = clean_ex_first
     )
 
-  # if(!is.null(X_df)){
-  #   names(X_df)[which(names(X_df) == time_col)] <- "ds"
-  #   if(!is.null(id_col)){
-  #     names(X_df)[which(names(X_df) == id_col)] <- "unique_id"
-  #   }
-  #   x <- list(
-  #     columns = names(X_df),
-  #     data = lapply(1:nrow(X_df), function(i) as.list(X_df[i,]))
-  #   )
-  #   timegpt_data[["x"]] <- x
-  # }
+  if(!is.null(X_df)){
+    names(X_df)[which(names(X_df) == time_col)] <- "ds"
+    if(!is.null(id_col)){
+      names(X_df)[which(names(X_df) == id_col)] <- "unique_id"
+    }
+
+    exogenous <-  df |>
+      dplyr::select(-y)
+
+    exogenous <- rbind(exogenous, X_df)
+
+    x <- list(
+      columns = names(exogenous),
+      data = lapply(1:nrow(exogenous), function(i) as.list(exogenous[i,]))
+    )
+
+    timegpt_data[['x']] <- x
+  }
 
   if(!is.null(level)){
     level <- as.list(level)
@@ -126,7 +133,7 @@ timegpt_forecast <- function(df, h=8, freq=NULL, id_col=NULL, time_col="ds", tar
 
   # Generate fitted values ----
   if(add_history){
-    fitted <- timegpt_historic(df, freq=freq, id_col=id_col, time_col=time_col, target_col=target_col, X_df=X_df, level=level, finetune_steps=finetune_steps, clean_ex_first=clean_ex_first)
+    fitted <- timegpt_historic(df, freq=freq, id_col=id_col, time_col=time_col, target_col=target_col, level=level, finetune_steps=finetune_steps, clean_ex_first=clean_ex_first)
     if(tsibble::is_tsibble(df)){
       fcst <- dplyr::bind_rows(fitted, fcst)
     }else{
diff --git a/R/timegpt_historic.R b/R/timegpt_historic.R
index f5a9c6d..b9a7ea6 100644
--- a/R/timegpt_historic.R
+++ b/R/timegpt_historic.R
@@ -5,7 +5,6 @@
 #' @param id_col Column that identifies each series.
 #' @param time_col Column that identifies each timestep.
 #' @param target_col Column that contains the target variable.
-#' @param X_df A tsibble or a data frame with future exogenous variables.
 #' @param level The confidence levels (0-100) for the prediction intervals.
 #' @param finetune_steps Number of steps used to finetune TimeGPT in the new data.
 #' @param clean_ex_first Clean exogenous signal before making the forecasts using TimeGPT.
@@ -13,7 +12,7 @@
 #' @return TimeGPT's forecast for the in-sample period.
 #' @export
 #'
-timegpt_historic <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_col="y", X_df=NULL, level=NULL, finetune_steps=0, clean_ex_first=TRUE){
+timegpt_historic <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_col="y", level=NULL, finetune_steps=0, clean_ex_first=TRUE){
 
   # Validation ----
   token <- get("NIXTLAR_TOKEN", envir = nixtlaR_env)
@@ -40,8 +39,17 @@ timegpt_historic <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_c
     clean_ex_first = clean_ex_first
   )
 
-  # Add exogenous regressors here
-  # ----------------------------*
+  if(any(!(names(df) %in% c("unique_id", "ds", "y")))){
+    exogenous <- df |>
+      dplyr::select(-y)
+
+    x <- list(
+      columns = names(exogenous),
+      data = lapply(1:nrow(exogenous), function(i) as.list(exogenous[i,]))
+    )
+
+    timegpt_data[['x']] <- x
+  }
 
   if(!is.null(level)){
     level <- as.list(level)
diff --git a/R/timegpt_plot.R b/R/timegpt_plot.R
index 322d2bd..93a306a 100644
--- a/R/timegpt_plot.R
+++ b/R/timegpt_plot.R
@@ -15,6 +15,10 @@
 #'
 timegpt_plot <- function(df, fcst=NULL, h=NULL, id_col=NULL, time_col="ds", target_col="y", unique_ids = NULL, max_insample_length=NULL, plot_anomalies=FALSE){
 
+  if(!tsibble::is_tsibble(df) & !is.data.frame(df)){
+    stop("Only tsibbles or data frames are allowed.")
+  }
+
   # Select facets ----
   nrow <- 4
   ncol <- 2
@@ -25,14 +29,16 @@ timegpt_plot <- function(df, fcst=NULL, h=NULL, id_col=NULL, time_col="ds", targ
   if(!is.null(id_col)){
     names(df)[which(names(df) == id_col)] <- "unique_id"
 
+    ids <- unique(df$unique_id)
+    if(length(ids) == 2){ # reshape for better viz
+      nrow <- 2
+      ncol <- 1
+    }
+
     ## Select time series if there are more than 8 ----
-    if(length(unique(df$unique_id)) > 8){
+    if(length(ids) > 8){
       if(!is.null(unique_ids)){
         ids <- unique_ids[1:min(length(unique_ids), 8)]
-        if(length(ids) == 2){ # reshape for better viz
-          nrow = 2
-          ncol = 1
-        }
       }else{
         ids <- sample(unique(df$unique_id), size=8, replace=FALSE)
       }
@@ -47,13 +53,27 @@ timegpt_plot <- function(df, fcst=NULL, h=NULL, id_col=NULL, time_col="ds", targ
     }
   }
 
-  # Check for cross validation output
-  cross_validation <- FALSE
-  if("cutoff" %in% names(fcst)){
-    cross_validation <- TRUE
-    if(plot_anomalies){
-      message("Can't plot anomalies and cross validation output at the same time. Setting plot_anomalies=FALSE")
-      plot_anomalies <- FALSE
+  # Convert dates if necessary ----
+  # ggplot2 needs ds as a Date or date-time, while TimeGPT's API requires it as a character string
+  cls <- class(df$ds)[1]
+  if(!(cls %in% c("Date", "POSIXt", "POSIXct", "POSIXlt"))){
+
+    if(tsibble::is_tsibble(df)){
+      df_list <- nixtlaR::date_conversion(df)
+      df <- df_list$df
+      freq <- df_list$freq
+    }else{
+      freq <- nixtlaR::infer_frequency(df)
+    }
+
+    if(is.null(freq)){
+      stop("Can't figure out the frequency of the data. Please convert time_col to Date or POSIXt.")
+    }
+
+    if(freq == "H"){
+      df$ds <- lubridate::ymd_hms(df$ds)
+    }else{
+      df$ds <- lubridate::ymd(df$ds)
     }
   }
 
@@ -74,6 +94,10 @@ timegpt_plot <- function(df, fcst=NULL, h=NULL, id_col=NULL, time_col="ds", targ
 
   }else{
     # Plot historical values and forecast ----
+    if(!tsibble::is_tsibble(fcst) & !is.data.frame(fcst)){
+      stop("fcst needs to be the output of timegpt_forecast, timegpt_historic, timegpt_anomaly_detection or timegpt_cross_validation.")
+    }
+
     color_vals <- c("#B5838D", "steelblue")
 
     # Rename forecast columns ----
@@ -83,6 +107,16 @@ timegpt_plot <- function(df, fcst=NULL, h=NULL, id_col=NULL, time_col="ds", targ
       names(fcst)[which(names(fcst) == id_col)] <- "unique_id"
     }
 
+    # Check for cross validation output
+    cross_validation <- FALSE
+    if("cutoff" %in% names(fcst)){
+      cross_validation <- TRUE
+      if(plot_anomalies){
+        message("Can't plot anomalies and cross validation output at the same time. Setting plot_anomalies=FALSE")
+        plot_anomalies <- FALSE
+      }
+    }
+
     if(!is.null(max_insample_length)){
       df <- df |>
         dplyr::group_by(.data$unique_id) |>
diff --git a/man/timegpt_anomaly_detection.Rd b/man/timegpt_anomaly_detection.Rd
index c884c29..ebdbaf2 100644
--- a/man/timegpt_anomaly_detection.Rd
+++ b/man/timegpt_anomaly_detection.Rd
@@ -10,7 +10,6 @@ timegpt_anomaly_detection(
   id_col = NULL,
   time_col = "ds",
   target_col = "y",
-  X_df = NULL,
   level = c(99),
   clean_ex_first = TRUE
 )
@@ -26,8 +25,6 @@ timegpt_anomaly_detection(
 
 \item{target_col}{Column that contains the target variable.}
 
-\item{X_df}{A tsibble or a data frame with future exogenous variables.}
-
 \item{level}{The confidence level (0-100) for the prediction interval used in anomaly detection. Default is 99.}
 
 \item{clean_ex_first}{Clean exogenous signal before making the forecasts using TimeGPT.}
diff --git a/man/timegpt_historic.Rd b/man/timegpt_historic.Rd
index 9b87630..375ab0c 100644
--- a/man/timegpt_historic.Rd
+++ b/man/timegpt_historic.Rd
@@ -10,7 +10,6 @@ timegpt_historic(
   id_col = NULL,
   time_col = "ds",
   target_col = "y",
-  X_df = NULL,
   level = NULL,
   finetune_steps = 0,
   clean_ex_first = TRUE
@@ -27,8 +26,6 @@ timegpt_historic(
 
 \item{target_col}{Column that contains the target variable.}
 
-\item{X_df}{A tsibble or a data frame with future exogenous variables.}
-
 \item{level}{The confidence levels (0-100) for the prediction intervals.}
 
 \item{finetune_steps}{Number of steps used to finetune TimeGPT in the new data.}