From 266ab9afea8a2b58504787df1410f5f863086cb8 Mon Sep 17 00:00:00 2001 From: MMenchero Date: Thu, 9 Nov 2023 20:16:11 -0600 Subject: [PATCH] WIP: Added support for exogenous variables and improved plot function. --- R/date_conversion.R | 2 +- R/timegpt_anomaly_detection.R | 16 ++++++--- R/timegpt_cross_validation.R | 20 +++++++++-- R/timegpt_forecast.R | 31 ++++++++++------- R/timegpt_historic.R | 16 ++++++--- R/timegpt_plot.R | 58 +++++++++++++++++++++++++------- man/timegpt_anomaly_detection.Rd | 3 -- man/timegpt_historic.Rd | 3 -- 8 files changed, 108 insertions(+), 41 deletions(-) diff --git a/R/date_conversion.R b/R/date_conversion.R index 4151a39..82abd25 100644 --- a/R/date_conversion.R +++ b/R/date_conversion.R @@ -23,7 +23,7 @@ date_conversion <- function(df){ }else if(cls == "Date"){ freq <- "D" - }else if(cls %in% c("POSIXct", "POSIXt")){ + }else if(cls %in% c("POSIXt", "POSIXct", "POSIXlt")){ freq <- "H" }else{ diff --git a/R/timegpt_anomaly_detection.R b/R/timegpt_anomaly_detection.R index b271d69..5e9f4e3 100644 --- a/R/timegpt_anomaly_detection.R +++ b/R/timegpt_anomaly_detection.R @@ -5,14 +5,13 @@ #' @param id_col Column that identifies each series. #' @param time_col Column that identifies each timestep. #' @param target_col Column that contains the target variable. -#' @param X_df A tsibble or a data frame with future exogenous variables. #' @param level The confidence level (0-100) for the prediction interval used in anomaly detection. Default is 99. #' @param clean_ex_first Clean exogenous signal before making the forecasts using TimeGPT. #' #' @return A tsibble or a data frame with the anomalies detected in the historical period. #' @export #' -timegpt_anomaly_detection <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_col="y", X_df=NULL, level=c(99), clean_ex_first=TRUE){ +timegpt_anomaly_detection <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_col="y", level=c(99), clean_ex_first=TRUE){ # Validation ---- token <- get("NIXTLAR_TOKEN", envir = nixtlaR_env) @@ -38,8 +37,17 @@ timegpt_anomaly_detection <- function(df, freq=NULL, id_col=NULL, time_col="ds", clean_ex_first = clean_ex_first ) - # Add exogenous regressors here - # ----------------------------* + if(any(!(names(df) %in% c("unique_id", "ds", "y")))){ + exogenous <- df |> + dplyr::select(-y) + + x <- list( + columns = names(exogenous), + data = lapply(1:nrow(exogenous), function(i) as.list(exogenous[i,])) + ) + + timegpt_data[['x']] <- x + } if(length(level) > 1){ message("Multiple levels are not allowed for anomaly detection. Will use the largest.") diff --git a/R/timegpt_cross_validation.R b/R/timegpt_cross_validation.R index 2f8373b..54cbcca 100644 --- a/R/timegpt_cross_validation.R +++ b/R/timegpt_cross_validation.R @@ -50,8 +50,24 @@ timegpt_cross_validation <- function(df, h=8, freq=NULL, id_col=NULL, time_col=" clean_ex_first = clean_ex_first ) - # Add exogenous regressors here - # ----------------------------* + if(!is.null(X_df)){ + names(X_df)[which(names(X_df) == time_col)] <- "ds" + if(!is.null(id_col)){ + names(X_df)[which(names(X_df) == id_col)] <- "unique_id" + } + + exogenous <- df |> + dplyr::select(-y) + + exogenous <- rbind(exogenous, X_df) + + x <- list( + columns = names(exogenous), + data = lapply(1:nrow(exogenous), function(i) as.list(exogenous[i,])) + ) + + timegpt_data[['x']] <- x + } if(!is.null(level)){ level <- as.list(level) diff --git a/R/timegpt_forecast.R b/R/timegpt_forecast.R index bdedad9..a574858 100644 --- a/R/timegpt_forecast.R +++ b/R/timegpt_forecast.R @@ -43,17 +43,24 @@ timegpt_forecast <- function(df, h=8, freq=NULL, id_col=NULL, time_col="ds", tar clean_ex_first = clean_ex_first ) - # if(!is.null(X_df)){ - # names(X_df)[which(names(X_df) == time_col)] <- "ds" - # if(!is.null(id_col)){ - # names(X_df)[which(names(X_df) == id_col)] <- "unique_id" - # } - # x <- list( - # columns = names(X_df), - # data = lapply(1:nrow(X_df), function(i) as.list(X_df[i,])) - # ) - # timegpt_data[["x"]] <- x - # } + if(!is.null(X_df)){ + names(X_df)[which(names(X_df) == time_col)] <- "ds" + if(!is.null(id_col)){ + names(X_df)[which(names(X_df) == id_col)] <- "unique_id" + } + + exogenous <- df |> + dplyr::select(-y) + + exogenous <- rbind(exogenous, X_df) + + x <- list( + columns = names(exogenous), + data = lapply(1:nrow(exogenous), function(i) as.list(exogenous[i,])) + ) + + timegpt_data[['x']] <- x + } if(!is.null(level)){ level <- as.list(level) @@ -126,7 +133,7 @@ timegpt_forecast <- function(df, h=8, freq=NULL, id_col=NULL, time_col="ds", tar # Generate fitted values ---- if(add_history){ - fitted <- timegpt_historic(df, freq=freq, id_col=id_col, time_col=time_col, target_col=target_col, X_df=X_df, level=level, finetune_steps=finetune_steps, clean_ex_first=clean_ex_first) + fitted <- timegpt_historic(df, freq=freq, id_col=id_col, time_col=time_col, target_col=target_col, level=level, finetune_steps=finetune_steps, clean_ex_first=clean_ex_first) if(tsibble::is_tsibble(df)){ fcst <- dplyr::bind_rows(fitted, fcst) }else{ diff --git a/R/timegpt_historic.R b/R/timegpt_historic.R index f5a9c6d..b9a7ea6 100644 --- a/R/timegpt_historic.R +++ b/R/timegpt_historic.R @@ -5,7 +5,6 @@ #' @param id_col Column that identifies each series. #' @param time_col Column that identifies each timestep. #' @param target_col Column that contains the target variable. -#' @param X_df A tsibble or a data frame with future exogenous variables. #' @param level The confidence levels (0-100) for the prediction intervals. #' @param finetune_steps Number of steps used to finetune TimeGPT in the new data. #' @param clean_ex_first Clean exogenous signal before making the forecasts using TimeGPT. @@ -13,7 +12,7 @@ #' @return TimeGPT's forecast for the in-sample period. #' @export #' -timegpt_historic <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_col="y", X_df=NULL, level=NULL, finetune_steps=0, clean_ex_first=TRUE){ +timegpt_historic <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_col="y", level=NULL, finetune_steps=0, clean_ex_first=TRUE){ # Validation ---- token <- get("NIXTLAR_TOKEN", envir = nixtlaR_env) @@ -40,8 +39,17 @@ timegpt_historic <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_c clean_ex_first = clean_ex_first ) - # Add exogenous regressors here - # ----------------------------* + if(any(!(names(df) %in% c("unique_id", "ds", "y")))){ + exogenous <- df |> + dplyr::select(-y) + + x <- list( + columns = names(exogenous), + data = lapply(1:nrow(exogenous), function(i) as.list(exogenous[i,])) + ) + + timegpt_data[['x']] <- x + } if(!is.null(level)){ level <- as.list(level) diff --git a/R/timegpt_plot.R b/R/timegpt_plot.R index 322d2bd..93a306a 100644 --- a/R/timegpt_plot.R +++ b/R/timegpt_plot.R @@ -15,6 +15,10 @@ #' timegpt_plot <- function(df, fcst=NULL, h=NULL, id_col=NULL, time_col="ds", target_col="y", unique_ids = NULL, max_insample_length=NULL, plot_anomalies=FALSE){ + if(!tsibble::is_tsibble(df) & !is.data.frame(df)){ + stop("Only tsibbles or data frames are allowed.") + } + # Select facets ---- nrow <- 4 ncol <- 2 @@ -25,14 +29,16 @@ timegpt_plot <- function(df, fcst=NULL, h=NULL, id_col=NULL, time_col="ds", targ if(!is.null(id_col)){ names(df)[which(names(df) == id_col)] <- "unique_id" + ids <- unique(df$unique_id) + if(length(ids) == 2){ # reshape for better viz + nrow <- 2 + ncol <- 1 + } + ## Select time series if there are more than 8 ---- - if(length(unique(df$unique_id)) > 8){ + if(length(ids) > 8){ if(!is.null(unique_ids)){ ids <- unique_ids[1:min(length(unique_ids), 8)] - if(length(ids) == 2){ # reshape for better viz - nrow = 2 - ncol = 1 - } }else{ ids <- sample(unique(df$unique_id), size=8, replace=FALSE) } @@ -47,13 +53,27 @@ timegpt_plot <- function(df, fcst=NULL, h=NULL, id_col=NULL, time_col="ds", targ } } - # Check for cross validation output - cross_validation <- FALSE - if("cutoff" %in% names(fcst)){ - cross_validation <- TRUE - if(plot_anomalies){ - message("Can't plot anomalies and cross validation output at the same time. Setting plot_anomalies=FALSE") - plot_anomalies <- FALSE + # Convert dates if necessary ---- + # ggplot2 requires ds to be Dates while TimeGPT's API requires them to be chr + cls <- class(df$ds)[1] + if(!(cls %in% c("Date", "POSIXt", "POSIXct", "POSIXlt"))){ + + if(tsibble::is_tsibble(df)){ + df_list <- nixtlaR::date_conversion(df) + df <- df_list$df + freq <- df_list$freq + }else{ + freq <- nixtlaR::infer_frequency(df) + } + + if(is.null(freq)){ + stop("Can't figure out the frequency of the data. Please convert time_col to Date or POSIXt.") + } + + if(freq == "H"){ + df$ds <- lubridate::ymd_hms(df$ds) + }else{ + df$ds <- lubridate::ymd(df$ds) } } @@ -74,6 +94,10 @@ timegpt_plot <- function(df, fcst=NULL, h=NULL, id_col=NULL, time_col="ds", targ }else{ # Plot historical values and forecast ---- + if(!tsibble::is_tsibble(fcst) & !is.data.frame(fcst)){ + stop("fcst needs to be the output of timegpt_forecast, timegpt_historic, timegpt_anomaly_detection or timegpt_cross_validation.") + } + color_vals <- c("#B5838D", "steelblue") # Rename forecast columns ---- @@ -83,6 +107,16 @@ timegpt_plot <- function(df, fcst=NULL, h=NULL, id_col=NULL, time_col="ds", targ names(fcst)[which(names(fcst) == id_col)] <- "unique_id" } + # Check for cross validation output + cross_validation <- FALSE + if("cutoff" %in% names(fcst)){ + cross_validation <- TRUE + if(plot_anomalies){ + message("Can't plot anomalies and cross validation output at the same time. Setting plot_anomalies=FALSE") + plot_anomalies <- FALSE + } + } + if(!is.null(max_insample_length)){ df <- df |> dplyr::group_by(.data$unique_id) |> diff --git a/man/timegpt_anomaly_detection.Rd b/man/timegpt_anomaly_detection.Rd index c884c29..ebdbaf2 100644 --- a/man/timegpt_anomaly_detection.Rd +++ b/man/timegpt_anomaly_detection.Rd @@ -10,7 +10,6 @@ timegpt_anomaly_detection( id_col = NULL, time_col = "ds", target_col = "y", - X_df = NULL, level = c(99), clean_ex_first = TRUE ) @@ -26,8 +25,6 @@ timegpt_anomaly_detection( \item{target_col}{Column that contains the target variable.} -\item{X_df}{A tsibble or a data frame with future exogenous variables.} - \item{level}{The confidence level (0-100) for the prediction interval used in anomaly detection. Default is 99.} \item{clean_ex_first}{Clean exogenous signal before making the forecasts using TimeGPT.} diff --git a/man/timegpt_historic.Rd b/man/timegpt_historic.Rd index 9b87630..375ab0c 100644 --- a/man/timegpt_historic.Rd +++ b/man/timegpt_historic.Rd @@ -10,7 +10,6 @@ timegpt_historic( id_col = NULL, time_col = "ds", target_col = "y", - X_df = NULL, level = NULL, finetune_steps = 0, clean_ex_first = TRUE @@ -27,8 +26,6 @@ timegpt_historic( \item{target_col}{Column that contains the target variable.} -\item{X_df}{A tsibble or a data frame with future exogenous variables.} - \item{level}{The confidence levels (0-100) for the prediction intervals.} \item{finetune_steps}{Number of steps used to finetune TimeGPT in the new data.}