plot_jobstats

#!/usr/bin/env Rscript

# Read Uppmax jobstats files, analyse resource usage, and produce plots if
# requested.
#
# Latest is available at https://github.com/UPPMAX/jobstats
#
# DONE: add 'static' maxmem
# DONE: added bianca info
# DONE: add mem2TB node for snowy
# DONE: add core-busy dot, depend on --cpu-free threshold
# DONE: add finishedjobinfo max-mem line even with extended jobstats
# DONE: add finishedjobinfo max-mem line to end of plot only if it is greater than the greatest max-mem reported in the jobstats data
# DONE: handle extended jobstats
# DONE: --version
# TODO: implement receiving job info on stdin with --stdin 
# TODO: decide what to do about multiple-node jobs with respect to produceFlags()
# TODO: set limits below using arguments
# TODO: better documentation
#
# see processArgs() for command-line argument structure

# Version string; the main section prints it and exits when the first
# command-line argument is --version
.version <- "2023-11-16"
# When TRUE, the main section and the plotting functions print intermediate
# state with cat()/print()
.debug <- FALSE

# Guard for the unfinished --stdin mode (see the TODO at the top of the
# file): processArgs() stops via stopifnot() if --stdin is given while this
# is FALSE
.enable_stdin <- FALSE

# User options.  processArgs() changes several of these with <<- according to
# the command line: -m sets do.memory, --no-extended clears do.extended,
# --paging sets do.paging and -v sets do.verbose.
do.flags <- TRUE  # print list of flags to stdout
do.memory <- FALSE  # include flags for memory-caution-only jobs
do.extended <- TRUE  # the jobstats file is the extended version
do.paging <- FALSE  # include PAGE_IN/PAGE_OUT information from extended jobstats
do.verbose <- FALSE  # produce verbose instead of terse flags

# When to produce flags, for more see produceFlags().  Each flag is raised
# when the corresponding fraction used is strictly below the threshold:
#   overbooked / half_overbooked / severely_overbooked compare the larger of
#       the fraction of cores used and the fraction of memory used
#   cores_overbooked compares the fraction of cores used
#   mem_overbooked compares the fraction of memory used (multi-core jobs only)
#   core_mem_overbooked compares max memory used against this fraction of the
#       memory belonging to the cores that were busy
flag_overbooked.fraction <- 0.8
flag_half_overbooked.fraction <- 0.5
flag_severely_overbooked.fraction <- 0.25
flag_cores_overbooked.fraction <- 0.8
flag_mem_overbooked.fraction <- 0.25
flag_core_mem_overbooked.fraction <- 0.5

# In any given time slot, a CPU with a busy percentage not above this is
# counted as unused.  Settable with --cpu-free.
cpu_free <- 3.0

# Allow table output to be very wide, and don't convert strings to factors
options(width = 500, stringsAsFactors = FALSE)

# Basic plot options.  processArgs() changes do.plot (-n), do.big.plot (-b)
# and do.core.busy (--no-core-busy) according to the command line.
do.plot <- TRUE  # produce the plot
do.big.plot <- FALSE  # produce plot with twice the dimensions
do.maxmem <- TRUE  # include trace of maxmem used by the process
do.stdin <- FALSE  # read job-specific info from stdin rather than from the command line
do.core.busy <- TRUE  # plot a blue marker for the number of busy cores, as determined by the cpu_free threshold (settable by --cpu-free)
# Colour, line type and line width of the memory-used trace
col.GB <- "black"
lty.GB <- 1  # memory lines
lwd.GB <- 2  # memory lines
# Colour, line type and line width of the summed core-usage trace
col.core <- "blue"
lty.core <- 2  # core usage lines
lwd.core <- 2  # core usage lines
pch.core.busy <- "-"  # plotting character for the busy-core count
maxmem.col <- "black"
maxmem.lty <- 3  # for maxmem
maxmem.lwd <- 1  # for maxmem
# Paging markers drawn by plotJobstatsPanel() when do.paging is set
paging.col <- "green4"
paging.cex <- 0.8
paging.in.pch <- 2
paging.out.pch <- 6
swap.col <- "red"
swap.pch <- 16  # if swap was used
top.lines <- 9  # lines to leave at the top of the plot
main.line <- top.lines - 2  # main. options control the title
main.cex <- 1.7
main.sep <- "  "
user.line <- 1.5  # user. for the user name and project
user.cex <- 1.4
user.adj <- 0.02
axis.cex <- 1.2  # axis1. for x-axis, axis2. for y-axis
axis1.line <- 2.5
axis1.cex <- 1.2
axis2.line <- 4.2
flags.line <- user.line  # flags. for the usage flags
flags.sep <- ",  "  # between flags on the same header line
flags.line.sep <- ",\n"  # between header lines of flags
flags.wrap <- 3  # flags per header line; -v/--verbose lowers this to 2
flags.col <- "red3"
flags.cex <- 1.2
flags.adj <- 0.98
# NOTE(review): the pagein.* / pageout.* settings below are not referenced by
# the plotting code visible in this file, which uses the paging.* settings
# above -- confirm before relying on or removing them.
pagein.pch <- 2
pagein.cex <- 1.2
pagein.col <- "green4"
pageout.pch <- 6
pageout.cex <- 1.2
pageout.col <- "coral3"

# When are data sampled, used for producing plots.  plotJobstatsPanel() places
# each sample at (minutes since the first sample) + sampling.plot.offset and
# extends the x range by sampling.window beyond the last sample.
sampling.plot.offset <- 2.5
sampling.window <- 5

# first column in jobstats file for core usage
first.core.column <- 9  # for extended jobstats; 6 for --no-extended

# sizes of available nodes for determining misbooking, sorted in
# increasing mem size
#
# For each cluster there are two entries:
#   <cluster>          named vector of node type -> memory size; produceFlags()
#                      compares these against GB_LIMIT and max memory used, so
#                      the values are in GB
#   <cluster>.default  name of the node type treated as the default booking
#
# produceFlags() picks the first node type whose size is large enough, which
# is why each vector must stay sorted in increasing size.
node.types <- list(
    milou           = c(mem128GB = 128, mem256GB = 256, mem512GB = 512),
    milou.default   = "mem128GB",
    irma            = c(mem256GB = 256),
    irma.default    = "mem256GB",
    kalkyl          = c(mem24GB = 24, mem48GB = 48, mem72GB = 72),
    kalkyl.default  = "mem24GB",
    tintin          = c(mem64GB = 64, mem128GB = 128),
    tintin.default  = "mem64GB",
    rackham         = c(mem128GB = 128, mem256GB = 256, mem1TB = 1024),
    rackham.default = "mem128GB",
    snowy           = c(mem128GB = 128, mem256GB = 256, mem512GB = 512, mem2TB = 2048, veryfat = 4096),
    snowy.default   = "mem128GB",
    bianca          = c(mem128GB = 128, mem256GB = 256, mem512GB = 512),
    bianca.default  = "mem128GB",
    halvan          = c(halvan = 2048),
    halvan.default  = "halvan")


# Process command-line args.  Note these are order dependent: option flags
# come first, followed by exactly one job description.
#
# Options (peeled off the front until a non-option is seen):
#   -n, --no-plot       clear do.plot
#   -v, --verbose       set do.verbose and wrap flags two per header line
#   -m, --memory        set do.memory
#   --no-extended       clear do.extended and set first.core.column to 6
#   --paging            set do.paging
#   -c, --cluster NAME  cluster name; overrides the one in the job fields
#   -b, --big-plot      set do.big.plot
#   --cpu-free PCT      set cpu_free
#   --no-core-busy      clear do.core.busy
#
# Job description, one of:
#   --fji | --finishedjobinfo | --squeue  followed by the fields
#       jobid cluster jobstate user project jobname endtime runtime flags
#       coresbooked maxmem core_list node_list jobstats_file_list
#     as produced by the jobstats Perl script; --squeue describes a running
#     job and adds a final field, the run time limit in minutes
#   --slurm | --db | --database  followed by the same fields without maxmem
#     (** NOT YET IMPLEMENTED ** upstream; for now parsed like --fji)
#   anything else: all remaining arguments are jobstats file names
#
# "." in the flags, coresbooked or maxmem field means "not available".
#
# Returns the job list, with flag_list and data_list initialised empty.
# Stops with a message if no job description is given, if an option is
# missing its value, or if too few job fields are supplied.
#
processArgs <- function(args) {
    job <- list()

    # Value belonging to a two-part option such as --cluster.  Fail clearly
    # rather than silently storing NA when the value is absent.
    optionValue <- function(args) {
        if (length(args) < 2 || is.na(args[2]))
            stop("option ", args[1], " requires a value", call. = FALSE)
        args[2]
    }

    splitCommas <- function(field) unlist(strsplit(field, ",", fixed = TRUE))

    # Fill in the positional job fields shared by every formatted input mode.
    # maxmem is only present (as field 11) when has.maxmem is TRUE.
    parseJobFields <- function(job, args, has.maxmem) {
        n.required <- if (has.maxmem) 14 else 13
        if (length(args) < n.required || anyNA(args[seq_len(n.required)]))
            stop("expected at least ", n.required,
                 " job description fields but got ", length(args),
                 call. = FALSE)
        job$jobid     <- as.integer(args[1])
        # a cluster given with -c/--cluster takes precedence
        if (is.null(job$cluster)) job$cluster <- as.character(args[2])
        job$jobstate  <- as.character(args[3])
        job$user      <- as.character(args[4])
        job$project   <- as.character(args[5])
        job$jobname   <- as.character(args[6])
        job$endtime   <- as.character(args[7])
        job$runtime   <- as.character(args[8])
        job$initial_flag_list <- if (args[9] == ".") character(0)
            else splitCommas(args[9])
        job$booked    <- if (args[10] == ".") NA
            else as.integer(args[10])
        rest <- args[-seq_len(10)]
        if (has.maxmem) {
            job$maxmem <- if (rest[1] == ".") NA
                else as.numeric(rest[1])
            rest <- rest[-1]
        }
        job$core_list <- as.integer(splitCommas(rest[1]))
        job$node_list <- splitCommas(rest[2])
        job$file_list <- splitCommas(rest[3])
        job
    }

    # peel off the first args, which specify options; each branch returns the
    # number of arguments it consumed, 0 meaning "not an option"
    repeat {
        if (length(args) == 0) break
        consumed <- switch(args[1],
            "-n" =, "--no-plot" =  { do.plot <<- FALSE; 1 },
            "-v" =, "--verbose" =  { do.verbose <<- TRUE
                                     flags.wrap <<- 2
                                     1 },
            "-m" =, "--memory"  =  { do.memory <<- TRUE; 1 },
            "--no-extended"     =  { do.extended <<- FALSE
                                     first.core.column <<- 6  # within jobstats file
                                     1 },
            "--paging"          =  { do.paging <<- TRUE; 1 },
            "-c" =, "--cluster" =  { job$cluster <- optionValue(args); 2 },
            "-b" =, "--big-plot" = { do.big.plot <<- TRUE; 1 },
            "--cpu-free"        =  { value <- suppressWarnings(as.numeric(optionValue(args)))
                                     if (is.na(value))
                                         stop("option --cpu-free requires a numeric value",
                                              call. = FALSE)
                                     cpu_free <<- value
                                     2 },
            "--no-core-busy"    =  { do.core.busy <<- FALSE; 1 },
            0)
        if (consumed == 0) break
        args <- args[-seq_len(consumed)]
    }

    if (length(args) == 0)
        stop("no job description or jobstats files given on the command line",
             call. = FALSE)

    # remaining args are either formatted lines (if next arg is --fji,
    # --squeue, --slurm or --db) or a list of jobstats filenames to read
    mode <- args[1]

    if (mode %in% c("--fji", "--finishedjobinfo", "--squeue")) {

        # a finishedjobinfo column-wise set of args as produced by the jobstats
        # Perl script. If --squeue, it is a currently running job, and we have a
        # final arg that is run timelimit in minutes
        job$data_type <- if (mode == "--squeue") "squeue" else "finishedjobinfo"
        args <- args[-1]

        if (length(args) > 0 && args[1] == "--stdin") {
            do.stdin <<- TRUE
            stopifnot(.enable_stdin)
        } else {
            job <- parseJobFields(job, args, has.maxmem = TRUE)
            if (job$data_type == "squeue")
                job$timelimit_minutes <- as.integer(args[15]) # last arg
        }

    } else if (mode == "--slurm") {

        # args as produced by Lennart Karlsson's script slurm.epilog at the end
        # of SLURM jobs.  ** NOT YET IMPLEMENTED **
        # for now act as if it is same as --fji, without the maxmem field
        job$data_type <- "slurm"
        job <- parseJobFields(job, args[-1], has.maxmem = FALSE)

    } else if (mode == "--db" || mode == "--database") {

        # column-wise set of args as produced by Martin Dahlo's sqlite3
        # database  ** NOT YET IMPLEMENTED **
        job$data_type <- "database"
        job <- parseJobFields(job, args[-1], has.maxmem = FALSE)

    } else {

        # arguments are a list of jobstats files
        job$data_type <- "file"
        if (is.null(job$cluster)) job$cluster <- "unknown"
        job$jobid     <- basename(args[1])
        job$file_list <- args
        # dummy up a node list
        job$node_list <- job$file_list

    }

    if (do.maxmem && is.null(job$maxmem))
        write(paste0("job ", job$jobid, " missing maxmem information but we expected it"),
              stderr())
    job$flag_list <- character(0)
    job$data_list <- list()
    return(job)
}


# Read one jobstats file and return it as a data.frame with named columns and
# the attributes "file" and "node" filled in.
#
# file    path of the jobstats file; its first line is skipped
# node    node name, stored as an attribute on the result
# ncores  number of cores expected from the job description, or NA (or NULL)
#         when unknown
#
# The column layout depends on the globals do.extended and first.core.column.
# Without --no-extended the columns are LOCALTIME TIME GB_LIMIT GB_USED
# GB_MAX_USED GB_SWAP_USED PAGE_IN PAGE_OUT core1..coreN; GB_MAX_USED is
# turned into a running maximum and negative PAGE_IN/PAGE_OUT are set to 0.
# With --no-extended they are LOCALTIME TIME GB_LIMIT GB_USED GB_SWAP_USED
# core1..coreN.
#
readJobstatsFile <- function(file, node = "unknown", ncores = NA) {
    # callers without core information may hand over NULL; treat as unknown
    if (is.null(ncores) || length(ncores) == 0) ncores <- NA

    warn.core.count <- function(found)
        write(paste0("inconsistent core counts: ", ncores, " from jobstats and ", found, " from jobstats file"),
              stderr())

    if (! do.extended) {  # not using extended jobstats file
        dat <- read.table(file, header = FALSE, skip = 1)
        num.cores <- ncol(dat) - first.core.column + 1
        if (!is.na(ncores) && num.cores != ncores)
            warn.core.count(num.cores)
        names(dat) <- c("LOCALTIME", "TIME", "GB_LIMIT", "GB_USED", "GB_SWAP_USED",
            paste0("core", seq_len(num.cores)))
    } else {  # using extended jobstats file
        dat <- read.table(file, header = FALSE, fill = TRUE, skip = 1)
        num.cores <- ncol(dat) - first.core.column + 1
        # look for formatting bug, where col 6 and 7 are jammed together
        # (e.g. "0.0-1").  correct for all lines, or some lines
        #
        # split.field takes a character vector and returns a character
        # matrix with one row per element, split before the last "-"
        split.field <- function(.x) {
            e <- sub("^(.+)(-.+)$", "\\1\t\\2", .x, perl = TRUE)
            do.call(rbind, strsplit(e, "\t", fixed = TRUE))
        }
        if (!is.na(ncores) && num.cores != ncores) {  # this may happen if all lines include merged columns
            dat <- cbind(dat[,1:5], split.field(dat[,6]), dat[,7:ncol(dat)])
            # the repair adds a column, so the core count must be recomputed
            # or the last core column would be left without a name
            num.cores <- ncol(dat) - first.core.column + 1
            if (num.cores != ncores)
                warn.core.count(num.cores)
        } else {
            # fill = TRUE pads short (merged) lines with NA in the last column
            rows.to.split <- is.na(dat[,ncol(dat)])
            if (sum(rows.to.split) > 0) {
                dat.to.split <- dat[rows.to.split, , drop=FALSE]
                # pass the column as a vector: a one-column data.frame would be
                # deparsed into a single string by sub()
                new67 <- split.field(dat.to.split[[6]])
                dat.to.split <- cbind(dat.to.split[,1:5], new67, dat.to.split[, 7:(ncol(dat.to.split) - 1), drop=FALSE])
                dat[rows.to.split, ] <- dat.to.split
            }
        }
        names(dat) <- c("LOCALTIME", "TIME", "GB_LIMIT", "GB_USED", "GB_MAX_USED",
            "GB_SWAP_USED", "PAGE_IN", "PAGE_OUT",
            paste0("core", seq_len(num.cores)))
        # the repairs above can leave these columns as character
        for (col in c("GB_MAX_USED", "GB_SWAP_USED", "PAGE_IN", "PAGE_OUT")) dat[[col]] <- as.numeric(dat[[col]])
        dat$GB_MAX_USED <- cummax(dat$GB_MAX_USED) # rolling max
        dat$PAGE_IN[dat$PAGE_IN < 0] <- 0  # where PAGE_IN or PAGE_OUT are not valid, set them to 0
        dat$PAGE_OUT[dat$PAGE_OUT < 0] <- 0
    }
    attr(dat, "file") <- file
    attr(dat, "node") <- node
    return(dat)
}


# Look at jobstats data.frame (with attributes) to see if there are
# usage patterns that should be flagged
#
# Only the first entry of job$data_list is examined (see the multiple-node
# TODO at the top of the file).  The cluster is taken from that data.frame's
# "cluster" attribute.  If job$maxmem is missing, NA or ".", the maximum
# memory used is taken from the jobstats data instead.
#
# Returns a character vector of flags, terse ("name:avail:used") or verbose
# sentences depending on do.verbose.  Memory-only flags are included only
# with do.memory or when a core/half/severe overbooking flag is also raised.
#
# If the cluster has no entry in node.types, or no node type is large enough,
# a note is written to stderr, the node_type_overbooked check is skipped and
# the remaining flags are still produced.
#
# No clue what aspects of PAGE_IN/PAGE_OUT should be highlighted
#
produceFlags <- function(job) {
    dat <- job$data_list[[1]]
    cluster <- attr(dat, "cluster")

    # Determine resources used

    num.cores <- ncol(dat) - first.core.column + 1
    core.columns <- first.core.column:ncol(dat)
    # Here, we use cpu_free as a tolerance for determining whether a core is
    # busy or not; the result is the number of busy cores in each time slot.
    count.busy.cores <- function(.busy_list) sum(.busy_list > cpu_free)
    core.busy <- apply(dat[, core.columns, drop = FALSE], 1, count.busy.cores)
    max.core.busy <- max(core.busy)
    max.GB.avail <- max(dat$GB_LIMIT)
    if (is.null(job$maxmem) || is.na(job$maxmem) || job$maxmem == '.')  # if we don't have independent maxmem info
        job$maxmem <- if (do.extended) max(dat$GB_MAX_USED) else max(dat$GB_USED)
    max.GB.used <- job$maxmem
    swap.used <- any(dat$GB_SWAP_USED > 0)
    core.GB <- max.GB.avail/num.cores
    core.mem.used <- round(max.core.busy * core.GB, 1)  # memory belonging to the busy cores

    # Determine whether resources misused

    # if on a non-default node, booking was unnecessary
    flag_node_type_overbooked <- FALSE
    # NA means "could not be determined"
    node.type.booked <- NA_character_
    node.type.needed <- NA_character_
    nt.default <- NA_character_

    node.types.missing <- paste0(cluster, " missing node types, cannot check if ",
                                 "node_type_overbooked")
    if (is.null(cluster) || !cluster %in% names(node.types))
        write(node.types.missing, stderr())
    else {
        nt <- node.types[[cluster]]
        default.type <- node.types[[paste0(cluster, ".default")]]
        if (!is.null(default.type)) nt.default <- default.type
        # TODO: retrieve node.type.booked from sacct or squeue
        # first (smallest) node type that is large enough; NA if none is
        node.type.booked <- names(nt)[which(max.GB.avail <= nt)[1]]
        node.type.needed <- names(nt)[which(max.GB.used <= nt)[1]]
        if (is.na(node.type.booked) || is.na(node.type.needed))
            write(node.types.missing, stderr())
        else if (node.type.booked != node.type.needed)
            flag_node_type_overbooked <- TRUE
    }
    fraction.cores.used <- max.core.busy/num.cores
    fraction.mem.used <- max.GB.used/max.GB.avail
    # some cores never used
    flag_cores_overbooked <- fraction.cores.used < flag_cores_overbooked.fraction
    # max mem used < booked
    flag_mem_overbooked <- num.cores > 1 && fraction.mem.used < flag_mem_overbooked.fraction
    # max mem used < mem in used cores
    flag_core_mem_overbooked <- num.cores > 1 && flag_cores_overbooked &&
        max.GB.used < (core.mem.used * flag_core_mem_overbooked.fraction)
    max.fraction.used <- max(fraction.cores.used, fraction.mem.used)
    # some fraction / half / one-quarter of all booked resources used
    flag_overbooked <- max.fraction.used < flag_overbooked.fraction
    flag_half_overbooked <- max.fraction.used < flag_half_overbooked.fraction
    flag_severely_overbooked <- max.fraction.used < flag_severely_overbooked.fraction
    flag_swap_used <- swap.used

    # Unset all the sub-node flags if node.type.needed is bigger than the default node type.
    # These flags are not selectable for anything bigger than the default node.
    # When the node types could not be determined the flags are left as computed.
    if (!is.na(node.type.needed) && !is.na(nt.default) &&
        node.type.needed != nt.default) {
        flag_cores_overbooked <- FALSE
        flag_mem_overbooked <- FALSE
        flag_core_mem_overbooked <- FALSE
        flag_overbooked <- FALSE
        flag_half_overbooked <- FALSE
        flag_severely_overbooked <- FALSE
    }

    include_memory_flag <- do.memory || any(flag_cores_overbooked, flag_half_overbooked,
        flag_severely_overbooked)

    # Format one flag.  With avail = NULL the message is used as is; otherwise
    # the available and used amounts are worked into it.
    createFlag <- function(avail, used, verbose_msg, msg) {
        if (do.verbose)
            if (is.null(avail)) verbose_msg
            else paste(avail, verbose_msg, "but", used, "used")
        else if (is.null(avail)) msg
            else paste0(msg, ":", avail, ":", used)
    }

    flag_list <- character(0)

    if (flag_overbooked) {
        if (do.verbose)
            flag <- paste0("Just ", round(max.fraction.used * 100, 0),
                           "% of booked core and RAM resources were used")
        else flag <- paste0("overbooked:", round(max.fraction.used * 100, 0), "%")
        flag_list <- c(flag_list, flag)
    }

    if (flag_half_overbooked)
        flag_list <- c(flag_list,
                       createFlag(NULL, NULL,
                                  "!! Less than half the booked cores and RAM were used",
                                  "!!half_overbooked"))

    if (flag_severely_overbooked)
        flag_list <- c(flag_list,
                       createFlag(NULL, NULL,
                                  "!! Less than one-quarter the booked cores and RAM were used",
                                  "!!severely_overbooked"))

    if (flag_swap_used)
        flag_list <- c(flag_list,
                       createFlag(NULL, NULL,
                                  "!! Swap space was used",
                                  "!!swap_used"))

    if (flag_node_type_overbooked)
        flag_list <- c(flag_list,
                       createFlag(node.type.booked,
                                  if (do.verbose)
                                      paste("resources of a", node.type.needed, "were")
                                  else node.type.needed,
                                  "node was booked",
                                  "node_type_overbooked"))

    if (flag_cores_overbooked)
        flag_list <- c(flag_list,
                       createFlag(num.cores, max.core.busy,
                                  "cores booked", "cores_overbooked"))

    if (flag_mem_overbooked && include_memory_flag)
        flag_list <- c(flag_list,
                       createFlag(max.GB.avail, max.GB.used,
                                  "GB RAM available in booked cores",
                                  "mem_overbooked"))

    if (flag_core_mem_overbooked && include_memory_flag)
        flag_list <- c(flag_list,
                       createFlag(core.mem.used, max.GB.used,
                                  "GB RAM available in used cores",
                                  "core_mem_overbooked"))

    return(flag_list)
}


# Form filename for plot: <cluster>-<project>-<user>-<jobid>.png
#
# A project or user that is missing (NULL or NA, as when plain jobstats files
# are given on the command line) or given as "." is replaced by "noproj" or
# "nouser" so the name never contains an empty segment.
#
plotFilename <- function(job) {
    orPlaceholder <- function(value, placeholder) {
        if (is.null(value) || length(value) == 0 || is.na(value[1]) || value[1] == ".")
            placeholder
        else value
    }
    paste0(paste(sep = "-", job$cluster,
                 orPlaceholder(job$project, "noproj"),
                 orPlaceholder(job$user, "nouser"),
                 job$jobid),
        ".png")
}


# Plot a full set of jobstats panels.  Could be just 1
#
# One panel is drawn per entry of job$node_list: a plotJobstatsPanel() for
# each node with data in job$data_list, and a "booked but unused" placeholder
# for the remaining nodes.  Above the panels go the job header, the
# user/project/jobname lines and the flags in red.
#
# job     job list as returned by gatherAllJobstats()
# do.png  when TRUE the plot goes to the PNG file named by plotFilename(job),
#         which is closed again on exit; when FALSE it is drawn on the
#         current device
#
# The previous par() settings are restored, and the PNG device opened here is
# closed, even when drawing stops with an error.
#
plotJobstats <- function(job, do.png = TRUE) {

    # calculate plot size and layout
    n.panels <- length(job$node_list)
    n.jobstats <- length(names(job$data_list))
    if (.debug)
        cat("n.panels =", n.panels, " n.jobstats =", n.jobstats, "\n")

    # two columns of panels as soon as there is more than one node
    n.columns <- if (n.panels > 1) 2 else 1
    n.rows <- if (n.panels > 1) as.integer(n.panels/2 + 0.5) else 1
    if (.debug)
        cat("n.columns =", n.columns, " n.rows =", n.rows, "\n")
    width <- 800
    top.height <- 200
    panel.height <- switch(as.character(n.panels),
                           "1" = 500, "2" = 250, n.rows * 250)
    height <- top.height + panel.height
    if (do.big.plot) {
        width <- width * 2
        height <- height * 2
    }
    if (.debug)
        cat("width =", width, " height =", height, "\n")

    if (do.png) {
        png(plotFilename(job), width = width, height = height)
        # close only the device opened here, whatever happens below
        png.device <- dev.cur()
        on.exit(dev.off(png.device), add = TRUE)
    }

    opa <- par(no.readonly = TRUE)
    # restore par() before the device is closed
    on.exit(par(opa), add = TRUE, after = FALSE)
    par(mfrow = c(n.rows, n.columns), oma = c(0, 0, top.lines, 0))
    # plot the individual panels for the nodes for which we have data
    for (i in seq_along(names(job$data_list))) {
        if (n.panels == 1) {
            this.row <- 0
            this.col <- 0
        } else {
            this.row <- as.integer(i/2 + 0.5)
            this.col <- if (i %% 2) 1 else 2
        }
        n <- names(job$data_list)[i]
        plotJobstatsPanel(job, n, this.row, this.col)
    }
    if (n.panels > n.jobstats) {
        # now plot the individual panels for the unused nodes
        for (n in job$node_list[(n.jobstats + 1):n.panels]) {
            plot.new()
            plot.window(xlim = c(0, 1), ylim = c(0, 1))
            text(0.5, 0.5, paste0("Node ", n, " booked but unused"))
        }
    }

    # Header lines: top line: jobid jobstate cluster endtime runtime
    txt <- paste(job$jobid, job$jobstate, "on", job$cluster)
    if (!is.null(job$endtime) && job$endtime != ".")
        txt <- paste(sep = main.sep, txt, paste("end:", job$endtime))
    if (!is.null(job$runtime) && job$runtime != ".")
        txt <- paste(sep = main.sep, txt, paste("runtime:", job$runtime))
    title.cex <- main.cex
    if (! is.null(job$timelimit_minutes)) # running job, shrink the main title a bit
        title.cex <- title.cex * 0.95
    mtext(txt, font = 2, cex = title.cex, line = main.line, side = 3, outer = TRUE)

    # with more than two rows of panels, move the lower header lines down one
    header.user.line <- user.line
    header.flags.line <- flags.line
    if (n.rows > 2) {
        header.user.line <- header.user.line - 1
        header.flags.line <- header.flags.line - 1
    }

    # User, project and jobname lines
    txt <- paste0("User: ", job$user, "\n", "Proj: ", job$project, "\n",
        "Jobname: ", job$jobname)
    mtext(txt, font = 2, cex = user.cex, line = header.user.line, side = 3,
        adj = user.adj, outer = TRUE)

    # Flags in red.  Flags passed in are in job$initial_flag_list, flags
    # determined here are in job$flag_list.  They are laid out flags.wrap to
    # a line.
    flags.list <- c(job$initial_flag_list, job$flag_list)
    flags.count <- length(flags.list)
    if (flags.count == 0) {
        flags.output <- "Flags: none"
    } else {
        line.starts <- seq(1, flags.count, by = flags.wrap)
        flag.lines <- vapply(line.starts, function(first) {
            last <- min(first + flags.wrap - 1, flags.count)
            paste(collapse = flags.sep, flags.list[first:last])
        }, character(1))
        flags.output <- paste(flag.lines, collapse = flags.line.sep)
    }
    mtext(flags.output, font = 1, cex = flags.cex, line = header.flags.line,
        side = 3, outer = TRUE, col = flags.col, adj = flags.adj)
}


# Plot a single jobstats panel
#
# job       job list; job$data_list[[node]] is the data.frame to plot
# node      name of the node, used to look up the data and as panel title
# this.row  row of this panel in the layout (0 for a single-panel plot);
#           only reported in debug output
# this.col  column of this panel (0 for a single-panel plot); the left axis
#           label is drawn for columns 0 and 1, the right one for 0 and 2
#
# The left axis shows memory in GB, the right axis core usage in percent
# rescaled onto the memory range.  With extended jobstats the running maximum
# memory is drawn as well and, with --paging, PAGE_IN/PAGE_OUT markers scaled
# so that the largest paging value sits at the top of the panel.
#
plotJobstatsPanel <- function(job, node = "unknown", this.row = 0, this.col = 0) {

    dat <- job$data_list[[node]]

    # if these are not defined in job, will be NULL
    this.timelimit <- job$timelimit_minutes
    # eventually will be node (thus panel)-specific with more info from jobstats
    this.maxmem <- if (is.null(job$maxmem) || is.na(job$maxmem) || job$maxmem == '.') NULL
        else job$maxmem

    if (.debug)
        cat('this.row = ', this.row, ' this.col = ', this.col, '\n')

    # must be computed before any helper columns are added to dat
    num.cores <- ncol(dat) - first.core.column + 1
    core.columns <- first.core.column:ncol(dat)

    # set up plot extents based on resource availability

    range.GB <- c(0, dat[1, "GB_LIMIT"])  # GB_LIMIT fixed for job duration
    range.cores <- c(0, num.cores * 100)
    swap.y <- range.GB[2]  # swap markers go along the top of the panel
    # set up traces based on resource usage, 5 minute sampling times
    dat$x <- ((dat$TIME - dat$TIME[1]) / 60) + sampling.plot.offset
    range.x <- c(0, max(ceiling(dat$x + sampling.window - sampling.plot.offset)))
    # a running job's x axis extends to its time limit
    if (! is.null(this.timelimit)) range.x[2] <- max(range.x[2], this.timelimit)
    core.at <- seq(range.cores[1], range.cores[2], by = 100)
    core.labels <- paste0(as.character(core.at), "%")
    # rescale a core percentage onto the memory (left) axis
    core.to.GB <- function(.x) return((.x/range.cores[2]) * range.GB[2])
    core.at <- core.to.GB(core.at)
    dat$core_ <- core.to.GB(apply(dat[, core.columns, drop = FALSE], 1, sum))
    # core_busy_ is handled by counting, in each row, the number of cores > cpu_free
    core.busy.cpu_free <- function(.busy_list) sum(.busy_list > cpu_free)
    dat$core_busy_ <- core.to.GB(apply(dat[, core.columns, drop=FALSE], 1, core.busy.cpu_free) * 100) # count of cores for each time, scaled
    dat$swap_ <- ifelse(dat$GB_SWAP_USED > 0, swap.y, NA)

    # if just one entry, then give it some body by doubling it to make a line
    if (nrow(dat) == 1) {
        dat <- rbind(dat, dat)
        dat$x[1] <- 0  # reset left x
        dat$x[2] <- sampling.window  # reset right x
    }

    par(mar = c(4, 4, 3, 5.5), las = 1, mgp = c(2.6, 0.5, 0), cex.main = 1.5, cex.axis = axis.cex, tcl = -0.4)

    with(dat, plot(x, GB_USED, xlim = range.x, ylim = range.GB, col = col.GB,
                   type = "l", lwd = lwd.GB, lty = lty.GB, bty = "U",
                   main = node, xlab = "", ylab = ""))
    if (do.extended) { # trace
        with(dat, lines(x, GB_MAX_USED, col = maxmem.col, lwd = maxmem.lwd, lty = maxmem.lty))
        if (do.paging) { # now PAGE_IN/PAGE_OUT
            # scale against the largest paging value seen, not the first sample
            page.values <- c(dat$PAGE_IN, dat$PAGE_OUT)
            page.values <- page.values[!is.na(page.values)]
            max.page <- if (length(page.values) > 0) max(page.values) else 0
            if (max.page > 0) {
                paging.to.GB <- function(.x) return((.x/max.page) * range.GB[2])
                # samples without paging are not drawn
                dat$pagein_ <- paging.to.GB(dat$PAGE_IN); dat$pagein_[dat$pagein_ == 0] <- NA
                dat$pageout_ <- paging.to.GB(dat$PAGE_OUT); dat$pageout_[dat$pageout_ == 0] <- NA
                with(dat, points(x, pagein_, col = paging.col, cex = paging.cex, pch = paging.in.pch))
                with(dat, points(x, pageout_, col = paging.col, cex = paging.cex, pch = paging.out.pch))
                # add annotation to the highest paging point of each kind and
                # report it on stderr; PAGE is the raw column, page the scaled one
                annotate.page.max <- function(x, page, PAGE) {
                    nm <- deparse(substitute(PAGE))
                    if (any(! is.na(page))) {
                        mx <- max(PAGE, na.rm = TRUE)
                        idx <- which(PAGE == mx)[1]
                        write(paste0("maximum ", nm, " is ", mx, " at position ", idx), stderr())
                        text(x=x[idx], y=page[idx], labels=PAGE[idx], pos=3, col=paging.col, cex=paging.cex)
                    }
                }
                with(dat, annotate.page.max(x, pagein_, PAGE_IN))
                with(dat, annotate.page.max(x, pageout_, PAGE_OUT))
            }
        }
        # add a line to the right side if the finishedjobinfo value is greater than the greatest GB_MAX_USED
        # this might occur if the job increases its max RSS after the final polling visit
        if (! is.null(this.maxmem) && this.maxmem > max(dat$GB_MAX_USED)) {
            with(dat, lines(c(max(x), range.x[2]), c(this.maxmem, this.maxmem), col = maxmem.col, lwd = maxmem.lwd, lty = maxmem.lty))
        }
    } else if (! is.null(this.maxmem)) {
        # plot a line based on finishedjobinfo data
        abline(h = this.maxmem, col = maxmem.col, lwd = maxmem.lwd, lty = maxmem.lty)
    }
    mtext(paste0("Wall minutes since job start (5 min resolution, max ", range.x[2], " min)"),
          side = 1, line = axis1.line, las = 0, col = col.GB, cex = axis1.cex)
    mtext(ifelse(this.col <= 1, paste0("GB used (max ", range.GB[2], " GB)"), ""),
          side = 2, line = axis1.line, las = 0, col = col.GB, cex = axis1.cex)
    with(dat, lines(x, core_, col = col.core, lwd = lwd.core, lty = lty.core))
    if (do.core.busy)
        with(dat, points(x, core_busy_, col = col.core, pch=pch.core.busy))
    axis(4, at = core.at, labels = core.labels, col.axis = col.core, cex.axis = axis.cex)

    if (this.col %in% c(0, 2)) # multipanel axes
        mtext(paste0("Core busy for ", num.cores, " cores (max ",
                     num.cores * 100, "%)"),
              side = 4, line = axis2.line, las = 0, col = col.core,
              cex = axis.cex)

    # If swap used, plot points and legend
    if (any(!is.na(dat$swap_))) {
        with(dat, points(x, swap_, pch = swap.pch, col = swap.col))
        legend("topleft", legend = "Swap\nused", bty = "n", pch = swap.pch,
               col = swap.col, text.col = swap.col)
    }
}


# collect contents of all jobstats files, produce summaries and flag lists
#
# Reads job$file_list[i] for node job$node_list[i] into job$data_list (keyed
# by node name, tagged with the "cluster" attribute), records in
# job$swap_used whether any node used swap, and stores the result of
# produceFlags() in job$flag_list.  Returns the updated job.
#
# Stops with a message if job$file_list is empty.
#
gatherAllJobstats <- function(job) {
    if (length(job$file_list) == 0)
        stop("no jobstats files to read for job ", job$jobid, call. = FALSE)
    for (i in seq_along(job$file_list)) {
        node <- job$node_list[i]
        # the core count is unknown when no core list was supplied, e.g. when
        # plain jobstats files were given on the command line
        ncores <- if (i <= length(job$core_list)) job$core_list[i] else NA
        job$data_list[[node]] <- readJobstatsFile(job$file_list[i], node, ncores)
        attr(job$data_list[[node]], "cluster") <- job$cluster
    }
    job$swap_used <- any(vapply(job$data_list,
                                function(x) any(x$GB_SWAP_USED > 0),
                                logical(1)))
    job$flag_list <- produceFlags(job)
    return(job)
}


# MAIN
#
# Parse the command line, read the jobstats files, print the flags to stdout
# (if do.flags) and produce the plot (if do.plot).  With --version as the
# first argument only the version is printed.  Without any arguments a usage
# line is written to stderr and the script exits with status 1.
#
args <- commandArgs(trailingOnly = TRUE)

#this.args <- c( "--fji", "16610111", "rackham", "COMPLETED", "rcvwijk", "snic2020-15-208", 'test4', "2020-11-29T18:44:48", "05:08:24", "overbooked:1%,!!half_overbooked,!!severely_overbooked,!!swap_used,cores_overbooked:13:0,mem_overbooked:98.5:0.5", "16", "0.5", "16", "r251", "/sw/share/slurm/rackham/extended_uppmax_jobstats/r251/16610111")
#args2 <- c("--cpu-free", "3", "--fji", "516782", "rackham", ".", ".", ".", ".", ".", ".", ".", ".", ".", "20", "r97", "/sw/share/slurm/rackham/extended_uppmax_jobstats/r97/516782")
#args = args2


if (.debug) { cat("after commandArgs()\n"); print(args) }

# for debugging, uncomment this.args above to override the command line
if (.debug && exists('this.args')) args <- this.args

# args[1] is NA without arguments, which would make the comparisons below fail
if (length(args) == 0) {
    write(paste0("usage: [--version] | [options] ",
                 "(--fji|--squeue|--slurm|--db) job-fields... | ",
                 "[options] jobstats-file..."),
          stderr())
    quit(save = "no", status = 1, runLast = FALSE)
}

if (args[1] == "--version") {
    write(.version, stdout())
    quit(save = "no", status = 0, runLast = FALSE)
}

job <- processArgs(args)

if (.debug) { cat("after processArgs()\n"); print(job) }

#debug(gatherAllJobstats)
#debug(readJobstatsFile)
#debug(plotJobstats)
#browser()

job <- gatherAllJobstats(job)

if (.debug) { cat("after gatherAllJobstats()\n"); print(job) }

# flags to stdout
if (do.flags)
    write(paste(collapse = ",", job$flag_list), stdout())

if (do.plot)
    plotJobstats(job)