R package fixes & improvements #455

Merged · 6 commits · Nov 29, 2023
15 changes: 12 additions & 3 deletions .github/actions/r_build/action.yml
@@ -23,8 +23,8 @@ runs:
name: Download and unpack Spark
shell: bash
run: |
wget -P /usr/spark-download/raw https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop2.7.tgz
tar zxvf /usr/spark-download/raw/spark-3.2.1-bin-hadoop2.7.tgz -C /usr/spark-download/unzipped
wget -P /usr/spark-download/raw https://archive.apache.org/dist/spark/spark-${{ matrix.spark }}/spark-${{ matrix.spark }}-bin-hadoop3.tgz
tar zxvf /usr/spark-download/raw/spark-${{ matrix.spark }}-bin-hadoop3.tgz -C /usr/spark-download/unzipped
- name: Create R environment
shell: bash
run: |
@@ -50,16 +50,25 @@ runs:
run: |
cd R
Rscript --vanilla generate_docs.R
env:
SPARK_HOME: /usr/spark-download/unzipped/spark-${{ matrix.spark }}-bin-hadoop3
- name: Build R package
shell: bash
run: |
cd R
Rscript --vanilla build_r_package.R
- name: Test R package
env:
SPARK_HOME: /usr/spark-download/unzipped/spark-${{ matrix.spark }}-bin-hadoop3
- name: Test SparkR package
shell: bash
run: |
cd R/sparkR-mosaic
Rscript --vanilla tests.R
- name: Test sparklyr package
shell: bash
run: |
cd R/sparklyr-mosaic
Rscript --vanilla tests.R
- name: Copy R artifacts to GH Actions run
shell: bash
run: |
1 change: 1 addition & 0 deletions R/.gitignore
@@ -1,2 +1,3 @@
**/.Rhistory
**/*.tar.gz
/sparklyr-mosaic/metastore_db/
9 changes: 1 addition & 8 deletions R/build_r_package.R
@@ -1,13 +1,6 @@
spark_location <- "/usr/spark-download/unzipped/spark-3.2.1-bin-hadoop2.7"
Sys.setenv(SPARK_HOME = spark_location)

spark_location <- Sys.getenv("SPARK_HOME")
library(SparkR, lib.loc = c(file.path(spark_location, "R", "lib")))


library(pkgbuild)
library(sparklyr)



build_mosaic_bindings <- function(){
## build package
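With the hardcoded Spark location removed, build_r_package.R now resolves Spark from the SPARK_HOME environment variable that the CI action exports. A minimal sketch of that lookup with an added guard for local runs (the guard and error message are illustrative assumptions, not lines from this PR):

spark_location <- Sys.getenv("SPARK_HOME")
if (!nzchar(spark_location)) {
  # fail fast when the variable is missing, e.g. when running outside CI
  stop("SPARK_HOME is not set; point it at an unpacked Spark distribution before building.")
}
library(SparkR, lib.loc = c(file.path(spark_location, "R", "lib")))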
95 changes: 50 additions & 45 deletions R/generate_R_bindings.R
@@ -8,14 +8,14 @@ library(methods)

parser <- function(x){
#split on left bracket to get name
splitted = strsplit(x, "(", fixed=T)[[1]]
splitted <- strsplit(x, "(", fixed=T)[[1]]
# extract function name
function_name = splitted[1]
function_name <- splitted[1]
# remove the trailing bracket
args = gsub( ")", '',splitted[2], fixed=T)
args = strsplit(args, ", ", fixed=T)[[1]]
args = lapply(args, function(x){strsplit(x, ": ", fixed=T)}[[1]])
output = list(
args <- gsub( ")", '',splitted[2], fixed=T)
args <- strsplit(args, ", ", fixed=T)[[1]]
args <- lapply(args, function(x){strsplit(x, ": ", fixed=T)}[[1]])
output <- list(
"function_name" = function_name
,"args"=args
)
@@ -24,8 +24,8 @@ parser <- function(x){

############################################################
build_generic <- function(input){
function_name = input$function_name
args = lapply(input$args, function(x){x[1]})
function_name <- input$function_name
args <- lapply(input$args, function(x){x[1]})
paste0(
'#\' @rdname ', function_name, '
setGeneric(
@@ -35,21 +35,9 @@ build_generic <- function(input){
')
}


build_generic2 <- function(input){
function_name = input$function_name
args = lapply(input$args, function(x){x[1]})
paste0(
'#\' @rdname ', function_name, '
setGeneric(
name="',function_name,'"
,def=function(',paste0(args, collapse=','), ') {standardGeneric("',function_name, '")}
)
')
}
############################################################
build_column_specifiers <- function(input){
args = lapply(input$args, function(x){x[1]})
args <- lapply(input$args, function(x){x[1]})
build_column_specifier <- function(arg){
return(paste0(arg, '@jc'))
}
@@ -62,29 +62,32 @@ build_column_specifiers <- function(input){
}
############################################################
build_method<-function(input){
function_name = input$function_name
arg_names = lapply(input$args, function(x){c(x[1])})
function_name <- input$function_name
arg_names <- lapply(input$args, function(x){c(x[1])})
#this handles converting non-Column arguments to their R equivalents
argument_parser <- function(x){
if(x[2] == 'Int'){
x[2] = "numeric"
x[2] <- "numeric"
}
else if(x[2] == 'String'){
x[2] = "character"
x[2] <- "character"
}
else if(x[2] == 'Double'){
x[2] = "numeric"
x[2] <- "numeric"
}
else if(x[2] == 'Boolean') {
x[2] <- "logical"
}
x
}
# convert scala type to R types
args = lapply(input$args, argument_parser)
args <- lapply(input$args, argument_parser)
# take a copy for building the docs
param_args = args
param_args <- args
# wrap the strings in speech marks
args = lapply(args, function(x){c(x[1], paste0("'", x[2], "'"))})
args <- lapply(args, function(x){c(x[1], paste0("'", x[2], "'"))})
# collapse down to a single string
args = lapply(args, function(x){paste0(x, collapse= ' = ')})
args <- lapply(args, function(x){paste0(x, collapse= ' = ')})
column_specifiers <- build_column_specifiers(input)
docstring <- paste0(
c(paste0(c("#'", function_name), collapse=" "),
@@ -116,48 +107,62 @@ build_method<-function(input){
############################################################
get_function_names <- function(scala_file_path){
#scala_file_path = "~/Documents/mosaic/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala"
scala_file_object = file(scala_file_path)
scala_file_object <- file(scala_file_path)

scala_file = readLines(scala_file_object)
scala_file <- readLines(scala_file_object)
closeAllConnections()
# find where the methods start
start_string = " object functions extends Serializable {"
start_index = grep(start_string, scala_file, fixed=T) + 1
start_string <- " object functions extends Serializable {"
start_index <- grep(start_string, scala_file, fixed=T) + 1
# find the methods end - will be the next curly bracket
# need to find where the matching end brace for the start string is located.
# counter starts at 1 as the start string includes the opening brace
brace_counter = 1
brace_counter <- 1

for(i in start_index : length(scala_file)){
# split the string into characters - returns a list so unlist it
line_characters <- unlist(strsplit(scala_file[i], ''))
# count the number of brace opens
n_opens = sum(grepl("{", line_characters, fixed=T))
n_opens <- sum(grepl("{", line_characters, fixed=T))
# count the number of brace closes
n_closes = sum(grepl("}", line_characters, fixed=T))
n_closes <- sum(grepl("}", line_characters, fixed=T))
# update the counter
brace_counter <- brace_counter + n_opens - n_closes
if (brace_counter == 0) break

}
methods_to_bind = scala_file[start_index:i]
methods_to_bind <- scala_file[start_index:i]
# remove any line that doesn't start with def
def_mask = grepl('\\s+def .*', methods_to_bind)
methods_to_bind = methods_to_bind[def_mask]
def_mask <- grepl('\\s+def .*', methods_to_bind)
methods_to_bind <- methods_to_bind[def_mask]
# parse the string to get just the function_name(input:type...) pattern
methods_to_bind = unlist(lapply(methods_to_bind, function(x){
methods_to_bind <- unlist(lapply(methods_to_bind, function(x){
substr(x
, regexpr("def ", x, fixed=T)[1]+4 # get the starting point to account for whitespace
, regexpr("): ", x, fixed=T)[1] # get the end point of where the return is.
)
}
))
sort(methods_to_bind, T)
sort_methods_by_argcount(methods_to_bind)
}

############################################################
sort_methods_by_argcount <- function(methods) {
# Extract the method name and count the commas in each signature (a proxy for argument count)
method_names <- sapply(strsplit(methods, "\\("), function(x) x[1])
argcount <- sapply(strsplit(methods, ","), function(x) length(x) - 1)

# Use order() to sort alphabetically by name, then by argument count
order_indices <- order(method_names, argcount)

# Return the sorted list
methods_sorted <- methods[order_indices]
return(methods_sorted)
}
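
For illustration, sort_methods_by_argcount() orders the extracted signatures alphabetically by method name and then by argument count, so overloads come out in a stable order; the signatures below are hypothetical examples, not lines from MosaicContext.scala:

methods <- c(
  "st_buffer(geom: Column, radius: Double, bufferStyleParameters: String)",
  "st_buffer(geom: Column, radius: Double)",
  "st_area(geom: Column)"
)
sort_methods_by_argcount(methods)
# returns, in order:
#   "st_area(geom: Column)"
#   "st_buffer(geom: Column, radius: Double)"
#   "st_buffer(geom: Column, radius: Double, bufferStyleParameters: String)"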

############################################################
build_sparklyr_mosaic_function <- function(input){
function_name = input$function_name
function_name <- input$function_name
paste0(

"#' ", function_name, "\n\n",
@@ -191,7 +196,7 @@ main <- function(scala_file_path){
##########################
##########################
# build sparkr functions
function_data = get_function_names(scala_file_path)
function_data <- get_function_names(scala_file_path)
parsed <- lapply(function_data, parser)


@@ -223,9 +228,9 @@ main <- function(scala_file_path){
# supplementary files
sparkr_supplementary_files <- c("sparklyr-mosaic/enableMosaic.R", "sparklyr-mosaic/sparkFunctions.R")
copy_supplementary_file(sparkr_supplementary_files, "sparklyr-mosaic/sparklyrMosaic/R/")

}


args <- commandArgs(trailingOnly = T)
if (length(args) != 1){
stop("Please provide the MosaicContext.scala file path to generate_sparkr_functions.R")
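Putting the pieces together, the generator extracts each name(arg: Type, ...) signature from MosaicContext.scala via get_function_names(), splits it with parser(), and maps the Scala argument types to R types in build_method() (Int and Double to numeric, String to character, and, new in this change, Boolean to logical). A small illustrative sketch with a hypothetical signature:

sig    <- "st_buffer(geom: Column, radius: Double)"  # hypothetical extracted signature
parsed <- parser(sig)
# parsed$function_name is "st_buffer"
# parsed$args is list(c("geom", "Column"), c("radius", "Double"))
# build_method(parsed) then generates the SparkR binding, with the Scala
# type Double documented as the R type "numeric"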
4 changes: 1 addition & 3 deletions R/generate_docs.R
@@ -1,6 +1,4 @@
spark_location <- "/usr/spark-download/unzipped/spark-3.2.1-bin-hadoop2.7"
Sys.setenv(SPARK_HOME = spark_location)

spark_location <- Sys.getenv("SPARK_HOME")
library(SparkR, lib.loc = c(file.path(spark_location, "R", "lib")))
library(roxygen2)

4 changes: 1 addition & 3 deletions R/install_deps.R
@@ -1,5 +1,3 @@
options(repos = c(CRAN = "https://packagemanager.posit.co/cran/__linux__/focal/latest"))

install.packages("pkgbuild")
install.packages("roxygen2")
install.packages("sparklyr")
install.packages(c("pkgbuild", "testthat", "roxygen2", "sparklyr"))