diff --git a/Project.toml b/Project.toml index 1c12e9d..ac48a5b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,12 +1,13 @@ name = "RDatasets" uuid = "ce6b1742-4840-55fa-b093-852dadbb1d8b" -version = "0.7.7" +version = "0.8.0" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" +Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" @@ -16,7 +17,7 @@ CSV = "0.5, 0.6, 0.7, 0.8, 0.9, 0.10" CodecZlib = "0.4, 0.5, 0.6, 0.7" DataFrames = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22, 1" FileIO = "1" -RData = "0.5, 0.6, 0.7, 0.8" +RData = "0.5, 0.6, 0.7, 0.8, 1" Reexport = "0.2, 1.0" julia = "1" diff --git a/README.md b/README.md index ce0927b..fe345f3 100644 --- a/README.md +++ b/README.md @@ -5,15 +5,21 @@ The RDatasets package provides an easy way for Julia users to experiment with most of the standard data sets that are available in the core of R as well as datasets included with many of R's most popular packages. This package is essentially a simplistic port of the Rdatasets repo created by Vincent Arelbundock, who conveniently gathered data sets from many of the standard R packages in one convenient location on GitHub at https://github.com/vincentarelbundock/Rdatasets In order to load one of the data sets included in the RDatasets package, you will need to have the `DataFrames` package installed. 
This package is automatically installed as a dependency of the `RDatasets` package if you install `RDatasets` as follows: - - Pkg.add("RDatasets") - +```julia +Pkg.add("RDatasets") +``` After installing the RDatasets package, you can then load data sets using the `dataset()` function, which takes the name of a package and a data set as arguments: - - using RDatasets - iris = dataset("datasets", "iris") - neuro = dataset("boot", "neuro") - +```julia +using RDatasets +iris = dataset("datasets", "iris") +neuro = dataset("boot", "neuro") +``` +You can also get descriptions of the datasets by calling `RDatasets.description`: +```julia +RDatasets.description("datasets", "iris") +# or +RDatasets.description(iris) # only use this on DataFrames returned from `dataset`! +``` # Data Sets The `RDatasets.packages()` function returns a table of represented R packages: @@ -74,6 +80,23 @@ mlmRev|guImmun|Immunization in Guatemala|2159|13 mlmRev|guPrenat|Prenatal care in Guatemala|2449|15 mlmRev|star|Student Teacher Achievement Ratio (STAR) project data|26796|18 +# How to add datasets from a new package + +**Step 1: add the data from the package** + + 1. In your clone of this repo `mkdir -p data/$PKG` + 2. Go to CRAN + 3. Download the *source package* + 4. Extract one or more of the datasets in the `data` directory into the new directory + +**Step 2: add the metadata** + +Run the script: + + $ scripts/update_doc_one.sh $PKG + +Now it's ready for you to submit your pull request. + # Licensing and Intellectual Property Following Vincent's lead, we have assumed that all of the data sets in this repository can be made available under the GPL-3 license. If you know that one of the datasets released here should not be released publicly or if you know that a data set can only be released under a different license, please contact me so that I can remove the data set from this repository. 
diff --git a/doc/datasets.csv b/doc/datasets.csv index b8081be..8de3ed7 100644 --- a/doc/datasets.csv +++ b/doc/datasets.csv @@ -506,6 +506,36 @@ "datasets","volcano","Topographic Information on Auckland's Maunga Whau Volcano",87,61 "datasets","warpbreaks","The Number of Breaks in Yarn during Weaving",54,3 "datasets","women","Average Heights and Weights for American Women",15,2 +"gamair","aral","aral",488,4 +"gamair","aral.bnd","aral.bnd",107,3 +"gamair","bird","bird",25100,7 +"gamair","blowfly","blowfly",180,3 +"gamair","bone","bone",23,4 +"gamair","brain","brain",1567,6 +"gamair","cairo","cairo",3780,7 +"gamair","chicago","chicago",5114,8 +"gamair","chl","chl",13840,7 +"gamair","co2s","co2s",507,4 +"gamair","coast","coast",2091,3 +"gamair","engine","engine",19,3 +"gamair","gas","gas",60,804 +"gamair","harrier","harrier",37,3 +"gamair","hubble","hubble",24,4 +"gamair","ipo","ipo",156,7 +"gamair","mack","mack",634,17 +"gamair","mackp","mackp",1162,9 +"gamair","med","med",1476,25 +"gamair","meh","meh",1476,24 +"gamair","mpg","mpg",205,27 +"gamair","prostate","prostate",654,530 +"gamair","sitka","sitka",1027,6 +"gamair","sole","sole",1575,8 +"gamair","sperm.comp1","sperm.comp1",15,5 +"gamair","sperm.comp2","sperm.comp2",24,11 +"gamair","stomata","stomata",24,4 +"gamair","swer","swer",2196,10 +"gamair","wesdr","wesdr",669,5 +"gamair","wine","wine",47,8 "gap","PD","A study of Parkinson's disease and APOE, LRRK2, SNCA makers",825,22 "gap","aldh2","ALDH2 markers and Alcoholism",263,18 "gap","apoeapoc","APOE/APOC1 markers and Alzheimer's",353,8 @@ -732,33 +762,3 @@ "vcd","VonBort","Von Bortkiewicz Horse Kicks Data",280,4 "vcd","WeldonDice","Weldon's Dice Data",11,2 "vcd","WomenQueue","Women in Queues",11,2 -"gamair","aral.bnd","aral.bnd",107,3 -"gamair","aral","aral",488,4 -"gamair","bird","bird",25100,7 -"gamair","blowfly","blowfly",180,3 -"gamair","bone","bone",23,4 -"gamair","brain","brain",1567,6 -"gamair","cairo","cairo",3780,7 -"gamair","chicago","chicago",5114,8 
-"gamair","chl","chl",13840,7 -"gamair","co2s","co2s",507,4 -"gamair","coast","coast",2091,3 -"gamair","engine","engine",19,3 -"gamair","gas","gas",60,804 -"gamair","harrier","harrier",37,3 -"gamair","hubble","hubble",24,4 -"gamair","ipo","ipo",156,7 -"gamair","mack","mack",634,17 -"gamair","mackp","mackp",1162,9 -"gamair","med","med",1476,25 -"gamair","meh","meh",1476,24 -"gamair","mpg","mpg",205,27 -"gamair","prostate","prostate",654,530 -"gamair","sitka","sitka",1027,6 -"gamair","sole","sole",1575,8 -"gamair","sperm.comp1","sperm.comp1",15,5 -"gamair","sperm.comp2","sperm.comp2",24,11 -"gamair","stomata","stomata",24,4 -"gamair","swer","swer",2196,10 -"gamair","wesdr","wesdr",669,5 -"gamair","wine","wine",47,8 diff --git a/scripts/update_doc_all.sh b/scripts/update_doc_all.sh new file mode 100755 index 0000000..0d8ff1a --- /dev/null +++ b/scripts/update_doc_all.sh @@ -0,0 +1,4 @@ +R --no-save <`, `

`, `

`, `

`, `

`, `
"""
    description_to_markdown(string)

Convert the HTML text of a dataset help page into Markdown.

## Conversions

- `<h1>` ... `<h9>` (any single digit) -> that many `#` characters
- `<title>` -> `#`
- `<code>` -> `` `code` ``
- `<pre>` -> "```R\\npre\\n```"
- `<EM>` -> `*EM*` (tag name matched case-insensitively)
- `<B>` -> `**B**` (tag name matched case-insensitively)
- `–` -> `-`

An element is only converted when its content contains no `<`, i.e. no nested
tag. Every remaining tag is then deleted and runs of two or more newlines are
collapsed to exactly two.

## TODOs

- Tables
- Links
- Images
"""
function description_to_markdown(string)
    html_header_regex = r"<h(?'hnum'\d)>(?'content'[^<]+)<\/h\g'hnum'>"
    title_matcher_regex = r"<title>(?'content'[^<]+)<\/title>"
    code_matcher_regex = r"<code>(?'content'[^<]+)<\/code>"
    pre_matcher_regex = r"<pre>(?'content'[^<]+)<\/pre>"
    emph_matcher_regex = r"<(?i)EM(?-i)>(?'content'[^<]+)<\/(?i)EM(?-i)>"
    b_matcher_regex = r"<(?i)B(?-i)>(?'content'[^<]+)<\/(?i)B(?-i)>"

    # `replace` hands its callback the matched text rather than the
    # `RegexMatch`, so the pattern is applied a second time to reach the
    # named `content` group.
    inner_text(rx, fragment) = match(rx, fragment)[:content]
    surround(rx, before, after) =
        rx => (fragment -> before * inner_text(rx, fragment) * after)

    # Turn `<hN>text</hN>` into a Markdown heading with N `#` characters.
    # NOTE: the parameter `string` shadows `Base.string` in this scope, so
    # concatenation is done with `*`.
    function heading(fragment)
        hit = match(html_header_regex, fragment)
        if any(isnothing, hit.captures[1:2])
            return fragment
        end
        depth = parse(Int, hit[:hnum])
        return "\n" * "#"^depth * " " * hit[:content] * "\n\n"
    end

    converted = replace(
        string,
        html_header_regex => heading,
        surround(title_matcher_regex, "# ", ""),
        surround(code_matcher_regex, "`", "`"),
        surround(pre_matcher_regex, "\n```R\n", "\n```\n"),
        surround(emph_matcher_regex, "*", "*"),
        surround(b_matcher_regex, "**", "**"),
        # NOTE(review): a literal en dash is replaced here; if the help files
        # carry it as an HTML entity instead, this pair never fires - confirm.
        "–" => "-",
    )

    # Drop whatever tags are left, then squeeze blank-line runs.
    without_tags = replace(converted, r"<[^>]*>" => "")
    return replace(without_tags, r"\n\n+" => "\n\n")
end
# Install `packages` from CRAN, supplying a default mirror when the session
# has none configured.
#
# packages: character vector of package names.
#
# A missing or NULL `repos` option, or one without a "CRAN" entry, is treated
# the same as the "@CRAN@" placeholder. Other configured repositories are
# kept. Warnings from install.packages() are suppressed on purpose
# (best-effort install).
install_packages <- function(packages) {
  repos <- getOption("repos")
  cran <- if (is.null(repos)) NA_character_ else repos["CRAN"]
  if (is.na(cran) || cran == "@CRAN@") {
    repos["CRAN"] <- "http://cran.rstudio.com/"
  }
  suppressWarnings({
    install.packages(packages, repos = repos)
  })
}

# Render the help page of `dataset` (from `package`) to
# <doc_dir>/<package>/<dataset>.html.
#
# package, dataset: length-one character vectors.
# doc_dir: root output directory; defaults to "doc" relative to the working
#          directory.
#
# Returns the value of tools::Rd2HTML(), or invisible NULL (with a warning)
# when the help page cannot be located or read.
write_doc <- function(package, dataset, doc_dir = "doc") {
  # help() deparses a bare name given as `package`, so `package = package`
  # would look for a package literally called "package"; eval() forces the
  # value to be used instead.
  help_ref <- try(help(eval(dataset), package = eval(package)), silent = TRUE)
  rd <- if (inherits(help_ref, "try-error")) {
    help_ref
  } else {
    try(utils:::.getHelpFile(help_ref), silent = TRUE)
  }
  if (inherits(rd, "try-error")) {
    warning("No help page could be read for ", package, "/", dataset,
            "; no HTML written.", call. = FALSE)
    return(invisible(NULL))
  }

  out_dir <- file.path(doc_dir, package)
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
  tools::Rd2HTML(rd, out = file.path(out_dir, paste0(dataset, ".html")))
}

# Collect package and dataset metadata for one package and write the HTML
# documentation of each of its datasets.
#
# data_dir:           directory holding one sub-directory per package.
# package_df:         accumulator of package rows (Package, Title).
# old_dataset_df:     previously recorded dataset rows; used only to look up
#                     the old Rows/Columns of a dataset.
# dataset_df:         accumulator of dataset rows.
# mismatched_dims_df: accumulator of datasets whose dimensions differ from
#                     the recorded ones.
# package:            package name (length-one character).
# doc_dir:            root directory for the HTML pages (passed to
#                     write_doc()).
#
# Returns list(package_df, dataset_df, mismatched_dims_df) with the new rows
# appended. Stops when a file in the package directory has an unrecognized
# extension or a dataset does not have exactly one title.
do_package_update <- function(data_dir, package_df, old_dataset_df,
                              dataset_df, mismatched_dims_df, package,
                              doc_dir = "doc") {
  suppressWarnings({
    library(package, character.only = TRUE)
  })

  # Get package description
  pdesc <- packageDescription(package)
  package_row <- as.data.frame(pdesc[c("Package", "Title")],
                               stringsAsFactors = FALSE)
  package_df <- rbind(package_df, package_row)

  pdat <- data(package = package)$results
  datasets <- dir(path = file.path(data_dir, package))

  # Trim filenames to dataset names
  r <- "(.+)\\.(csv\\.gz|rda|RData)$"
  format_recognized <- grepl(r, datasets)
  if (!all(format_recognized)) {
    # paste(), not cat(): cat() prints to stdout and contributes nothing to
    # the condition message.
    stop("Unrecognized formats:\n",
         paste(datasets[!format_recognized], collapse = "\n"))
  }
  # A dataset stored in two formats must only be processed once.
  datanames <- unique(sub(r, "\\1", datasets))

  dataset_rows <- vector("list", length(datanames))
  mismatch_rows <- list()

  for (i in seq_along(datanames)) {
    dataname <- datanames[[i]]

    # Load into a scratch environment so a dataset whose name matches one of
    # this function's locals cannot shadow or be shadowed by it. The global
    # environment is the parent so lazy-loaded data of the attached package
    # is still found.
    scratch <- new.env(parent = globalenv())
    data(list = dataname, package = package, envir = scratch)
    ds <- get(dataname, envir = scratch)

    write_doc(package, dataname, doc_dir)

    # Get dataset description
    title <- unique(pdat[, "Title"][pdat[, "Item"] == dataname])
    if (length(title) != 1) {
      stop(package, "/", dataname, " had ", length(title), " descriptions.")
    }

    # Old dims to fall back on
    old_idx <- which(old_dataset_df$Dataset == dataname &
                       old_dataset_df$Package == package)
    has_old <- length(old_idx) > 0
    nr <- if (has_old) old_dataset_df$Rows[[old_idx[1]]] else NA
    nc <- if (has_old) old_dataset_df$Columns[[old_idx[1]]] else NA

    # Check against new dims when simple
    new_nr <- NROW(ds)
    new_nc <- NCOL(ds)
    ds_class <- class(ds)
    is_simple <- !any(c("table", "ltraj", "list") %in% ds_class) &&
      is.numeric(new_nr) &&
      is.numeric(new_nc)

    if (is_simple) {
      no_old_dims <- !is.numeric(nr) || !is.numeric(nc) ||
        is.na(nr) || is.na(nc)
      if (no_old_dims) {
        nr <- new_nr
        nc <- new_nc
      } else if (new_nr != nr || !(new_nc %in% c(nc, nc - 1))) {
        # nc - 1 is tolerated because row.names are sometimes included.
        mismatch_rows[[length(mismatch_rows) + 1L]] <- data.frame(
          Package = package,
          Dataset = dataname,
          # Collapse so a multi-class object still yields a single row.
          Class = paste(ds_class, collapse = "/"),
          OldRows = nr,
          OldColumns = nc,
          NewRows = new_nr,
          NewColumns = new_nc,
          stringsAsFactors = FALSE
        )
      }
    }

    dataset_rows[[i]] <- data.frame(Package = package,
                                    Dataset = dataname,
                                    Title = title,
                                    Rows = nr,
                                    Columns = nc,
                                    stringsAsFactors = FALSE)
  }

  dataset_df <- do.call(rbind, c(list(dataset_df), dataset_rows))
  mismatched_dims_df <- do.call(
    rbind, c(list(mismatched_dims_df), mismatch_rows)
  )

  list(package_df = package_df,
       dataset_df = dataset_df,
       mismatched_dims_df = mismatched_dims_df)
}

# Refresh doc/packages.csv, doc/datasets.csv and the HTML pages for a single
# package.
#
# pkg_dir: root of the repository checkout (contains data/ and doc/).
# package: name of the R package whose datasets live in data/<package>.
#
# Returns the data frame of datasets whose dimensions differ from the ones
# previously recorded.
update_package_doc <- function(pkg_dir, package) {
  data_dir <- file.path(pkg_dir, "data")
  doc_dir <- file.path(pkg_dir, "doc")

  package_fn <- file.path(doc_dir, "packages.csv")
  dataset_fn <- file.path(doc_dir, "datasets.csv")

  package_df <- read.csv(package_fn, stringsAsFactors = FALSE)
  dataset_df <- read.csv(dataset_fn, stringsAsFactors = FALSE)

  # Same policy as update_docs(): only install what is missing.
  if (!(package %in% rownames(installed.packages()))) {
    install_packages(package)
  }

  # The full table stays available for the old-dimension lookup, but rows
  # already recorded for this package are dropped from the accumulators so
  # re-running does not append them a second time.
  # NOTE(review): clean() is defined elsewhere; whether it would have removed
  # such duplicates is not visible here - confirm.
  other_packages <- package_df[!(package_df$Package %in% package), ,
                               drop = FALSE]
  other_datasets <- dataset_df[!(dataset_df$Package %in% package), ,
                               drop = FALSE]

  dfs <- do_package_update(data_dir, other_packages, dataset_df,
                           other_datasets, data.frame(), package,
                           doc_dir = doc_dir)

  package_df <- sort_upper_first(clean(dfs$package_df), c("Package"))
  dataset_df <- sort_upper_first(clean(dfs$dataset_df),
                                 c("Package", "Dataset"))

  write(package_df, package_fn)
  write(dataset_df, dataset_fn)
  dfs$mismatched_dims_df
}
r <- "http://cran.rstudio.com/" - } - suppressWarnings({install.packages(new_packages, repos = r)}) + install_packages(new_packages) } for (package in packages) { - suppressWarnings({library(package, character.only = TRUE)}) - - # Get package description - pdesc <- packageDescription(package) - new_row <- as.data.frame(pdesc[c("Package", "Title")], - stringsAsFactors = FALSE) - package_df <- rbind(package_df, new_row) - - pdat <- data(package=package)$results - - datasets <- dir(path = file.path(data_dir, package)) - - # Trim filenames to dataset names - r <- "(.+)\\.(csv\\.gz|rda)$" - format_recognized <- grepl(r, datasets) - if (!(all(format_recognized))) { - stop("Unrecognized formats:\n", - cat(datasets[!format_recognized], sep = "\n")) - } - datanames <- sub(r, "\\1", datasets) - - for (dataname in datanames) { - eval(parse(text = paste0("data(", dataname, ", package=package)"))) - ds <- get(dataname) - - # TODO: Write rst and html doc per dataset - - # Get dataset description - title <- unique(pdat[, "Title"][pdat[, "Item"] == dataname]) - if (length(title) != 1) { - stop(package, "/", title, " had ", length(title), " descriptions.") - } - - # Old dims to fall back on - old_row = subset(old_dataset_df, - Dataset == dataname & Package == package) - nr <- if (nrow(old_row)) old_row$Rows[[1]] else NA - nc <- if (nrow(old_row)) old_row$Columns[[1]] else NA - - # Check against new dims when simple - new_nr <- NROW(ds) - new_nc <- NCOL(ds) - if (!(any(c("table", "ltraj") %in% class(ds))) && - class(ds) != "list" && - is.numeric(new_nr) && - is.numeric(new_nc)) { - - expected_cols <- c(nc, nc - 1) # row.names sometimes included - if (!is.numeric(nr) || !is.numeric(nc)) { - nr <- new_nr - nc <- new_nc - } else if (new_nr != nr || !(new_nc %in% expected_cols)) { - - new_row <- data.frame(Package = package, - Dataset = dataname, - Class = class(ds), - OldRows = nr, - OldColumns = nc, - NewRows = new_nr, - NewColumns = new_nc) - - mismatched_dims_df <- 
# Save `df` to the file `fn` as comma-separated text with a header row and no
# row names. Embedded double quotes are written doubled ("").
# NOTE: this definition masks base::write() for code that sources this file.
write <- function(df, fn) {
  # write.csv() fixes sep = "," and qmethod = "double", which is exactly the
  # write.table() configuration this helper stands for.
  utils::write.csv(df, file = fn, row.names = FALSE)
}