Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Streamline adding a new dataset #135

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,23 @@ mlmRev|guImmun|Immunization in Guatemala|2159|13
mlmRev|guPrenat|Prenatal care in Guatemala|2449|15
mlmRev|star|Student Teacher Achievement Ratio (STAR) project data|26796|18

# How to add datasets from a new package

**Step 1: add the data from the package**

1. In your clone of this repo, run `mkdir -p data/$PKG`
2. Go to the package's page on CRAN
3. Download the *source package* tarball
4. Extract one or more of the datasets from the tarball's `data` directory into the new `data/$PKG` directory

**Step 2: add the metadata**

Run the script:

$ scripts/update_doc_one.sh $PKG

Now it's ready for you to submit your pull request.

# Licensing and Intellectual Property

Following Vincent's lead, we have assumed that all of the data sets in this repository can be made available under the GPL-3 license. If you know that one of the datasets released here should not be released publicly or if you know that a data set can only be released under a different license, please contact me so that I can remove the data set from this repository.
60 changes: 30 additions & 30 deletions doc/datasets.csv
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,36 @@
"datasets","volcano","Topographic Information on Auckland's Maunga Whau Volcano",87,61
"datasets","warpbreaks","The Number of Breaks in Yarn during Weaving",54,3
"datasets","women","Average Heights and Weights for American Women",15,2
"gamair","aral","aral",488,4
"gamair","aral.bnd","aral.bnd",107,3
"gamair","bird","bird",25100,7
"gamair","blowfly","blowfly",180,3
"gamair","bone","bone",23,4
"gamair","brain","brain",1567,6
"gamair","cairo","cairo",3780,7
"gamair","chicago","chicago",5114,8
"gamair","chl","chl",13840,7
"gamair","co2s","co2s",507,4
"gamair","coast","coast",2091,3
"gamair","engine","engine",19,3
"gamair","gas","gas",60,804
"gamair","harrier","harrier",37,3
"gamair","hubble","hubble",24,4
"gamair","ipo","ipo",156,7
"gamair","mack","mack",634,17
"gamair","mackp","mackp",1162,9
"gamair","med","med",1476,25
"gamair","meh","meh",1476,24
"gamair","mpg","mpg",205,27
"gamair","prostate","prostate",654,530
"gamair","sitka","sitka",1027,6
"gamair","sole","sole",1575,8
"gamair","sperm.comp1","sperm.comp1",15,5
"gamair","sperm.comp2","sperm.comp2",24,11
"gamair","stomata","stomata",24,4
"gamair","swer","swer",2196,10
"gamair","wesdr","wesdr",669,5
"gamair","wine","wine",47,8
"gap","PD","A study of Parkinson's disease and APOE, LRRK2, SNCA makers",825,22
"gap","aldh2","ALDH2 markers and Alcoholism",263,18
"gap","apoeapoc","APOE/APOC1 markers and Alzheimer's",353,8
Expand Down Expand Up @@ -732,33 +762,3 @@
"vcd","VonBort","Von Bortkiewicz Horse Kicks Data",280,4
"vcd","WeldonDice","Weldon's Dice Data",11,2
"vcd","WomenQueue","Women in Queues",11,2
"gamair","aral.bnd","aral.bnd",107,3
"gamair","aral","aral",488,4
"gamair","bird","bird",25100,7
"gamair","blowfly","blowfly",180,3
"gamair","bone","bone",23,4
"gamair","brain","brain",1567,6
"gamair","cairo","cairo",3780,7
"gamair","chicago","chicago",5114,8
"gamair","chl","chl",13840,7
"gamair","co2s","co2s",507,4
"gamair","coast","coast",2091,3
"gamair","engine","engine",19,3
"gamair","gas","gas",60,804
"gamair","harrier","harrier",37,3
"gamair","hubble","hubble",24,4
"gamair","ipo","ipo",156,7
"gamair","mack","mack",634,17
"gamair","mackp","mackp",1162,9
"gamair","med","med",1476,25
"gamair","meh","meh",1476,24
"gamair","mpg","mpg",205,27
"gamair","prostate","prostate",654,530
"gamair","sitka","sitka",1027,6
"gamair","sole","sole",1575,8
"gamair","sperm.comp1","sperm.comp1",15,5
"gamair","sperm.comp2","sperm.comp2",24,11
"gamair","stomata","stomata",24,4
"gamair","swer","swer",2196,10
"gamair","wesdr","wesdr",669,5
"gamair","wine","wine",47,8
4 changes: 4 additions & 0 deletions scripts/update_doc_all.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Regenerate doc/packages.csv and doc/datasets.csv for every package
# under data/. Run from the repository root.
set -eu

R --no-save <<END
source("src/update_doc.r")
update_docs(".")
END
4 changes: 4 additions & 0 deletions scripts/update_doc_one.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Regenerate the doc/*.csv metadata entries for a single package.
# Usage: scripts/update_doc_one.sh PACKAGE
set -eu

# An empty "$1" would expand to a broken R call inside the heredoc,
# so fail early with a usage message instead.
if [ $# -ne 1 ]; then
    echo "usage: $0 PACKAGE" >&2
    exit 1
fi

R --no-save <<END
source("src/update_doc.r")
update_package_doc(".", "$1")
END
5 changes: 5 additions & 0 deletions src/dataset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ const Dataset_typedetect_rows = Dict{Tuple{String, String}, Union{Vector,Dict}}(
function dataset(package_name::AbstractString, dataset_name::AbstractString)
basename = joinpath(@__DIR__, "..", "data", package_name)

rdataname = joinpath(basename, string(dataset_name, ".RData"))
if isfile(rdataname)
return load(rdataname)[dataset_name]
end

rdaname = joinpath(basename, string(dataset_name, ".rda"))
if isfile(rdaname)
return load(rdaname)[dataset_name]
Expand Down
211 changes: 129 additions & 82 deletions src/update_doc.r
Original file line number Diff line number Diff line change
@@ -1,3 +1,129 @@
install_packages <- function(packages) {
  # Install `packages` from CRAN.
  #
  # packages: character vector of CRAN package names.
  #
  # Non-interactive R sessions often leave the CRAN repo unset ("@CRAN@"),
  # which would make install.packages() prompt for a mirror; pick one here.
  r <- getOption("repos")
  if (r["CRAN"] == "@CRAN@") {
    # https, not http: CRAN mirrors serve (and redirect to) TLS.
    r <- "https://cran.rstudio.com/"
  }
  suppressWarnings(install.packages(packages, repos = r))
}

# R2HTML is required for rendering the per-dataset HTML help pages.
install_packages(c("R2HTML"))
library(R2HTML)

write_doc <- function(package, dataset) {
  # Render the Rd help page for `dataset` (from installed `package`)
  # as HTML at doc/<package>/<dataset>.html.
  #
  # help()/.getHelpFile are wrapped in try() so a dataset without a help
  # topic does not abort the whole run; Rd2HTML() will then signal the
  # failure itself with a clearer message.
  help.ref <- try(help(eval(dataset), package = eval(package)), silent = TRUE)
  doc <- try(utils:::.getHelpFile(help.ref), silent = TRUE)
  # showWarnings = FALSE replaces the silent try(): an already-existing
  # directory is expected, not an error.
  dir.create(file.path("doc", package), showWarnings = FALSE, recursive = TRUE)
  fn_doc <- file.path("doc", package, paste0(dataset, ".html"))
  tools::Rd2HTML(doc, out = fn_doc)
}

do_package_update <- function(data_dir, package_df, old_dataset_df, dataset_df,
                              mismatched_dims_df, package) {
  # Append metadata rows for every dataset file under data/<package> to
  # package_df / dataset_df, writing an HTML help page per dataset.
  #
  # old_dataset_df supplies previous Rows/Columns values as a fallback for
  # objects whose dimensions are not meaningful as NROW/NCOL.
  # Returns a named list of the three accumulator data frames.
  suppressWarnings(library(package, character.only = TRUE))

  # Get package description
  pdesc <- packageDescription(package)
  new_row <- as.data.frame(pdesc[c("Package", "Title")],
                           stringsAsFactors = FALSE)
  package_df <- rbind(package_df, new_row)

  pdat <- data(package = package)$results

  datasets <- dir(path = file.path(data_dir, package))

  # Trim filenames to dataset names
  r <- "(.+)\\.(csv\\.gz|rda|RData)$"
  format_recognized <- grepl(r, datasets)
  if (!all(format_recognized)) {
    # paste() builds the message text; the original cat() printed to stdout
    # and contributed NULL to the stop() message.
    stop("Unrecognized formats:\n",
         paste(datasets[!format_recognized], collapse = "\n"))
  }
  datanames <- sub(r, "\\1", datasets)

  for (dataname in datanames) {
    # data(list = ...) is the programmatic form; avoids eval(parse()).
    data(list = dataname, package = package)
    ds <- get(dataname)

    write_doc(package, dataname)

    # Get dataset description
    title <- unique(pdat[, "Title"][pdat[, "Item"] == dataname])
    if (length(title) != 1) {
      stop(package, "/", dataname, " had ", length(title), " descriptions.")
    }

    # Old dims to fall back on
    old_row <- subset(old_dataset_df,
                      Dataset == dataname & Package == package)
    nr <- if (nrow(old_row)) old_row$Rows[[1]] else NA
    nc <- if (nrow(old_row)) old_row$Columns[[1]] else NA

    # Check against new dims when simple
    new_nr <- NROW(ds)
    new_nc <- NCOL(ds)
    # inherits() instead of class(ds) != "list": for multi-class objects
    # the latter is a length > 1 vector, which errors under && in R >= 4.3.
    if (!any(c("table", "ltraj") %in% class(ds)) &&
        !inherits(ds, "list") &&
        is.numeric(new_nr) &&
        is.numeric(new_nc)) {

      expected_cols <- c(nc, nc - 1)  # row.names sometimes included
      if (!is.numeric(nr) || !is.numeric(nc)) {
        nr <- new_nr
        nc <- new_nc
      } else if (new_nr != nr || !(new_nc %in% expected_cols)) {
        # class(ds)[[1]]: multi-class objects would otherwise recycle into
        # several rows of the mismatch report.
        new_row <- data.frame(Package = package,
                              Dataset = dataname,
                              Class = class(ds)[[1]],
                              OldRows = nr,
                              OldColumns = nc,
                              NewRows = new_nr,
                              NewColumns = new_nc)

        mismatched_dims_df <- rbind(mismatched_dims_df, new_row)
      }
    }

    new_row <- data.frame(Package = package,
                          Dataset = dataname,
                          Title = title,
                          Rows = nr,
                          Columns = nc,
                          stringsAsFactors = FALSE)

    dataset_df <- rbind(dataset_df, new_row)
  }
  list(package_df = package_df,
       dataset_df = dataset_df,
       mismatched_dims_df = mismatched_dims_df)
}

update_package_doc <- function(pkg_dir, package) {
  # Refresh doc/packages.csv and doc/datasets.csv entries for a single
  # package, installing it first if necessary.
  #
  # pkg_dir: repository root (contains data/ and doc/).
  # package: name of the CRAN package whose datasets live in data/<package>.
  # Returns a data frame describing datasets whose dimensions changed
  # (empty when everything matched).
  data_dir <- file.path(pkg_dir, "data")
  doc_dir <- file.path(pkg_dir, "doc")

  package_fn <- file.path(doc_dir, "packages.csv")
  dataset_fn <- file.path(doc_dir, "datasets.csv")

  package_df <- read.csv(package_fn)
  dataset_df <- read.csv(dataset_fn)

  install_packages(c(package))

  # Keep the full previous table for the old-dimensions fallback, but drop
  # this package's rows from the accumulators: do_package_update() appends
  # fresh rows, so leaving the old ones in would duplicate the package on
  # every re-run.
  old_dataset_df <- dataset_df
  package_df <- subset(package_df, Package != package)
  dataset_df <- subset(dataset_df, Package != package)

  mismatched_dims_df <- data.frame()
  dfs <- do_package_update(data_dir, package_df, old_dataset_df, dataset_df,
                           mismatched_dims_df, package)
  package_df <- dfs$package_df
  dataset_df <- dfs$dataset_df
  mismatched_dims_df <- dfs$mismatched_dims_df

  package_df <- sort_upper_first(clean(package_df), c("Package"))
  dataset_df <- sort_upper_first(clean(dataset_df), c("Package", "Dataset"))

  write(package_df, package_fn)
  write(dataset_df, dataset_fn)
  return(mismatched_dims_df)
}

update_docs <- function(pkg_dir) {
data_dir <- file.path(pkg_dir, "data")
doc_dir <- file.path(pkg_dir, "doc")
Expand All @@ -16,90 +142,11 @@ update_docs <- function(pkg_dir) {
# Install any missing packages
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if (length(new_packages)) {
# Pick a decent mirror if none set
r <- getOption("repos")
if (r["CRAN"] == "@CRAN@") {
r <- "http://cran.rstudio.com/"
}
suppressWarnings({install.packages(new_packages, repos = r)})
install_packages(new_packages)
}

for (package in packages) {
suppressWarnings({library(package, character.only = TRUE)})

# Get package description
pdesc <- packageDescription(package)
new_row <- as.data.frame(pdesc[c("Package", "Title")],
stringsAsFactors = FALSE)
package_df <- rbind(package_df, new_row)

pdat <- data(package=package)$results

datasets <- dir(path = file.path(data_dir, package))

# Trim filenames to dataset names
r <- "(.+)\\.(csv\\.gz|rda)$"
format_recognized <- grepl(r, datasets)
if (!(all(format_recognized))) {
stop("Unrecognized formats:\n",
cat(datasets[!format_recognized], sep = "\n"))
}
datanames <- sub(r, "\\1", datasets)

for (dataname in datanames) {
eval(parse(text = paste0("data(", dataname, ", package=package)")))
ds <- get(dataname)

# TODO: Write rst and html doc per dataset

# Get dataset description
title <- unique(pdat[, "Title"][pdat[, "Item"] == dataname])
if (length(title) != 1) {
stop(package, "/", title, " had ", length(title), " descriptions.")
}

# Old dims to fall back on
old_row = subset(old_dataset_df,
Dataset == dataname & Package == package)
nr <- if (nrow(old_row)) old_row$Rows[[1]] else NA
nc <- if (nrow(old_row)) old_row$Columns[[1]] else NA

# Check against new dims when simple
new_nr <- NROW(ds)
new_nc <- NCOL(ds)
if (!(any(c("table", "ltraj") %in% class(ds))) &&
class(ds) != "list" &&
is.numeric(new_nr) &&
is.numeric(new_nc)) {

expected_cols <- c(nc, nc - 1) # row.names sometimes included
if (!is.numeric(nr) || !is.numeric(nc)) {
nr <- new_nr
nc <- new_nc
} else if (new_nr != nr || !(new_nc %in% expected_cols)) {

new_row <- data.frame(Package = package,
Dataset = dataname,
Class = class(ds),
OldRows = nr,
OldColumns = nc,
NewRows = new_nr,
NewColumns = new_nc)

mismatched_dims_df <- rbind(mismatched_dims_df, new_row)

}
}

new_row <- data.frame(Package = package,
Dataset = dataname,
Title = title,
Rows = nr,
Columns = nc,
stringsAsFactors = FALSE)

dataset_df <- rbind(dataset_df, new_row)
}
do_package_update(data_dir, package_df, old_dataset_df, dataset_df, mismatched_dims_df, package)
}

stopifnot(nrow(dataset_df) > 0)
Expand All @@ -114,7 +161,7 @@ update_docs <- function(pkg_dir) {
}

write <- function(df, fn) {
  # Serialize a data frame to CSV at path `fn`, without row names.
  # qmethod = "double" doubles embedded quotes (standard CSV escaping per
  # RFC 4180) so read.csv() round-trips quoted fields correctly.
  # NOTE(review): this masks base::write() within this file's scope;
  # callers here rely on the name, so it is kept.
  write.table(df, file = fn, sep = ",", qmethod = "double", row.names = FALSE)
}

clean <- function(df) {
Expand Down
Loading