From fa0460915de51890aae8d0353f1feae55b03d758 Mon Sep 17 00:00:00 2001 From: Frankie Robertson Date: Mon, 29 Aug 2022 17:53:34 +0300 Subject: [PATCH 1/3] Streamline adding a new dataset * Add instructions to README for adding a new dataset * Add scripts to update the dataset metadata * Add update_doc method to only add a single dataset * Add HTML documentation generation to update_doc * Change update_doc to correctly round trip quotes in the metadata CSV --- README.md | 17 +++ scripts/update_doc_all.sh | 4 + scripts/update_doc_one.sh | 4 + src/update_doc.r | 211 +++++++++++++++++++++++--------------- 4 files changed, 154 insertions(+), 82 deletions(-) create mode 100755 scripts/update_doc_all.sh create mode 100755 scripts/update_doc_one.sh diff --git a/README.md b/README.md index ce0927bd..2003f5b2 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,23 @@ mlmRev|guImmun|Immunization in Guatemala|2159|13 mlmRev|guPrenat|Prenatal care in Guatemala|2449|15 mlmRev|star|Student Teacher Achievement Ratio (STAR) project data|26796|18 +# How to add datasets from a new package + +**Step 1: add the data from the package** + + 1. In your clone of this repo `mkdir -p data/$PKG` + 2. Go to CRAN + 3. Download the *source package* + 4. Extract one or more of the datasets in the `data` directory into the new directory + +**Step 2: add the metadata** + +Run the script: + + $ scripts/update_doc_one.sh $PKG + +Now it's ready for you to submit your pull request. + # Licensing and Intellectual Property Following Vincent's lead, we have assumed that all of the data sets in this repository can be made available under the GPL-3 license. If you know that one of the datasets released here should not be released publicly or if you know that a data set can only be released under a different license, please contact me so that I can remove the data set from this repository. diff --git a/scripts/update_doc_all.sh b/scripts/update_doc_all.sh new file mode 100755 index 00000000..0d8ff1a1 --- /dev/null +++ b/scripts/update_doc_all.sh @@ -0,0 +1,4 @@ +R --no-save < 0) @@ -114,7 +161,7 @@ update_docs <- function(pkg_dir) { } write <- function(df, fn) { - write.table(df, file = fn, sep = ",", qmethod = "escape", row.names = FALSE) + write.table(df, file = fn, sep = ",", qmethod = "double", row.names = FALSE) } clean <- function(df) { From 0c5b9e3fa5c8285c077f610dc90dc8ec54bac5db Mon Sep 17 00:00:00 2001 From: Frankie Robertson Date: Mon, 29 Aug 2022 18:00:18 +0300 Subject: [PATCH 2/3] Sort datasets CSV --- doc/datasets.csv | 60 ++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/doc/datasets.csv b/doc/datasets.csv index b8081be0..8de3ed78 100644 --- a/doc/datasets.csv +++ b/doc/datasets.csv @@ -506,6 +506,36 @@ "datasets","volcano","Topographic Information on Auckland's Maunga Whau Volcano",87,61 "datasets","warpbreaks","The Number of Breaks in Yarn during Weaving",54,3 "datasets","women","Average Heights and Weights for American Women",15,2 +"gamair","aral","aral",488,4 +"gamair","aral.bnd","aral.bnd",107,3 +"gamair","bird","bird",25100,7 +"gamair","blowfly","blowfly",180,3 +"gamair","bone","bone",23,4 +"gamair","brain","brain",1567,6 +"gamair","cairo","cairo",3780,7 +"gamair","chicago","chicago",5114,8 +"gamair","chl","chl",13840,7 +"gamair","co2s","co2s",507,4 +"gamair","coast","coast",2091,3 +"gamair","engine","engine",19,3 +"gamair","gas","gas",60,804 +"gamair","harrier","harrier",37,3 +"gamair","hubble","hubble",24,4 +"gamair","ipo","ipo",156,7 +"gamair","mack","mack",634,17 +"gamair","mackp","mackp",1162,9 +"gamair","med","med",1476,25 +"gamair","meh","meh",1476,24 +"gamair","mpg","mpg",205,27 +"gamair","prostate","prostate",654,530 +"gamair","sitka","sitka",1027,6 +"gamair","sole","sole",1575,8 +"gamair","sperm.comp1","sperm.comp1",15,5 +"gamair","sperm.comp2","sperm.comp2",24,11 +"gamair","stomata","stomata",24,4 +"gamair","swer","swer",2196,10 +"gamair","wesdr","wesdr",669,5 +"gamair","wine","wine",47,8 "gap","PD","A study of Parkinson's disease and APOE, LRRK2, SNCA makers",825,22 "gap","aldh2","ALDH2 markers and Alcoholism",263,18 "gap","apoeapoc","APOE/APOC1 markers and Alzheimer's",353,8 @@ -732,33 +762,3 @@ "vcd","VonBort","Von Bortkiewicz Horse Kicks Data",280,4 "vcd","WeldonDice","Weldon's Dice Data",11,2 "vcd","WomenQueue","Women in Queues",11,2 -"gamair","aral.bnd","aral.bnd",107,3 -"gamair","aral","aral",488,4 -"gamair","bird","bird",25100,7 -"gamair","blowfly","blowfly",180,3 -"gamair","bone","bone",23,4 -"gamair","brain","brain",1567,6 -"gamair","cairo","cairo",3780,7 -"gamair","chicago","chicago",5114,8 -"gamair","chl","chl",13840,7 -"gamair","co2s","co2s",507,4 -"gamair","coast","coast",2091,3 -"gamair","engine","engine",19,3 -"gamair","gas","gas",60,804 -"gamair","harrier","harrier",37,3 -"gamair","hubble","hubble",24,4 -"gamair","ipo","ipo",156,7 -"gamair","mack","mack",634,17 -"gamair","mackp","mackp",1162,9 -"gamair","med","med",1476,25 -"gamair","meh","meh",1476,24 -"gamair","mpg","mpg",205,27 -"gamair","prostate","prostate",654,530 -"gamair","sitka","sitka",1027,6 -"gamair","sole","sole",1575,8 -"gamair","sperm.comp1","sperm.comp1",15,5 -"gamair","sperm.comp2","sperm.comp2",24,11 -"gamair","stomata","stomata",24,4 -"gamair","swer","swer",2196,10 -"gamair","wesdr","wesdr",669,5 -"gamair","wine","wine",47,8 From ebcbfef06d38fbd322d6dd0bb9a58d17f65fd799 Mon Sep 17 00:00:00 2001 From: Frankie Robertson Date: Tue, 30 Aug 2022 12:36:40 +0300 Subject: [PATCH 3/3] Allow datasets with a .RData extension as well as .rda --- src/dataset.jl | 5 +++++ src/update_doc.r | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/dataset.jl b/src/dataset.jl index 494fe00c..8df80232 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -7,6 +7,11 @@ const Dataset_typedetect_rows = Dict{Tuple{String, String}, Union{Vector,Dict}}( function dataset(package_name::AbstractString, dataset_name::AbstractString) basename = joinpath(@__DIR__, "..", "data", package_name) + rdataname = joinpath(basename, string(dataset_name, ".RData")) + if isfile(rdataname) + return load(rdataname)[dataset_name] + end + rdaname = joinpath(basename, string(dataset_name, ".rda")) if isfile(rdaname) return load(rdaname)[dataset_name] diff --git a/src/update_doc.r b/src/update_doc.r index c1b3ef65..aecda680 100644 --- a/src/update_doc.r +++ b/src/update_doc.r @@ -32,7 +32,7 @@ do_package_update <- function(data_dir, package_df, old_dataset_df, dataset_df, datasets <- dir(path = file.path(data_dir, package)) # Trim filenames to dataset names - r <- "(.+)\\.(csv\\.gz|rda)$" + r <- "(.+)\\.(csv\\.gz|rda|RData)$" format_recognized <- grepl(r, datasets) if (!(all(format_recognized))) { stop("Unrecognized formats:\n",