diff --git a/pkgdown.yml b/pkgdown.yml index 0c14636..d004882 100644 --- a/pkgdown.yml +++ b/pkgdown.yml @@ -4,7 +4,7 @@ pkgdown_sha: ~ articles: case-study: case-study.html using-reclanc: using-reclanc.html -last_built: 2024-07-22T14:51Z +last_built: 2024-07-22T18:41Z urls: reference: https://kaiaragaki.github.io/reclanc/reference article: https://kaiaragaki.github.io/reclanc/articles diff --git a/reference/predict.clanc.html b/reference/predict.clanc.html index b6379f9..38bf951 100644 --- a/reference/predict.clanc.html +++ b/reference/predict.clanc.html @@ -70,6 +70,11 @@

Argumentsformat +

Character. Are the data "wide" (default), with genes as +columns, or "tall", with genes as rows?

+ +
...

Not used, but required for extensibility.

@@ -77,11 +82,6 @@

Argumentsmethod

If type is numeric, the method of correlation

- -
format.
-

Character. Are the data "wide" (default), with genes as -columns, or "tall", with genes as rows?

-

Value

diff --git a/search.json b/search.json index 57ccba4..9ef5db0 100644 --- a/search.json +++ b/search.json @@ -1 +1 @@ -[{"path":"https://kaiaragaki.github.io/reclanc/CONTRIBUTING.html","id":null,"dir":"","previous_headings":"","what":"Contributing to reclanc","title":"Contributing to reclanc","text":"outlines propose change reclanc.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/CONTRIBUTING.html","id":"fixing-typos","dir":"","previous_headings":"","what":"Fixing typos","title":"Contributing to reclanc","text":"can fix typos, spelling mistakes, grammatical errors documentation directly using GitHub web interface, long changes made source file. generally means ’ll need edit roxygen2 comments .R, .Rd file. can find .R file generates .Rd reading comment first line.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/CONTRIBUTING.html","id":"bigger-changes","dir":"","previous_headings":"","what":"Bigger changes","title":"Contributing to reclanc","text":"want make bigger change, ’s good idea first file issue make sure someone team agrees ’s needed. ’ve found bug, please file issue illustrates bug minimal reprex (also help write unit test, needed). See guide create great issue advice.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/CONTRIBUTING.html","id":"pull-request-process","dir":"","previous_headings":"Bigger changes","what":"Pull request process","title":"Contributing to reclanc","text":"Fork package clone onto computer. haven’t done , recommend using usethis::create_from_github(\"KaiAragaki/reclanc\", fork = TRUE). Install development dependencies devtools::install_dev_deps(), make sure package passes R CMD check running devtools::check(). R CMD check doesn’t pass cleanly, ’s good idea ask help continuing. Create Git branch pull request (PR). recommend using usethis::pr_init(\"brief-description--change\"). Make changes, commit git, create PR running usethis::pr_push(), following prompts browser. title PR briefly describe change. body PR contain Fixes #issue-number. user-facing changes, add bullet top NEWS.md (.e. just first header). Follow style described https://style.tidyverse.org/news.html.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/CONTRIBUTING.html","id":"code-style","dir":"","previous_headings":"Bigger changes","what":"Code style","title":"Contributing to reclanc","text":"New code follow tidyverse style guide. can use styler package apply styles, please don’t restyle code nothing PR. use roxygen2, Markdown syntax, documentation. use testthat unit tests. Contributions test cases included easier accept.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/CONTRIBUTING.html","id":"code-of-conduct","dir":"","previous_headings":"","what":"Code of Conduct","title":"Contributing to reclanc","text":"Please note reclanc project released Contributor Code Conduct. contributing project agree abide terms.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/LICENSE.html","id":null,"dir":"","previous_headings":"","what":"MIT License","title":"MIT License","text":"Copyright (c) 2024 reclanc authors Permission hereby granted, free charge, person obtaining copy software associated documentation files (“Software”), deal Software without restriction, including without limitation rights use, copy, modify, merge, publish, distribute, sublicense, /sell copies Software, permit persons Software furnished , subject following conditions: copyright notice permission notice shall included copies substantial portions Software. SOFTWARE PROVIDED “”, WITHOUT WARRANTY KIND, EXPRESS IMPLIED, INCLUDING LIMITED WARRANTIES MERCHANTABILITY, FITNESS PARTICULAR PURPOSE NONINFRINGEMENT. EVENT SHALL AUTHORS COPYRIGHT HOLDERS LIABLE CLAIM, DAMAGES LIABILITY, WHETHER ACTION CONTRACT, TORT OTHERWISE, ARISING , CONNECTION SOFTWARE USE DEALINGS SOFTWARE.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/articles/case-study.html","id":"introduction","dir":"Articles","previous_headings":"","what":"Introduction","title":"case-study","text":"Let’s consider relatively full-featured, practical use case reclanc. vignette, ’ll go basics fitting models, well leverage tidymodels elaborate things like resampling tuning hyperparameters. ’ll fit final model, use predict subtypes entirely new dataset. vignette tries assume little knowledge machine learning tidymodels.","code":""},{"path":[]},{"path":"https://kaiaragaki.github.io/reclanc/articles/case-study.html","id":"a-simple-fit","dir":"Articles","previous_headings":"Fitting","what":"A simple fit","title":"case-study","text":"Let’s start fitting procedure. first need gene expression data. data ’m using Sjödahl et al. (2012). contains RNA expression 308 bladder cancer tumors. paper, Sjödahl et al. used transcriptional data classify tumors seven molecular subtypes (MS): ’d like apply subtype framework datasets. , first need generate centroids. can begin, though, need convert outcomes factors. case, outcomes molecular subtypes: simplest form, since clanc accepts ExpressionSet objects, following done : problem method, though, idea good fit . active argument specifies number genes used distinguishing features given class. case, class find 5 genes expression patterns peculiar given molecular subtype, subtype 7 (total number subtypes) x 5 (number active genes) = 35 genes (see blog post - better yet - original paper details). gotten better fit genes? selecting genes need? know?","code":"lund <- s3readRDS(\"lund.rds\", \"reclanc-lund\", region = \"us-east-2\") lund #> ExpressionSet (storageMode: lockedEnvironment) #> assayData: 16940 features, 308 samples #> element names: exprs #> protocolData: none #> phenoData #> sampleNames: UC_0001_1 UC_0002_1 ... UC_0785_1 (308 total) #> varLabels: title source ... sample (16 total) #> varMetadata: labelDescription #> featureData: none #> experimentData: use 'experimentData(object)' #> Annotation: table(lund$molecular_subtype) #> #> MS1a MS1b MS2a.1 MS2a.2 MS2b.1 MS2b2.1 MS2b2.2 #> 53 78 30 55 43 20 29 lund$molecular_subtype <- factor(lund$molecular_subtype) simple_centroids <- clanc(lund, classes = \"molecular_subtype\", active = 5) head(simple_centroids$centroids) #> class gene expression pooled_sd active prior #> 1 MS1a CXCL1 6.534490 0.8749133 5 0.1428571 #> 2 MS1a MMD 7.922508 0.6429620 5 0.1428571 #> 3 MS1a C9orf19 8.378910 0.7510552 5 0.1428571 #> 4 MS1a BNC1 5.297095 0.2106762 5 0.1428571 #> 5 MS1a SLFN11 7.362887 0.6824663 5 0.1428571 #> 6 MS1a CRAT 6.004517 0.3425669 5 0.1428571"},{"path":"https://kaiaragaki.github.io/reclanc/articles/case-study.html","id":"setting-the-stage-for-more-elaborate-analyses","dir":"Articles","previous_headings":"Fitting","what":"Setting the stage for more elaborate analyses","title":"case-study","text":"can get started tackling larger questions, let’s take brief detour land tidymodels. tidymodels collection packages make running tuning algorithms like much less painful much standardized. order leverage tidymodels, need buy-data structures. (Aside: don’t mean make buy-sound begrudging. say need, really mean : ’re going specifying long formulas, reason R really, really hates. Emil Hvitfeldt recently (time writing) allowed tidymodels handle long formulas gracefully, using tidymodels infrastructure gift, chore.) Many tidymodels workflows begin model specification. rationale behind separate model specification step model fitting step (whereas base R, generally happen ). reclanc makes easy specify model adding custom engine parsnip::discrim_linear, specifying model looks like : mod doesn’t anything - ’s kind point: specifies model later fit , doesn’t fitting . allows us reuse specification across code. next step wrangle data bit ‘wide’ format, columns outcomes (classes) predictors (genes), rows observations (samples): Finally, specify formula fitting model. uses recipes package tidymodels. delightful package can help preprocess data, ’s scope vignette. Instead, just think way specify formula keeps R blowing : can bundle model specification (mod) preprocessing steps (recipe, just formula) workflow: Now can fit model: ’ll notice results saw previously, demonstrating ’re using tidymodels rather base R, ’re still thing.","code":"library(tidymodels) mod <- discrim_linear() |> set_engine( engine = \"clanc\", # Note: \"clanc\", not \"reclanc\" active = 5 ) wrangled <- data.frame(class = lund$molecular_subtype, t(exprs(lund))) head(wrangled[1:5]) #> class LOC23117 FCGR2B TRIM44 C15orf39 #> UC_0001_1 MS1b 5.565262 5.306654 9.305053 6.430063 #> UC_0002_1 MS2b.1 5.505854 5.731128 9.242790 7.265748 #> UC_0003_1 MS2a.2 5.336140 5.540470 9.888668 7.244976 #> UC_0006_2 MS2b.1 5.576748 5.847743 9.408895 7.377358 #> UC_0007_1 MS2a.2 5.414919 5.510507 10.482469 6.435552 #> UC_0008_1 MS2b.1 5.279174 5.633093 9.112754 7.057977 # Note that the recipe requires 'template data' recipe <- recipe(class ~ ., wrangled) wf <- workflow() |> add_recipe(recipe) |> add_model(mod) wf #> ══ Workflow ════════════════════════════════════════════════════════════════════ #> Preprocessor: Recipe #> Model: discrim_linear() #> #> ── Preprocessor ──────────────────────────────────────────────────────────────── #> 0 Recipe Steps #> #> ── Model ─────────────────────────────────────────────────────────────────────── #> Linear Discriminant Model Specification (classification) #> #> Engine-Specific Arguments: #> active = 5 #> #> Computational engine: clanc tidymodels_fit <- fit(wf, data = wrangled) head(extract_fit_parsnip(tidymodels_fit)$fit$centroids) #> class gene expression pooled_sd active prior #> 1 MS1a CXCL1 6.534490 0.8749133 5 0.1428571 #> 2 MS1a MMD 7.922508 0.6429620 5 0.1428571 #> 3 MS1a C9orf19 8.378910 0.7510552 5 0.1428571 #> 4 MS1a BNC1 5.297095 0.2106762 5 0.1428571 #> 5 MS1a SLFN11 7.362887 0.6824663 5 0.1428571 #> 6 MS1a CRAT 6.004517 0.3425669 5 0.1428571"},{"path":"https://kaiaragaki.github.io/reclanc/articles/case-study.html","id":"measuring-fit-accuracy-with-cross-validation","dir":"Articles","previous_headings":"Fitting","what":"Measuring fit accuracy with cross-validation","title":"case-study","text":"Now ’ve dialed tidymodels framework, can lot elaborate things ease. One concerns whether 5 active genes good choice (active = 5). somewhat simple way determine good choice 5 genes use cross-validation. Cross-validation allows us test good fit training model , say, 80% data, testing rest (see Wikipedia diagram k-fold cross validation). allows us get measure good fit , without break actual test data - general used ’re ready finalize model. Speaking test data, let’s go ahead split now. ’ll lock test data away use ’ve fit final model. , ’ll use cross validation assess good fit , essentially using training data testing data. course, tidymodels makes easy , using rsample::initial_split: train test just subsets original data, containing 80% 20% original data (respectively). also tries maintain relative proportions classes within datasets (set strata = class): Creating folds cross validation nearly initial_split: can reuse workflow wf, contains model formula. difference use fit_resamples, specify metric want use measure good fit (remember every fold chunk data uses test fit). simplicity, let’s use accuracy: can extract accuracy metrics using collect_metrics, roots around fits helpfully extracts metrics, aggregates , calculated standard error: model accuracy 74%. Applying model testing data: Note testing data accuracy (%) approximates training data accuracy (74%).","code":"set.seed(123) splits <- initial_split(wrangled, prop = 0.8, strata = class) train <- training(splits) test <- testing(splits) round(prop.table(table(train$class)), 2) #> #> MS1a MS1b MS2a.1 MS2a.2 MS2b.1 MS2b2.1 MS2b2.2 #> 0.17 0.25 0.10 0.18 0.15 0.07 0.08 round(prop.table(table(test$class)), 2) #> #> MS1a MS1b MS2a.1 MS2a.2 MS2b.1 MS2b2.1 MS2b2.2 #> 0.19 0.27 0.08 0.16 0.11 0.05 0.16 folds <- vfold_cv(train, v = 5, strata = class) folds #> # 5-fold cross-validation using stratification #> # A tibble: 5 × 2 #> splits id #> #> 1 Fold1 #> 2 Fold2 #> 3 Fold3 #> 4 Fold4 #> 5 Fold5 fits <- fit_resamples( wf, folds, metrics = metric_set(accuracy) ) #> 35/35 (100%) genes in centroids found in data #> 35/35 (100%) genes in centroids found in data #> 35/35 (100%) genes in centroids found in data #> 35/35 (100%) genes in centroids found in data #> 35/35 (100%) genes in centroids found in data fits #> # Resampling results #> # 5-fold cross-validation using stratification #> # A tibble: 5 × 4 #> splits id .metrics .notes #> #> 1 Fold1 #> 2 Fold2 #> 3 Fold3 #> 4 Fold4 #> 5 Fold5 metrics <- collect_metrics(fits) metrics #> # A tibble: 1 × 6 #> .metric .estimator mean n std_err .config #> #> 1 accuracy multiclass 0.737 5 0.0289 Preprocessor1_Model1 # Fit a model using *all* of our training data final_fit <- clanc(class ~ ., train, active = 5) # Use it to predict the (known) classes of our test data preds <- predict(final_fit, new_data = test, type = \"class\") #> 35/35 (100%) genes in centroids found in data w_preds <- cbind(preds, test) # Compare known class vs predicted class metric <- accuracy(w_preds, class, .pred_class) metric #> # A tibble: 1 × 3 #> .metric .estimator .estimate #> #> 1 accuracy multiclass 0.734"},{"path":"https://kaiaragaki.github.io/reclanc/articles/case-study.html","id":"tuning-hyperparameters-with-tune","dir":"Articles","previous_headings":"Fitting","what":"Tuning hyperparameters with tune","title":"case-study","text":"Now least measure good model fits, better genes? get away fewer? Running command different numbers drag - fortunately, ’s yet another beautiful package help us: tune. use tune, need re-specify model let tune know parameters want tune: update previous workflow using update_model, let’s just declare new one: specify range values active try: can fit folds using spread values chose: , can collect metrics - time, however, summary metrics values active: graphically: looks like read maximal accuracy around 21 genes - let’s choose 20 genes nice round number: looks like accuracy little better now ’ve chosen optimal number active genes.","code":"tune_mod <- discrim_linear() |> set_engine( engine = \"clanc\", active = tune() ) tune_wf <- workflow() |> add_recipe(recipe) |> add_model(tune_mod) values <- data.frame(active = seq(from = 1, to = 50, by = 4)) values #> active #> 1 1 #> 2 5 #> 3 9 #> 4 13 #> 5 17 #> 6 21 #> 7 25 #> 8 29 #> 9 33 #> 10 37 #> 11 41 #> 12 45 #> 13 49 # This is going to take some time, since we're fitting 5 folds 13 times each. tuned <- tune_grid( tune_wf, folds, metrics = metric_set(accuracy), grid = values ) tuned #> # Tuning results #> # 5-fold cross-validation using stratification #> # A tibble: 5 × 4 #> splits id .metrics .notes #> #> 1 Fold1 #> 2 Fold2 #> 3 Fold3 #> 4 Fold4 #> 5 Fold5 tuned_metrics <- collect_metrics(tuned) tuned_metrics #> # A tibble: 13 × 7 #> active .metric .estimator mean n std_err .config #> #> 1 1 accuracy multiclass 0.585 5 0.0368 Preprocessor1_Model01 #> 2 5 accuracy multiclass 0.737 5 0.0289 Preprocessor1_Model02 #> 3 9 accuracy multiclass 0.748 5 0.0496 Preprocessor1_Model03 #> 4 13 accuracy multiclass 0.781 5 0.0403 Preprocessor1_Model04 #> 5 17 accuracy multiclass 0.770 5 0.0280 Preprocessor1_Model05 #> 6 21 accuracy multiclass 0.774 5 0.0335 Preprocessor1_Model06 #> 7 25 accuracy multiclass 0.785 5 0.0378 Preprocessor1_Model07 #> 8 29 accuracy multiclass 0.794 5 0.0319 Preprocessor1_Model08 #> 9 33 accuracy multiclass 0.773 5 0.0281 Preprocessor1_Model09 #> 10 37 accuracy multiclass 0.790 5 0.0295 Preprocessor1_Model10 #> 11 41 accuracy multiclass 0.794 5 0.0339 Preprocessor1_Model11 #> 12 45 accuracy multiclass 0.815 5 0.0267 Preprocessor1_Model12 #> 13 49 accuracy multiclass 0.815 5 0.0277 Preprocessor1_Model13 ggplot(tuned_metrics, aes(active, mean)) + geom_line() + coord_cartesian(ylim = c(0, 1)) + labs(x = \"Number Active Genes\", y = \"Accuracy\") final_fit_tuned <- clanc(class ~ ., data = train, active = 20) # Use it to predict the (known) classes of our test data: preds <- predict(final_fit_tuned, new_data = test, type = \"class\") #> 140/140 (100%) genes in centroids found in data w_preds <- cbind(preds, test) # Compare known class vs predicted class: metric <- accuracy(w_preds, class, .pred_class) metric #> # A tibble: 1 × 3 #> .metric .estimator .estimate #> #> 1 accuracy multiclass 0.812"},{"path":"https://kaiaragaki.github.io/reclanc/articles/case-study.html","id":"predicting","dir":"Articles","previous_headings":"","what":"Predicting","title":"case-study","text":"Now want apply classifier new data. second dataset RNAseq data 30 bladder cancer cell lines: Predicting incredibly simple. Since ’re using different sequencing method (RNAseq vs array-based sequencing), probably makes sense use correlation based classification rather original distance-based metric used original ClaNC package. can specifying type = \"numeric\" whatever correlation method prefer. Sjödahl paper, seven subtypes simplified five subtypes merging two similar biological pathways activated. ease interpretation, can :","code":"library(cellebrate) cell_rna #> class: DESeqDataSet #> dim: 18548 30 #> metadata(1): version #> assays(2): counts rlog_norm_counts #> rownames(18548): TSPAN6 TNMD ... MT-ND5 MT-ND6 #> rowData names(0): #> colnames(30): 1A6 253JP ... UC7 UC9 #> colData names(5): cell bsl lum call clade cell_preds <- predict( final_fit_tuned, cell_rna, assay = 2, type = \"numeric\", method = \"spearman\" ) #> 118/140 (84%) genes in centroids found in data out <- cbind(colData(cell_rna), cell_preds) |> as_tibble() out #> # A tibble: 30 × 12 #> cell bsl lum call clade .pred_MS1a .pred_MS1b .pred_MS2a.1 #> #> 1 1A6 99.0 1.02 BSL Epithelial Other 0.0600 0.224 0.149 #> 2 253JP 76.6 23.4 BSL Unknown 0.0574 0.240 0.219 #> 3 5637 98.5 1.46 BSL Epithelial Other 0.0958 0.243 0.160 #> 4 BV 49.9 50.1 LUM Unknown 0.0758 0.262 0.238 #> 5 HT1197 56.0 44.0 BSL Epithelial Other 0.119 0.288 0.224 #> 6 HT1376 10.9 89.1 LUM Epithelial Other 0.100 0.277 0.238 #> 7 J82 98.1 1.91 BSL Mesenchymal 0.127 0.292 0.219 #> 8 RT112 0 100 LUM Luminal Papilla… 0.173 0.380 0.294 #> 9 RT4 0 100 LUM Luminal Papilla… 0.134 0.317 0.257 #> 10 RT4V6 0 100 LUM Luminal Papilla… 0.143 0.207 0.165 #> # ℹ 20 more rows #> # ℹ 4 more variables: .pred_MS2a.2 , .pred_MS2b.1 , #> # .pred_MS2b2.1 , .pred_MS2b2.2 plotting_data <- out |> pivot_longer(cols = starts_with(\".pred\")) plotting_data |> ggplot(aes(cell, value, color = name)) + geom_point() + facet_grid(~clade, scales = \"free_x\", space = \"free_x\") table <- plotting_data |> summarize(winner = name[which.max(value)], .by = c(cell, clade)) |> mutate( five = case_when( winner %in% c(\".pred_MS1a\", \".pred_MS1b\") ~ \"Urobasal A\", winner %in% c(\".pred_MS2a.1\", \".pred_MS2a.2\") ~ \"Genomically unstable\", winner == \".pred_MS2b.1\" ~ \"Infiltrated\", winner == \".pred_MS2b2.1\" ~ \"Uro-B\", winner == \".pred_MS2b2.2\" ~ \"SCC-like\" ) ) |> relocate(cell, five, clade) print(table, n = 30) #> # A tibble: 30 × 4 #> cell five clade winner #> #> 1 1A6 SCC-like Epithelial Other .pred_MS2b2.2 #> 2 253JP SCC-like Unknown .pred_MS2b2.2 #> 3 5637 SCC-like Epithelial Other .pred_MS2b2.2 #> 4 BV Urobasal A Unknown .pred_MS1b #> 5 HT1197 SCC-like Epithelial Other .pred_MS2b2.2 #> 6 HT1376 SCC-like Epithelial Other .pred_MS2b2.2 #> 7 J82 Urobasal A Mesenchymal .pred_MS1b #> 8 RT112 Urobasal A Luminal Papillary .pred_MS1b #> 9 RT4 Urobasal A Luminal Papillary .pred_MS1b #> 10 RT4V6 Urobasal A Luminal Papillary .pred_MS1b #> 11 SCaBER SCC-like Epithelial Other .pred_MS2b2.2 #> 12 SW780 Urobasal A Luminal Papillary .pred_MS1b #> 13 T24 SCC-like Mesenchymal .pred_MS2b2.2 #> 14 TCCSup SCC-like Mesenchymal .pred_MS2b2.2 #> 15 UC10 SCC-like Epithelial Other .pred_MS2b2.2 #> 16 UC11 SCC-like Mesenchymal .pred_MS2b2.2 #> 17 UC12 Urobasal A Mesenchymal .pred_MS1b #> 18 UC13 SCC-like Mesenchymal .pred_MS2b2.2 #> 19 UC14 Urobasal A Luminal Papillary .pred_MS1b #> 20 UC15 SCC-like Epithelial Other .pred_MS2b2.2 #> 21 UC16 SCC-like Epithelial Other .pred_MS2b2.2 #> 22 UC17 SCC-like Luminal Papillary .pred_MS2b2.2 #> 23 UC18 SCC-like Mesenchymal .pred_MS2b2.2 #> 24 UC1 Urobasal A Luminal Papillary .pred_MS1b #> 25 UC3 SCC-like Mesenchymal .pred_MS2b2.2 #> 26 UC4 Urobasal A Unknown .pred_MS1b #> 27 UC5 Urobasal A Luminal Papillary .pred_MS1b #> 28 UC6 Urobasal A Luminal Papillary .pred_MS1b #> 29 UC7 Urobasal A Epithelial Other .pred_MS1b #> 30 UC9 Genomically unstable Epithelial Other .pred_MS2a.1"},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"introduction","dir":"Articles","previous_headings":"","what":"Introduction","title":"using-reclanc","text":"vignette provide brief introduction basic usage reclanc. ’re interested reclanc works, ’d recommend reading blog post wrote original paper Alan Dabney, created original ClaNC algorithm.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"fitting","dir":"Articles","previous_headings":"","what":"Fitting","title":"using-reclanc","text":"create new centroids existing expression data, use clanc function. reclanc provides synthetic expression data can work : data include 12 samples, 6 class “” 6 class “B”, 100 genes. reclanc able ingest variety common formats expression data. Objects like SummarizedExperiments ExpressionSets frequently used bioinformatic analyses arrange data samples columns genes rows. conflict expected formula input base R, predictors (genes) outcomes (classes) columns. reclanc eases friction expecting input common format, abstracting away wrangling aspect analysis. , two broad categories input - ‘wide data’ ‘tall data’.","code":"library(reclanc) library(SummarizedExperiment) library(parsnip) lapply(synthetic_expression, head) #> $expression #> sample1 sample2 sample3 sample4 sample5 sample6 sample7 sample8 #> gene1 8.097529 7.119188 7.304400 7.554689 7.953206 7.714925 7.512700 8.597547 #> gene2 8.641837 9.400416 8.500865 8.878687 8.318438 8.728683 7.812591 7.638167 #> gene3 3.436236 4.317915 3.435193 3.515755 3.024976 4.762209 5.048956 2.006646 #> gene4 4.368008 5.212750 4.618249 4.201365 3.195294 4.707750 5.126769 6.178658 #> gene5 2.423974 3.563816 4.062362 2.163278 2.021435 2.813873 0.000000 4.652358 #> gene6 5.371205 5.919809 4.366915 4.805534 4.834856 5.622157 3.883531 3.593082 #> sample9 sample10 sample11 sample12 #> gene1 6.475641 7.648858 8.637526 7.345038 #> gene2 8.110285 7.906104 7.424728 7.927039 #> gene3 2.739211 3.111668 3.161077 4.306611 #> gene4 5.170265 4.259578 5.872855 6.159023 #> gene5 1.532242 3.399823 3.691250 1.932937 #> gene6 4.246205 4.637316 3.575837 2.730452 #> #> $classes #> [1] A A A A A A #> Levels: A B"},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"wide-inputs","dir":"Articles","previous_headings":"Fitting","what":"Wide inputs","title":"using-reclanc","text":"Wide inputs require data predictors outcomes columns, together, single data.frame.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"formula","dir":"Articles","previous_headings":"Fitting > Wide inputs","what":"Formula","title":"using-reclanc","text":"","code":"form_data <- cbind( class = synthetic_expression$classes, as.data.frame(t(synthetic_expression$expression)) ) head(form_data[1:5]) #> class gene1 gene2 gene3 gene4 #> sample1 A 8.097529 8.641837 3.436236 4.368008 #> sample2 A 7.119188 9.400416 4.317915 5.212750 #> sample3 A 7.304400 8.500865 3.435193 4.618249 #> sample4 A 7.554689 8.878687 3.515755 4.201365 #> sample5 A 7.953206 8.318438 3.024976 3.195294 #> sample6 A 7.714925 8.728683 4.762209 4.707750 clanc(class ~ ., form_data, active = 5) #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5"},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"recipe","dir":"Articles","previous_headings":"Fitting > Wide inputs","what":"recipe","title":"using-reclanc","text":"reclanc also supports tidymodels workflows:","code":"discrim_linear() |> set_engine(\"clanc\", active = 5) |> fit(class ~ ., data = form_data) #> parsnip model object #> #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5"},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"tall-inputs","dir":"Articles","previous_headings":"Fitting","what":"Tall inputs","title":"using-reclanc","text":"Tall inputs require genes rows samples columns","code":""},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"data-framematrix","dir":"Articles","previous_headings":"Fitting > Tall inputs","what":"data.frame/matrix","title":"using-reclanc","text":"often convenient supply data.frame, particularly data-munging done. data.frame matrix inputs require expression genes column names sample IDs rownames, well factor vector classes:","code":"clanc( synthetic_expression$expression, classes = synthetic_expression$classes, active = 5 ) #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5"},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"summarizedexperiment","dir":"Articles","previous_headings":"Fitting > Tall inputs","what":"SummarizedExperiment","title":"using-reclanc","text":"common formats expression SummarizedExperiments ExpressionSets: can specify name colData (pData ExpressionSets) column contains classes classes argument:","code":"se <- SummarizedExperiment( synthetic_expression$expression, colData = DataFrame(class = synthetic_expression$classes) ) se #> class: SummarizedExperiment #> dim: 100 12 #> metadata(0): #> assays(1): '' #> rownames(100): gene1 gene2 ... gene99 gene100 #> rowData names(0): #> colnames(12): sample1 sample2 ... sample11 sample12 #> colData names(1): class fit <- clanc( se, classes = \"class\", active = 20, assay = 1 # Index of assay - SummarizedExperiments only ) fit #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene11 2.2992343 1.2044848 20 0.5 #> 2 A gene2 8.7448209 0.3147537 20 0.5 #> 3 A gene13 8.9364621 0.3418472 20 0.5 #> 4 A gene20 2.1925558 1.3104010 20 0.5 #> 5 A gene10 4.9557850 0.8571716 20 0.5 #> 6 A gene21 6.5846813 0.5279636 20 0.5 #> 7 A gene100 5.6455200 0.6175104 20 0.5 #> 8 A gene22 6.1650079 0.4699756 20 0.5 #> 9 A gene46 6.7344030 0.8233370 20 0.5 #> 10 A gene24 4.3073008 0.7214700 20 0.5 #> 11 A gene15 2.4254020 1.1910158 20 0.5 #> 12 A gene25 5.0353875 0.7498139 20 0.5 #> 13 A gene17 2.9424148 0.6628466 20 0.5 #> 14 A gene4 4.3839026 0.7144711 20 0.5 #> 15 A gene56 6.3441126 0.4078736 20 0.5 #> 16 A gene41 4.3285163 0.6317005 20 0.5 #> 17 A gene57 4.2237139 0.9531773 20 0.5 #> 18 A gene7 5.5545202 0.7875124 20 0.5 #> 19 A gene58 5.6162919 0.8161951 20 0.5 #> 20 A gene12 7.5147181 0.4779155 20 0.5 #> 21 A gene6 5.1534126 0.6194184 20 0.5 #> 22 A gene51 6.6256136 0.7737520 20 0.5 #> 23 A gene60 4.7434923 1.2945446 20 0.5 #> 24 A gene52 3.7437977 0.5173769 20 0.5 #> 25 A gene63 8.9293980 0.5635262 20 0.5 #> 26 A gene53 4.3774614 0.8370528 20 0.5 #> 27 A gene66 7.0081742 0.5883218 20 0.5 #> 28 A gene83 3.6532038 0.8444393 20 0.5 #> 29 A gene67 6.1384613 0.3677756 20 0.5 #> 30 A gene85 5.2179679 0.5930857 20 0.5 #> 31 A gene88 4.6008044 1.0603007 20 0.5 #> 32 A gene70 1.3073340 1.1264747 20 0.5 #> 33 A gene47 9.4528373 0.2030726 20 0.5 #> 34 A gene90 0.9794695 1.3272423 20 0.5 #> 35 A gene74 4.0285071 0.4940783 20 0.5 #> 36 A gene94 7.7773183 0.5375914 20 0.5 #> 37 A gene78 2.1763395 1.6805560 20 0.5 #> 38 A gene95 6.8731844 0.4462475 20 0.5 #> 39 A gene79 3.7138831 1.0587367 20 0.5 #> 40 A gene98 4.5710407 0.6798799 20 0.5 #> 41 B gene10 4.2378889 0.8571716 20 0.5 #> 42 B gene2 8.2739866 0.3147537 20 0.5 #> 43 B gene100 5.0435040 0.6175104 20 0.5 #> 44 B gene20 3.4781598 1.3104010 20 0.5 #> 45 B gene46 7.0200767 0.8233370 20 0.5 #> 46 B gene11 1.2780748 1.2044848 20 0.5 #> 47 B gene12 8.0722841 0.4779155 20 0.5 #> 48 B gene22 6.4609169 0.4699756 20 0.5 #> 49 B gene51 5.8920005 0.7737520 20 0.5 #> 50 B gene13 9.9381374 0.3418472 20 0.5 #> 51 B gene15 1.6008569 1.1910158 20 0.5 #> 52 B gene25 4.5015558 0.7498139 20 0.5 #> 53 B gene17 2.5005839 0.6628466 20 0.5 #> 54 B gene4 4.9225469 0.7144711 20 0.5 #> 55 B gene56 6.1067832 0.4078736 20 0.5 #> 56 B gene41 5.5183538 0.6317005 20 0.5 #> 57 B gene57 3.1175271 0.9531773 20 0.5 #> 58 B gene7 5.3367575 0.7875124 20 0.5 #> 59 B gene21 5.7894231 0.5279636 20 0.5 #> 60 B gene47 9.5903798 0.2030726 20 0.5 #> 61 B gene6 4.4655748 0.6194184 20 0.5 #> 62 B gene74 3.2265977 0.4940783 20 0.5 #> 63 B gene24 3.3704670 0.7214700 20 0.5 #> 64 B gene52 2.4385792 0.5173769 20 0.5 #> 65 B gene63 8.3234317 0.5635262 20 0.5 #> 66 B gene53 3.8479638 0.8370528 20 0.5 #> 67 B gene66 7.8915875 0.5883218 20 0.5 #> 68 B gene83 4.2757218 0.8444393 20 0.5 #> 69 B gene67 6.0190764 0.3677756 20 0.5 #> 70 B gene85 5.8877225 0.5930857 20 0.5 #> 71 B gene79 4.1894417 1.0587367 20 0.5 #> 72 B gene58 4.7194615 0.8161951 20 0.5 #> 73 B gene88 5.5945405 1.0603007 20 0.5 #> 74 B gene70 1.5987845 1.1264747 20 0.5 #> 75 B gene90 1.4036889 1.3272423 20 0.5 #> 76 B gene60 5.2336968 1.2945446 20 0.5 #> 77 B gene78 1.6625207 1.6805560 20 0.5 #> 78 B gene95 6.2881728 0.4462475 20 0.5 #> 79 B gene98 4.1346296 0.6798799 20 0.5 #> 80 B gene94 8.4222554 0.5375914 20 0.5"},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"predicting","dir":"Articles","previous_headings":"","what":"Predicting","title":"using-reclanc","text":"fit can used predict classes new samples new data. new data can come form matrix, data.frame, SummarizedExperiment, ExpressionSet, expected input Using type = \"class\" predict classes using metric provided Alan Dabney original ClaNC paper. However, particularly comparing across datasets may transformed differently, may accurate use correlation based metric:","code":"predict(fit, new_data = se, type = \"class\") #> 40/40 (100%) genes in centroids found in data #> # A tibble: 12 × 1 #> .pred_class #> #> 1 A #> 2 A #> 3 A #> 4 A #> 5 A #> 6 A #> 7 B #> 8 B #> 9 B #> 10 B #> 11 B #> 12 B predict(fit, new_data = se, type = \"numeric\", method = \"spearman\") #> 40/40 (100%) genes in centroids found in data #> # A tibble: 12 × 2 #> .pred_A .pred_B #> #> 1 0.901 0.811 #> 2 0.929 0.849 #> 3 0.932 0.840 #> 4 0.912 0.829 #> 5 0.862 0.770 #> 6 0.932 0.869 #> 7 0.776 0.904 #> 8 0.824 0.931 #> 9 0.828 0.924 #> 10 0.855 0.946 #> 11 0.805 0.915 #> 12 0.750 0.869"},{"path":"https://kaiaragaki.github.io/reclanc/authors.html","id":null,"dir":"","previous_headings":"","what":"Authors","title":"Authors and Citation","text":"Kai Aragaki. Author, maintainer. Alan Dabney. Author, copyright holder. Original creator ClaNC","code":""},{"path":"https://kaiaragaki.github.io/reclanc/authors.html","id":"citation","dir":"","previous_headings":"","what":"Citation","title":"Authors and Citation","text":"Alan D (2005). “Classification microarrays nearest centroids.” Bioinformatics, 21(22), 4148-4154. doi:10.1093/bioinformatics/bti681.","code":"@Article{, title = {Classification of microarrays to nearest centroids}, author = {Dabney Alan}, journal = {Bioinformatics}, year = {2005}, volume = {21}, number = {22}, pages = {4148-4154}, doi = {10.1093/bioinformatics/bti681}, }"},{"path":"https://kaiaragaki.github.io/reclanc/index.html","id":"reclanc","dir":"","previous_headings":"","what":"A Revival of the ClaNC Algorithm","title":"A Revival of the ClaNC Algorithm","text":"reclanc revival ClaNC (Classification microarrays nearest centroids), Alan R. Dabney. Since source lost (least knowledge), code comes heavy modification. reclanc nearest-centroid classifier expression data. tends little sensitive accurate similar models like PAM. Besides mere existence, reclanc differs slightly original ClaNC package ways: reclanc supports wide variety inputs (data.frame, matrix, formula, recipe, ExpressionSet, SummarizedExperiment) reclanc plays nicely tidymodels, offloads things like making folds rsample tuning tune (see vignette leverage tidymodels reclanc). Provides prediction method based correlation, rather distance - useful predicting classes data different sequencing modalities","code":""},{"path":"https://kaiaragaki.github.io/reclanc/index.html","id":"installation","dir":"","previous_headings":"","what":"Installation","title":"A Revival of the ClaNC Algorithm","text":"can install development version reclanc like :","code":"# install.packages(\"pak\") pak::pak(\"KaiAragaki/reclanc\")"},{"path":"https://kaiaragaki.github.io/reclanc/index.html","id":"how-to-use-it","dir":"","previous_headings":"","what":"How to use it","title":"A Revival of the ClaNC Algorithm","text":"information basic usage, see vignette. case study, well optimize active parameter, see vignette.","code":"library(reclanc) lapply(synthetic_expression, head) # dummy data #> $expression #> sample1 sample2 sample3 sample4 sample5 sample6 sample7 sample8 #> gene1 8.097529 7.119188 7.304400 7.554689 7.953206 7.714925 7.512700 8.597547 #> gene2 8.641837 9.400416 8.500865 8.878687 8.318438 8.728683 7.812591 7.638167 #> gene3 3.436236 4.317915 3.435193 3.515755 3.024976 4.762209 5.048956 2.006646 #> gene4 4.368008 5.212750 4.618249 4.201365 3.195294 4.707750 5.126769 6.178658 #> gene5 2.423974 3.563816 4.062362 2.163278 2.021435 2.813873 0.000000 4.652358 #> gene6 5.371205 5.919809 4.366915 4.805534 4.834856 5.622157 3.883531 3.593082 #> sample9 sample10 sample11 sample12 #> gene1 6.475641 7.648858 8.637526 7.345038 #> gene2 8.110285 7.906104 7.424728 7.927039 #> gene3 2.739211 3.111668 3.161077 4.306611 #> gene4 5.170265 4.259578 5.872855 6.159023 #> gene5 1.532242 3.399823 3.691250 1.932937 #> gene6 4.246205 4.637316 3.575837 2.730452 #> #> $classes #> [1] A A A A A A #> Levels: A B centroids <- clanc( synthetic_expression$expression, classes = synthetic_expression$classes, active = 5 ) centroids #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene13 8.936462 0.3418472 5 0.5 #> 2 A gene21 7.379940 0.5279636 5 0.5 #> 3 A gene2 8.744821 0.3147537 5 0.5 #> 4 A gene74 4.028507 0.4940783 5 0.5 #> 5 A gene41 4.328516 0.6317005 5 0.5 #> 6 A gene66 6.124761 0.5883218 5 0.5 #> 7 A gene24 4.307301 0.7214700 5 0.5 #> 8 A gene95 6.288173 0.4462475 5 0.5 #> 9 A gene94 7.777318 0.5375914 5 0.5 #> 10 A gene52 3.743798 0.5173769 5 0.5 #> 11 B gene13 9.938137 0.3418472 5 0.5 #> 12 B gene2 8.273987 0.3147537 5 0.5 #> 13 B gene21 6.584681 0.5279636 5 0.5 #> 14 B gene41 5.518354 0.6317005 5 0.5 #> 15 B gene74 3.226598 0.4940783 5 0.5 #> 16 B gene24 3.370467 0.7214700 5 0.5 #> 17 B gene66 7.008174 0.5883218 5 0.5 #> 18 B gene94 8.422255 0.5375914 5 0.5 #> 19 B gene95 5.703161 0.4462475 5 0.5 #> 20 B gene52 2.438579 0.5173769 5 0.5"},{"path":"https://kaiaragaki.github.io/reclanc/index.html","id":"how-it-works","dir":"","previous_headings":"","what":"How it works","title":"A Revival of the ClaNC Algorithm","text":"can find gentle introduction reclanc works -depth statistically rigorous definition algorithm works original paper.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/index.html","id":"citation","dir":"","previous_headings":"","what":"Citation","title":"A Revival of the ClaNC Algorithm","text":"Citation original ClaNC paper: Alan R. Dabney, Classification microarrays nearest centroids, Bioinformatics, Volume 21, Issue 22, November 2005, Pages 4148–4154, https://doi.org/10.1093/bioinformatics/bti681","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/clanc.html","id":null,"dir":"Reference","previous_headings":"","what":"Calculate centroids from expression data with ClaNC — clanc","title":"Calculate centroids from expression data with ClaNC — clanc","text":"Calculate centroids expression data ClaNC","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/clanc.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Calculate centroids from expression data with ClaNC — clanc","text":"","code":"clanc(x, ...) # Default S3 method clanc(x, ...) # S3 method for class 'data.frame' clanc(x, classes, active, priors = \"equal\", ...) # S3 method for class 'matrix' clanc(x, classes, active, priors = \"equal\", ...) # S3 method for class 'SummarizedExperiment' clanc(x, classes, active, priors = \"equal\", assay = 1, ...) # S3 method for class 'ExpressionSet' clanc(x, classes, active, priors = \"equal\", ...) # S3 method for class 'formula' clanc(formula, data, active, priors = \"equal\", ...) # S3 method for class 'recipe' clanc(x, data, active, priors = \"equal\", ...)"},{"path":"https://kaiaragaki.github.io/reclanc/reference/clanc.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Calculate centroids from expression data with ClaNC — clanc","text":"x Depending context: data frame expression. matrix expression. recipe specifying set preprocessing steps created recipes::recipe(). ExpressionSet. SummarizedExperiment assay containing expression. Expression library-size corrected, scaled. supplying data frame, matrix, ExpressionSet, SummarizedExperiment, rows represent genes, columns represent samples (standard expression data). column names sample IDs, row names gene IDs. recipe provided, data genes columns (match formula provided recipe.) ... currently used, required extensibility. classes x data frame matrix, class contains class labels form either: data frame 1 factor column factor vector. x ExpressionSet SummarizedExperiment, class name column pData(x) colData(x) contains classes factor. active Either single number numeric vector equal length number unique class labels. Represents number class-specific genes selected centroid. Note different numbers genes can selected class. See details. x ExpressionSet SummarizedExperiment, active can additionally name column pData(x) colData(x) contains numeric vector priors Can take variety values: \"equal\" - class equal prior \"class\" - class prior equal frequency training set numeric vector length equal number classes x ExpressionSet SummarizedExperiment, active can additionally name column pData(x) colData(x) contains numeric vector assay SummarizedExperiment used, index name assay formula formula specifying classes left-hand side, predictor terms right-hand side. data recipe formula used, data specified : data frame containing expression classes, columns genes class, rows samples.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/clanc.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Calculate centroids from expression data with ClaNC — clanc","text":"clanc object.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/clanc.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"Calculate centroids from expression data with ClaNC — clanc","text":"original description ClaNC can found active sets number class-specific genes, centroid number genes. explain way example, active = 5 3 classes, centroid 15 genes, 5 genes particular given class. genes 'active' class, values mean class. genes active given class, values overall expression given gene across classes.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/clanc.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Calculate centroids from expression data with ClaNC — clanc","text":"","code":"expression_matrix <- synthetic_expression$expression head(expression_matrix) #> sample1 sample2 sample3 sample4 sample5 sample6 sample7 sample8 #> gene1 8.097529 7.119188 7.304400 7.554689 7.953206 7.714925 7.512700 8.597547 #> gene2 8.641837 9.400416 8.500865 8.878687 8.318438 8.728683 7.812591 7.638167 #> gene3 3.436236 4.317915 3.435193 3.515755 3.024976 4.762209 5.048956 2.006646 #> gene4 4.368008 5.212750 4.618249 4.201365 3.195294 4.707750 5.126769 6.178658 #> gene5 2.423974 3.563816 4.062362 2.163278 2.021435 2.813873 0.000000 4.652358 #> gene6 5.371205 5.919809 4.366915 4.805534 4.834856 5.622157 3.883531 3.593082 #> sample9 sample10 sample11 sample12 #> gene1 6.475641 7.648858 8.637526 7.345038 #> gene2 8.110285 7.906104 7.424728 7.927039 #> gene3 2.739211 3.111668 3.161077 4.306611 #> gene4 5.170265 4.259578 5.872855 6.159023 #> gene5 1.532242 3.399823 3.691250 1.932937 #> gene6 4.246205 4.637316 3.575837 2.730452 classes <- synthetic_expression$classes classes #> [1] A A A A A A B B B B B B #> Levels: A B # data.frame/tibble/matrix interface: clanc(expression_matrix, classes = classes, active = 5, priors = \"equal\") #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5 #> # Formula interface: # Data must have class included as a column # Genes must be *columns* and samples must be *rows* # Hence the data transposition. for_formula <- data.frame(class = classes, t(expression_matrix)) clanc(class ~ ., for_formula, active = 5, priors = \"equal\") #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5 #> # Recipes interface: rec <- recipes::recipe(class ~ ., data = for_formula) clanc(rec, for_formula, active = 5, priors = \"equal\") #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5 #> # SummarizedExperiment interface: se <- SummarizedExperiment::SummarizedExperiment( expression_matrix, colData = data.frame( class = classes, active = 5, prior = c(0.5, 0.5) ) ) clanc(se, classes = \"class\", active = \"active\", priors = \"equal\") #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5 #> # ExpressionSet interface: adf <- data.frame( row.names = colnames(expression_matrix), class = classes ) |> Biobase::AnnotatedDataFrame() es <- Biobase::ExpressionSet(expression_matrix, adf) clanc(es, classes = \"class\", active = 5, priors = 0.5) #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5 #>"},{"path":"https://kaiaragaki.github.io/reclanc/reference/predict.clanc.html","id":null,"dir":"Reference","previous_headings":"","what":"Predict from a clanc — predict.clanc","title":"Predict from a clanc — predict.clanc","text":"Predict clanc","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/predict.clanc.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Predict from a clanc — predict.clanc","text":"","code":"# S3 method for class 'clanc' predict(object, new_data, type, assay = NULL, format = c(\"wide\", \"tall\"), ...)"},{"path":"https://kaiaragaki.github.io/reclanc/reference/predict.clanc.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Predict from a clanc — predict.clanc","text":"object clanc object. new_data data frame matrix new predictors. type single character. type predictions generate. Valid options : \"numeric\" numeric predictions. assay object inherits SummarizedExperiment, index assay. ... used, required extensibility. method type numeric, method correlation format. Character. data \"wide\" (default), genes columns, \"tall\", genes rows?","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/predict.clanc.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Predict from a clanc — predict.clanc","text":"tibble predictions. number rows tibble guaranteed number rows new_data.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/synthetic_expression.html","id":null,"dir":"Reference","previous_headings":"","what":"Synthetic Expression of Two Distinct Classes — synthetic_expression","title":"Synthetic Expression of Two Distinct Classes — synthetic_expression","text":"Synthetic Expression Two Distinct Classes","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/synthetic_expression.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Synthetic Expression of Two Distinct Classes — synthetic_expression","text":"","code":"synthetic_expression"},{"path":[]},{"path":"https://kaiaragaki.github.io/reclanc/reference/synthetic_expression.html","id":"synthetic-expression","dir":"Reference","previous_headings":"","what":"synthetic_expression","title":"Synthetic Expression of Two Distinct Classes — synthetic_expression","text":"list containing two items: expression Normalized log expression 12 samples across 100 genes classes factor vector classes 12 samples","code":""}] +[{"path":"https://kaiaragaki.github.io/reclanc/CONTRIBUTING.html","id":null,"dir":"","previous_headings":"","what":"Contributing to reclanc","title":"Contributing to reclanc","text":"outlines propose change reclanc.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/CONTRIBUTING.html","id":"fixing-typos","dir":"","previous_headings":"","what":"Fixing typos","title":"Contributing to reclanc","text":"can fix typos, spelling mistakes, grammatical errors documentation directly using GitHub web interface, long changes made source file. generally means ’ll need edit roxygen2 comments .R, .Rd file. can find .R file generates .Rd reading comment first line.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/CONTRIBUTING.html","id":"bigger-changes","dir":"","previous_headings":"","what":"Bigger changes","title":"Contributing to reclanc","text":"want make bigger change, ’s good idea first file issue make sure someone team agrees ’s needed. ’ve found bug, please file issue illustrates bug minimal reprex (also help write unit test, needed). See guide create great issue advice.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/CONTRIBUTING.html","id":"pull-request-process","dir":"","previous_headings":"Bigger changes","what":"Pull request process","title":"Contributing to reclanc","text":"Fork package clone onto computer. haven’t done , recommend using usethis::create_from_github(\"KaiAragaki/reclanc\", fork = TRUE). Install development dependencies devtools::install_dev_deps(), make sure package passes R CMD check running devtools::check(). R CMD check doesn’t pass cleanly, ’s good idea ask help continuing. Create Git branch pull request (PR). recommend using usethis::pr_init(\"brief-description--change\"). Make changes, commit git, create PR running usethis::pr_push(), following prompts browser. title PR briefly describe change. body PR contain Fixes #issue-number. user-facing changes, add bullet top NEWS.md (.e. just first header). Follow style described https://style.tidyverse.org/news.html.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/CONTRIBUTING.html","id":"code-style","dir":"","previous_headings":"Bigger changes","what":"Code style","title":"Contributing to reclanc","text":"New code follow tidyverse style guide. can use styler package apply styles, please don’t restyle code nothing PR. use roxygen2, Markdown syntax, documentation. use testthat unit tests. Contributions test cases included easier accept.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/CONTRIBUTING.html","id":"code-of-conduct","dir":"","previous_headings":"","what":"Code of Conduct","title":"Contributing to reclanc","text":"Please note reclanc project released Contributor Code Conduct. contributing project agree abide terms.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/LICENSE.html","id":null,"dir":"","previous_headings":"","what":"MIT License","title":"MIT License","text":"Copyright (c) 2024 reclanc authors Permission hereby granted, free charge, person obtaining copy software associated documentation files (“Software”), deal Software without restriction, including without limitation rights use, copy, modify, merge, publish, distribute, sublicense, /sell copies Software, permit persons Software furnished , subject following conditions: copyright notice permission notice shall included copies substantial portions Software. SOFTWARE PROVIDED “”, WITHOUT WARRANTY KIND, EXPRESS IMPLIED, INCLUDING LIMITED WARRANTIES MERCHANTABILITY, FITNESS PARTICULAR PURPOSE NONINFRINGEMENT. EVENT SHALL AUTHORS COPYRIGHT HOLDERS LIABLE CLAIM, DAMAGES LIABILITY, WHETHER ACTION CONTRACT, TORT OTHERWISE, ARISING , CONNECTION SOFTWARE USE DEALINGS SOFTWARE.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/articles/case-study.html","id":"introduction","dir":"Articles","previous_headings":"","what":"Introduction","title":"case-study","text":"Let’s consider relatively full-featured, practical use case reclanc. vignette, ’ll go basics fitting models, well leverage tidymodels elaborate things like resampling tuning hyperparameters. ’ll fit final model, use predict subtypes entirely new dataset. vignette tries assume little knowledge machine learning tidymodels.","code":""},{"path":[]},{"path":"https://kaiaragaki.github.io/reclanc/articles/case-study.html","id":"a-simple-fit","dir":"Articles","previous_headings":"Fitting","what":"A simple fit","title":"case-study","text":"Let’s start fitting procedure. first need gene expression data. data ’m using Sjödahl et al. (2012). contains RNA expression 308 bladder cancer tumors. paper, Sjödahl et al. used transcriptional data classify tumors seven molecular subtypes (MS): ’d like apply subtype framework datasets. , first need generate centroids. can begin, though, need convert outcomes factors. case, outcomes molecular subtypes: simplest form, since clanc accepts ExpressionSet objects, following done : problem method, though, idea good fit . active argument specifies number genes used distinguishing features given class. case, class find 5 genes expression patterns peculiar given molecular subtype, subtype 7 (total number subtypes) x 5 (number active genes) = 35 genes (see blog post - better yet - original paper details). gotten better fit genes? selecting genes need? know?","code":"lund <- s3readRDS(\"lund.rds\", \"reclanc-lund\", region = \"us-east-2\") lund #> ExpressionSet (storageMode: lockedEnvironment) #> assayData: 16940 features, 308 samples #> element names: exprs #> protocolData: none #> phenoData #> sampleNames: UC_0001_1 UC_0002_1 ... UC_0785_1 (308 total) #> varLabels: title source ... sample (16 total) #> varMetadata: labelDescription #> featureData: none #> experimentData: use 'experimentData(object)' #> Annotation: table(lund$molecular_subtype) #> #> MS1a MS1b MS2a.1 MS2a.2 MS2b.1 MS2b2.1 MS2b2.2 #> 53 78 30 55 43 20 29 lund$molecular_subtype <- factor(lund$molecular_subtype) simple_centroids <- clanc(lund, classes = \"molecular_subtype\", active = 5) head(simple_centroids$centroids) #> class gene expression pooled_sd active prior #> 1 MS1a CXCL1 6.534490 0.8749133 5 0.1428571 #> 2 MS1a MMD 7.922508 0.6429620 5 0.1428571 #> 3 MS1a C9orf19 8.378910 0.7510552 5 0.1428571 #> 4 MS1a BNC1 5.297095 0.2106762 5 0.1428571 #> 5 MS1a SLFN11 7.362887 0.6824663 5 0.1428571 #> 6 MS1a CRAT 6.004517 0.3425669 5 0.1428571"},{"path":"https://kaiaragaki.github.io/reclanc/articles/case-study.html","id":"setting-the-stage-for-more-elaborate-analyses","dir":"Articles","previous_headings":"Fitting","what":"Setting the stage for more elaborate analyses","title":"case-study","text":"can get started tackling larger questions, let’s take brief detour land tidymodels. tidymodels collection packages make running tuning algorithms like much less painful much standardized. order leverage tidymodels, need buy-data structures. (Aside: don’t mean make buy-sound begrudging. say need, really mean : ’re going specifying long formulas, reason R really, really hates. Emil Hvitfeldt recently (time writing) allowed tidymodels handle long formulas gracefully, using tidymodels infrastructure gift, chore.) Many tidymodels workflows begin model specification. rationale behind separate model specification step model fitting step (whereas base R, generally happen ). reclanc makes easy specify model adding custom engine parsnip::discrim_linear, specifying model looks like : mod doesn’t anything - ’s kind point: specifies model later fit , doesn’t fitting . allows us reuse specification across code. next step wrangle data bit ‘wide’ format, columns outcomes (classes) predictors (genes), rows observations (samples): Finally, specify formula fitting model. uses recipes package tidymodels. delightful package can help preprocess data, ’s scope vignette. Instead, just think way specify formula keeps R blowing : can bundle model specification (mod) preprocessing steps (recipe, just formula) workflow: Now can fit model: ’ll notice results saw previously, demonstrating ’re using tidymodels rather base R, ’re still thing.","code":"library(tidymodels) mod <- discrim_linear() |> set_engine( engine = \"clanc\", # Note: \"clanc\", not \"reclanc\" active = 5 ) wrangled <- data.frame(class = lund$molecular_subtype, t(exprs(lund))) head(wrangled[1:5]) #> class LOC23117 FCGR2B TRIM44 C15orf39 #> UC_0001_1 MS1b 5.565262 5.306654 9.305053 6.430063 #> UC_0002_1 MS2b.1 5.505854 5.731128 9.242790 7.265748 #> UC_0003_1 MS2a.2 5.336140 5.540470 9.888668 7.244976 #> UC_0006_2 MS2b.1 5.576748 5.847743 9.408895 7.377358 #> UC_0007_1 MS2a.2 5.414919 5.510507 10.482469 6.435552 #> UC_0008_1 MS2b.1 5.279174 5.633093 9.112754 7.057977 # Note that the recipe requires 'template data' recipe <- recipe(class ~ ., wrangled) wf <- workflow() |> add_recipe(recipe) |> add_model(mod) wf #> ══ Workflow ════════════════════════════════════════════════════════════════════ #> Preprocessor: Recipe #> Model: discrim_linear() #> #> ── Preprocessor ──────────────────────────────────────────────────────────────── #> 0 Recipe Steps #> #> ── Model ─────────────────────────────────────────────────────────────────────── #> Linear Discriminant Model Specification (classification) #> #> Engine-Specific Arguments: #> active = 5 #> #> Computational engine: clanc tidymodels_fit <- fit(wf, data = wrangled) head(extract_fit_parsnip(tidymodels_fit)$fit$centroids) #> class gene expression pooled_sd active prior #> 1 MS1a CXCL1 6.534490 0.8749133 5 0.1428571 #> 2 MS1a MMD 7.922508 0.6429620 5 0.1428571 #> 3 MS1a C9orf19 8.378910 0.7510552 5 0.1428571 #> 4 MS1a BNC1 5.297095 0.2106762 5 0.1428571 #> 5 MS1a SLFN11 7.362887 0.6824663 5 0.1428571 #> 6 MS1a CRAT 6.004517 0.3425669 5 0.1428571"},{"path":"https://kaiaragaki.github.io/reclanc/articles/case-study.html","id":"measuring-fit-accuracy-with-cross-validation","dir":"Articles","previous_headings":"Fitting","what":"Measuring fit accuracy with cross-validation","title":"case-study","text":"Now ’ve dialed tidymodels framework, can lot elaborate things ease. One concerns whether 5 active genes good choice (active = 5). somewhat simple way determine good choice 5 genes use cross-validation. Cross-validation allows us test good fit training model , say, 80% data, testing rest (see Wikipedia diagram k-fold cross validation). allows us get measure good fit , without break actual test data - general used ’re ready finalize model. Speaking test data, let’s go ahead split now. ’ll lock test data away use ’ve fit final model. , ’ll use cross validation assess good fit , essentially using training data testing data. course, tidymodels makes easy , using rsample::initial_split: train test just subsets original data, containing 80% 20% original data (respectively). also tries maintain relative proportions classes within datasets (set strata = class): Creating folds cross validation nearly initial_split: can reuse workflow wf, contains model formula. difference use fit_resamples, specify metric want use measure good fit (remember every fold chunk data uses test fit). simplicity, let’s use accuracy: can extract accuracy metrics using collect_metrics, roots around fits helpfully extracts metrics, aggregates , calculated standard error: model accuracy 74%. Applying model testing data: Note testing data accuracy (%) approximates training data accuracy (74%).","code":"set.seed(123) splits <- initial_split(wrangled, prop = 0.8, strata = class) train <- training(splits) test <- testing(splits) round(prop.table(table(train$class)), 2) #> #> MS1a MS1b MS2a.1 MS2a.2 MS2b.1 MS2b2.1 MS2b2.2 #> 0.17 0.25 0.10 0.18 0.15 0.07 0.08 round(prop.table(table(test$class)), 2) #> #> MS1a MS1b MS2a.1 MS2a.2 MS2b.1 MS2b2.1 MS2b2.2 #> 0.19 0.27 0.08 0.16 0.11 0.05 0.16 folds <- vfold_cv(train, v = 5, strata = class) folds #> # 5-fold cross-validation using stratification #> # A tibble: 5 × 2 #> splits id #> #> 1 Fold1 #> 2 Fold2 #> 3 Fold3 #> 4 Fold4 #> 5 Fold5 fits <- fit_resamples( wf, folds, metrics = metric_set(accuracy) ) #> 35/35 (100%) genes in centroids found in data #> 35/35 (100%) genes in centroids found in data #> 35/35 (100%) genes in centroids found in data #> 35/35 (100%) genes in centroids found in data #> 35/35 (100%) genes in centroids found in data fits #> # Resampling results #> # 5-fold cross-validation using stratification #> # A tibble: 5 × 4 #> splits id .metrics .notes #> #> 1 Fold1 #> 2 Fold2 #> 3 Fold3 #> 4 Fold4 #> 5 Fold5 metrics <- collect_metrics(fits) metrics #> # A tibble: 1 × 6 #> .metric .estimator mean n std_err .config #> #> 1 accuracy multiclass 0.737 5 0.0289 Preprocessor1_Model1 # Fit a model using *all* of our training data final_fit <- clanc(class ~ ., train, active = 5) # Use it to predict the (known) classes of our test data preds <- predict(final_fit, new_data = test, type = \"class\") #> 35/35 (100%) genes in centroids found in data w_preds <- cbind(preds, test) # Compare known class vs predicted class metric <- accuracy(w_preds, class, .pred_class) metric #> # A tibble: 1 × 3 #> .metric .estimator .estimate #> #> 1 accuracy multiclass 0.734"},{"path":"https://kaiaragaki.github.io/reclanc/articles/case-study.html","id":"tuning-hyperparameters-with-tune","dir":"Articles","previous_headings":"Fitting","what":"Tuning hyperparameters with tune","title":"case-study","text":"Now least measure good model fits, better genes? get away fewer? Running command different numbers drag - fortunately, ’s yet another beautiful package help us: tune. use tune, need re-specify model let tune know parameters want tune: update previous workflow using update_model, let’s just declare new one: specify range values active try: can fit folds using spread values chose: , can collect metrics - time, however, summary metrics values active: graphically: looks like read maximal accuracy around 21 genes - let’s choose 20 genes nice round number: looks like accuracy little better now ’ve chosen optimal number active genes.","code":"tune_mod <- discrim_linear() |> set_engine( engine = \"clanc\", active = tune() ) tune_wf <- workflow() |> add_recipe(recipe) |> add_model(tune_mod) values <- data.frame(active = seq(from = 1, to = 50, by = 4)) values #> active #> 1 1 #> 2 5 #> 3 9 #> 4 13 #> 5 17 #> 6 21 #> 7 25 #> 8 29 #> 9 33 #> 10 37 #> 11 41 #> 12 45 #> 13 49 # This is going to take some time, since we're fitting 5 folds 13 times each. tuned <- tune_grid( tune_wf, folds, metrics = metric_set(accuracy), grid = values ) tuned #> # Tuning results #> # 5-fold cross-validation using stratification #> # A tibble: 5 × 4 #> splits id .metrics .notes #> #> 1 Fold1 #> 2 Fold2 #> 3 Fold3 #> 4 Fold4 #> 5 Fold5 tuned_metrics <- collect_metrics(tuned) tuned_metrics #> # A tibble: 13 × 7 #> active .metric .estimator mean n std_err .config #> #> 1 1 accuracy multiclass 0.585 5 0.0368 Preprocessor1_Model01 #> 2 5 accuracy multiclass 0.737 5 0.0289 Preprocessor1_Model02 #> 3 9 accuracy multiclass 0.748 5 0.0496 Preprocessor1_Model03 #> 4 13 accuracy multiclass 0.781 5 0.0403 Preprocessor1_Model04 #> 5 17 accuracy multiclass 0.770 5 0.0280 Preprocessor1_Model05 #> 6 21 accuracy multiclass 0.774 5 0.0335 Preprocessor1_Model06 #> 7 25 accuracy multiclass 0.785 5 0.0378 Preprocessor1_Model07 #> 8 29 accuracy multiclass 0.794 5 0.0319 Preprocessor1_Model08 #> 9 33 accuracy multiclass 0.773 5 0.0281 Preprocessor1_Model09 #> 10 37 accuracy multiclass 0.790 5 0.0295 Preprocessor1_Model10 #> 11 41 accuracy multiclass 0.794 5 0.0339 Preprocessor1_Model11 #> 12 45 accuracy multiclass 0.815 5 0.0267 Preprocessor1_Model12 #> 13 49 accuracy multiclass 0.815 5 0.0277 Preprocessor1_Model13 ggplot(tuned_metrics, aes(active, mean)) + geom_line() + coord_cartesian(ylim = c(0, 1)) + labs(x = \"Number Active Genes\", y = \"Accuracy\") final_fit_tuned <- clanc(class ~ ., data = train, active = 20) # Use it to predict the (known) classes of our test data: preds <- predict(final_fit_tuned, new_data = test, type = \"class\") #> 140/140 (100%) genes in centroids found in data w_preds <- cbind(preds, test) # Compare known class vs predicted class: metric <- accuracy(w_preds, class, .pred_class) metric #> # A tibble: 1 × 3 #> .metric .estimator .estimate #> #> 1 accuracy multiclass 0.812"},{"path":"https://kaiaragaki.github.io/reclanc/articles/case-study.html","id":"predicting","dir":"Articles","previous_headings":"","what":"Predicting","title":"case-study","text":"Now want apply classifier new data. second dataset RNAseq data 30 bladder cancer cell lines: Predicting incredibly simple. Since ’re using different sequencing method (RNAseq vs array-based sequencing), probably makes sense use correlation based classification rather original distance-based metric used original ClaNC package. can specifying type = \"numeric\" whatever correlation method prefer. Sjödahl paper, seven subtypes simplified five subtypes merging two similar biological pathways activated. ease interpretation, can :","code":"library(cellebrate) cell_rna #> class: DESeqDataSet #> dim: 18548 30 #> metadata(1): version #> assays(2): counts rlog_norm_counts #> rownames(18548): TSPAN6 TNMD ... MT-ND5 MT-ND6 #> rowData names(0): #> colnames(30): 1A6 253JP ... UC7 UC9 #> colData names(5): cell bsl lum call clade cell_preds <- predict( final_fit_tuned, cell_rna, assay = 2, type = \"numeric\", method = \"spearman\" ) #> 118/140 (84%) genes in centroids found in data out <- cbind(colData(cell_rna), cell_preds) |> as_tibble() out #> # A tibble: 30 × 12 #> cell bsl lum call clade .pred_MS1a .pred_MS1b .pred_MS2a.1 #> #> 1 1A6 99.0 1.02 BSL Epithelial Other 0.0600 0.224 0.149 #> 2 253JP 76.6 23.4 BSL Unknown 0.0574 0.240 0.219 #> 3 5637 98.5 1.46 BSL Epithelial Other 0.0958 0.243 0.160 #> 4 BV 49.9 50.1 LUM Unknown 0.0758 0.262 0.238 #> 5 HT1197 56.0 44.0 BSL Epithelial Other 0.119 0.288 0.224 #> 6 HT1376 10.9 89.1 LUM Epithelial Other 0.100 0.277 0.238 #> 7 J82 98.1 1.91 BSL Mesenchymal 0.127 0.292 0.219 #> 8 RT112 0 100 LUM Luminal Papilla… 0.173 0.380 0.294 #> 9 RT4 0 100 LUM Luminal Papilla… 0.134 0.317 0.257 #> 10 RT4V6 0 100 LUM Luminal Papilla… 0.143 0.207 0.165 #> # ℹ 20 more rows #> # ℹ 4 more variables: .pred_MS2a.2 , .pred_MS2b.1 , #> # .pred_MS2b2.1 , .pred_MS2b2.2 plotting_data <- out |> pivot_longer(cols = starts_with(\".pred\")) plotting_data |> ggplot(aes(cell, value, color = name)) + geom_point() + facet_grid(~clade, scales = \"free_x\", space = \"free_x\") table <- plotting_data |> summarize(winner = name[which.max(value)], .by = c(cell, clade)) |> mutate( five = case_when( winner %in% c(\".pred_MS1a\", \".pred_MS1b\") ~ \"Urobasal A\", winner %in% c(\".pred_MS2a.1\", \".pred_MS2a.2\") ~ \"Genomically unstable\", winner == \".pred_MS2b.1\" ~ \"Infiltrated\", winner == \".pred_MS2b2.1\" ~ \"Uro-B\", winner == \".pred_MS2b2.2\" ~ \"SCC-like\" ) ) |> relocate(cell, five, clade) print(table, n = 30) #> # A tibble: 30 × 4 #> cell five clade winner #> #> 1 1A6 SCC-like Epithelial Other .pred_MS2b2.2 #> 2 253JP SCC-like Unknown .pred_MS2b2.2 #> 3 5637 SCC-like Epithelial Other .pred_MS2b2.2 #> 4 BV Urobasal A Unknown .pred_MS1b #> 5 HT1197 SCC-like Epithelial Other .pred_MS2b2.2 #> 6 HT1376 SCC-like Epithelial Other .pred_MS2b2.2 #> 7 J82 Urobasal A Mesenchymal .pred_MS1b #> 8 RT112 Urobasal A Luminal Papillary .pred_MS1b #> 9 RT4 Urobasal A Luminal Papillary .pred_MS1b #> 10 RT4V6 Urobasal A Luminal Papillary .pred_MS1b #> 11 SCaBER SCC-like Epithelial Other .pred_MS2b2.2 #> 12 SW780 Urobasal A Luminal Papillary .pred_MS1b #> 13 T24 SCC-like Mesenchymal .pred_MS2b2.2 #> 14 TCCSup SCC-like Mesenchymal .pred_MS2b2.2 #> 15 UC10 SCC-like Epithelial Other .pred_MS2b2.2 #> 16 UC11 SCC-like Mesenchymal .pred_MS2b2.2 #> 17 UC12 Urobasal A Mesenchymal .pred_MS1b #> 18 UC13 SCC-like Mesenchymal .pred_MS2b2.2 #> 19 UC14 Urobasal A Luminal Papillary .pred_MS1b #> 20 UC15 SCC-like Epithelial Other .pred_MS2b2.2 #> 21 UC16 SCC-like Epithelial Other .pred_MS2b2.2 #> 22 UC17 SCC-like Luminal Papillary .pred_MS2b2.2 #> 23 UC18 SCC-like Mesenchymal .pred_MS2b2.2 #> 24 UC1 Urobasal A Luminal Papillary .pred_MS1b #> 25 UC3 SCC-like Mesenchymal .pred_MS2b2.2 #> 26 UC4 Urobasal A Unknown .pred_MS1b #> 27 UC5 Urobasal A Luminal Papillary .pred_MS1b #> 28 UC6 Urobasal A Luminal Papillary .pred_MS1b #> 29 UC7 Urobasal A Epithelial Other .pred_MS1b #> 30 UC9 Genomically unstable Epithelial Other .pred_MS2a.1"},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"introduction","dir":"Articles","previous_headings":"","what":"Introduction","title":"using-reclanc","text":"vignette provide brief introduction basic usage reclanc. ’re interested reclanc works, ’d recommend reading blog post wrote original paper Alan Dabney, created original ClaNC algorithm.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"fitting","dir":"Articles","previous_headings":"","what":"Fitting","title":"using-reclanc","text":"create new centroids existing expression data, use clanc function. reclanc provides synthetic expression data can work : data include 12 samples, 6 class “” 6 class “B”, 100 genes. reclanc able ingest variety common formats expression data. Objects like SummarizedExperiments ExpressionSets frequently used bioinformatic analyses arrange data samples columns genes rows. conflict expected formula input base R, predictors (genes) outcomes (classes) columns. reclanc eases friction expecting input common format, abstracting away wrangling aspect analysis. , two broad categories input - ‘wide data’ ‘tall data’.","code":"library(reclanc) library(SummarizedExperiment) library(parsnip) lapply(synthetic_expression, head) #> $expression #> sample1 sample2 sample3 sample4 sample5 sample6 sample7 sample8 #> gene1 8.097529 7.119188 7.304400 7.554689 7.953206 7.714925 7.512700 8.597547 #> gene2 8.641837 9.400416 8.500865 8.878687 8.318438 8.728683 7.812591 7.638167 #> gene3 3.436236 4.317915 3.435193 3.515755 3.024976 4.762209 5.048956 2.006646 #> gene4 4.368008 5.212750 4.618249 4.201365 3.195294 4.707750 5.126769 6.178658 #> gene5 2.423974 3.563816 4.062362 2.163278 2.021435 2.813873 0.000000 4.652358 #> gene6 5.371205 5.919809 4.366915 4.805534 4.834856 5.622157 3.883531 3.593082 #> sample9 sample10 sample11 sample12 #> gene1 6.475641 7.648858 8.637526 7.345038 #> gene2 8.110285 7.906104 7.424728 7.927039 #> gene3 2.739211 3.111668 3.161077 4.306611 #> gene4 5.170265 4.259578 5.872855 6.159023 #> gene5 1.532242 3.399823 3.691250 1.932937 #> gene6 4.246205 4.637316 3.575837 2.730452 #> #> $classes #> [1] A A A A A A #> Levels: A B"},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"wide-inputs","dir":"Articles","previous_headings":"Fitting","what":"Wide inputs","title":"using-reclanc","text":"Wide inputs require data predictors outcomes columns, together, single data.frame.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"formula","dir":"Articles","previous_headings":"Fitting > Wide inputs","what":"Formula","title":"using-reclanc","text":"","code":"form_data <- cbind( class = synthetic_expression$classes, as.data.frame(t(synthetic_expression$expression)) ) head(form_data[1:5]) #> class gene1 gene2 gene3 gene4 #> sample1 A 8.097529 8.641837 3.436236 4.368008 #> sample2 A 7.119188 9.400416 4.317915 5.212750 #> sample3 A 7.304400 8.500865 3.435193 4.618249 #> sample4 A 7.554689 8.878687 3.515755 4.201365 #> sample5 A 7.953206 8.318438 3.024976 3.195294 #> sample6 A 7.714925 8.728683 4.762209 4.707750 clanc(class ~ ., form_data, active = 5) #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5"},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"recipe","dir":"Articles","previous_headings":"Fitting > Wide inputs","what":"recipe","title":"using-reclanc","text":"reclanc also supports tidymodels workflows:","code":"discrim_linear() |> set_engine(\"clanc\", active = 5) |> fit(class ~ ., data = form_data) #> parsnip model object #> #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5"},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"tall-inputs","dir":"Articles","previous_headings":"Fitting","what":"Tall inputs","title":"using-reclanc","text":"Tall inputs require genes rows samples columns","code":""},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"data-framematrix","dir":"Articles","previous_headings":"Fitting > Tall inputs","what":"data.frame/matrix","title":"using-reclanc","text":"often convenient supply data.frame, particularly data-munging done. data.frame matrix inputs require expression genes column names sample IDs rownames, well factor vector classes:","code":"clanc( synthetic_expression$expression, classes = synthetic_expression$classes, active = 5 ) #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5"},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"summarizedexperiment","dir":"Articles","previous_headings":"Fitting > Tall inputs","what":"SummarizedExperiment","title":"using-reclanc","text":"common formats expression SummarizedExperiments ExpressionSets: can specify name colData (pData ExpressionSets) column contains classes classes argument:","code":"se <- SummarizedExperiment( synthetic_expression$expression, colData = DataFrame(class = synthetic_expression$classes) ) se #> class: SummarizedExperiment #> dim: 100 12 #> metadata(0): #> assays(1): '' #> rownames(100): gene1 gene2 ... gene99 gene100 #> rowData names(0): #> colnames(12): sample1 sample2 ... sample11 sample12 #> colData names(1): class fit <- clanc( se, classes = \"class\", active = 20, assay = 1 # Index of assay - SummarizedExperiments only ) fit #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene11 2.2992343 1.2044848 20 0.5 #> 2 A gene2 8.7448209 0.3147537 20 0.5 #> 3 A gene13 8.9364621 0.3418472 20 0.5 #> 4 A gene20 2.1925558 1.3104010 20 0.5 #> 5 A gene10 4.9557850 0.8571716 20 0.5 #> 6 A gene21 6.5846813 0.5279636 20 0.5 #> 7 A gene100 5.6455200 0.6175104 20 0.5 #> 8 A gene22 6.1650079 0.4699756 20 0.5 #> 9 A gene46 6.7344030 0.8233370 20 0.5 #> 10 A gene24 4.3073008 0.7214700 20 0.5 #> 11 A gene15 2.4254020 1.1910158 20 0.5 #> 12 A gene25 5.0353875 0.7498139 20 0.5 #> 13 A gene17 2.9424148 0.6628466 20 0.5 #> 14 A gene4 4.3839026 0.7144711 20 0.5 #> 15 A gene56 6.3441126 0.4078736 20 0.5 #> 16 A gene41 4.3285163 0.6317005 20 0.5 #> 17 A gene57 4.2237139 0.9531773 20 0.5 #> 18 A gene7 5.5545202 0.7875124 20 0.5 #> 19 A gene58 5.6162919 0.8161951 20 0.5 #> 20 A gene12 7.5147181 0.4779155 20 0.5 #> 21 A gene6 5.1534126 0.6194184 20 0.5 #> 22 A gene51 6.6256136 0.7737520 20 0.5 #> 23 A gene60 4.7434923 1.2945446 20 0.5 #> 24 A gene52 3.7437977 0.5173769 20 0.5 #> 25 A gene63 8.9293980 0.5635262 20 0.5 #> 26 A gene53 4.3774614 0.8370528 20 0.5 #> 27 A gene66 7.0081742 0.5883218 20 0.5 #> 28 A gene83 3.6532038 0.8444393 20 0.5 #> 29 A gene67 6.1384613 0.3677756 20 0.5 #> 30 A gene85 5.2179679 0.5930857 20 0.5 #> 31 A gene88 4.6008044 1.0603007 20 0.5 #> 32 A gene70 1.3073340 1.1264747 20 0.5 #> 33 A gene47 9.4528373 0.2030726 20 0.5 #> 34 A gene90 0.9794695 1.3272423 20 0.5 #> 35 A gene74 4.0285071 0.4940783 20 0.5 #> 36 A gene94 7.7773183 0.5375914 20 0.5 #> 37 A gene78 2.1763395 1.6805560 20 0.5 #> 38 A gene95 6.8731844 0.4462475 20 0.5 #> 39 A gene79 3.7138831 1.0587367 20 0.5 #> 40 A gene98 4.5710407 0.6798799 20 0.5 #> 41 B gene10 4.2378889 0.8571716 20 0.5 #> 42 B gene2 8.2739866 0.3147537 20 0.5 #> 43 B gene100 5.0435040 0.6175104 20 0.5 #> 44 B gene20 3.4781598 1.3104010 20 0.5 #> 45 B gene46 7.0200767 0.8233370 20 0.5 #> 46 B gene11 1.2780748 1.2044848 20 0.5 #> 47 B gene12 8.0722841 0.4779155 20 0.5 #> 48 B gene22 6.4609169 0.4699756 20 0.5 #> 49 B gene51 5.8920005 0.7737520 20 0.5 #> 50 B gene13 9.9381374 0.3418472 20 0.5 #> 51 B gene15 1.6008569 1.1910158 20 0.5 #> 52 B gene25 4.5015558 0.7498139 20 0.5 #> 53 B gene17 2.5005839 0.6628466 20 0.5 #> 54 B gene4 4.9225469 0.7144711 20 0.5 #> 55 B gene56 6.1067832 0.4078736 20 0.5 #> 56 B gene41 5.5183538 0.6317005 20 0.5 #> 57 B gene57 3.1175271 0.9531773 20 0.5 #> 58 B gene7 5.3367575 0.7875124 20 0.5 #> 59 B gene21 5.7894231 0.5279636 20 0.5 #> 60 B gene47 9.5903798 0.2030726 20 0.5 #> 61 B gene6 4.4655748 0.6194184 20 0.5 #> 62 B gene74 3.2265977 0.4940783 20 0.5 #> 63 B gene24 3.3704670 0.7214700 20 0.5 #> 64 B gene52 2.4385792 0.5173769 20 0.5 #> 65 B gene63 8.3234317 0.5635262 20 0.5 #> 66 B gene53 3.8479638 0.8370528 20 0.5 #> 67 B gene66 7.8915875 0.5883218 20 0.5 #> 68 B gene83 4.2757218 0.8444393 20 0.5 #> 69 B gene67 6.0190764 0.3677756 20 0.5 #> 70 B gene85 5.8877225 0.5930857 20 0.5 #> 71 B gene79 4.1894417 1.0587367 20 0.5 #> 72 B gene58 4.7194615 0.8161951 20 0.5 #> 73 B gene88 5.5945405 1.0603007 20 0.5 #> 74 B gene70 1.5987845 1.1264747 20 0.5 #> 75 B gene90 1.4036889 1.3272423 20 0.5 #> 76 B gene60 5.2336968 1.2945446 20 0.5 #> 77 B gene78 1.6625207 1.6805560 20 0.5 #> 78 B gene95 6.2881728 0.4462475 20 0.5 #> 79 B gene98 4.1346296 0.6798799 20 0.5 #> 80 B gene94 8.4222554 0.5375914 20 0.5"},{"path":"https://kaiaragaki.github.io/reclanc/articles/using-reclanc.html","id":"predicting","dir":"Articles","previous_headings":"","what":"Predicting","title":"using-reclanc","text":"fit can used predict classes new samples new data. new data can come form matrix, data.frame, SummarizedExperiment, ExpressionSet, expected input Using type = \"class\" predict classes using metric provided Alan Dabney original ClaNC paper. However, particularly comparing across datasets may transformed differently, may accurate use correlation based metric:","code":"predict(fit, new_data = se, type = \"class\") #> 40/40 (100%) genes in centroids found in data #> # A tibble: 12 × 1 #> .pred_class #> #> 1 A #> 2 A #> 3 A #> 4 A #> 5 A #> 6 A #> 7 B #> 8 B #> 9 B #> 10 B #> 11 B #> 12 B predict(fit, new_data = se, type = \"numeric\", method = \"spearman\") #> 40/40 (100%) genes in centroids found in data #> # A tibble: 12 × 2 #> .pred_A .pred_B #> #> 1 0.901 0.811 #> 2 0.929 0.849 #> 3 0.932 0.840 #> 4 0.912 0.829 #> 5 0.862 0.770 #> 6 0.932 0.869 #> 7 0.776 0.904 #> 8 0.824 0.931 #> 9 0.828 0.924 #> 10 0.855 0.946 #> 11 0.805 0.915 #> 12 0.750 0.869"},{"path":"https://kaiaragaki.github.io/reclanc/authors.html","id":null,"dir":"","previous_headings":"","what":"Authors","title":"Authors and Citation","text":"Kai Aragaki. Author, maintainer. Alan Dabney. Author, copyright holder. Original creator ClaNC","code":""},{"path":"https://kaiaragaki.github.io/reclanc/authors.html","id":"citation","dir":"","previous_headings":"","what":"Citation","title":"Authors and Citation","text":"Alan D (2005). “Classification microarrays nearest centroids.” Bioinformatics, 21(22), 4148-4154. doi:10.1093/bioinformatics/bti681.","code":"@Article{, title = {Classification of microarrays to nearest centroids}, author = {Dabney Alan}, journal = {Bioinformatics}, year = {2005}, volume = {21}, number = {22}, pages = {4148-4154}, doi = {10.1093/bioinformatics/bti681}, }"},{"path":"https://kaiaragaki.github.io/reclanc/index.html","id":"reclanc","dir":"","previous_headings":"","what":"A Revival of the ClaNC Algorithm","title":"A Revival of the ClaNC Algorithm","text":"reclanc revival ClaNC (Classification microarrays nearest centroids), Alan R. Dabney. Since source lost (least knowledge), code comes heavy modification. reclanc nearest-centroid classifier expression data. tends little sensitive accurate similar models like PAM. Besides mere existence, reclanc differs slightly original ClaNC package ways: reclanc supports wide variety inputs (data.frame, matrix, formula, recipe, ExpressionSet, SummarizedExperiment) reclanc plays nicely tidymodels, offloads things like making folds rsample tuning tune (see vignette leverage tidymodels reclanc). Provides prediction method based correlation, rather distance - useful predicting classes data different sequencing modalities","code":""},{"path":"https://kaiaragaki.github.io/reclanc/index.html","id":"installation","dir":"","previous_headings":"","what":"Installation","title":"A Revival of the ClaNC Algorithm","text":"can install development version reclanc like :","code":"# install.packages(\"pak\") pak::pak(\"KaiAragaki/reclanc\")"},{"path":"https://kaiaragaki.github.io/reclanc/index.html","id":"how-to-use-it","dir":"","previous_headings":"","what":"How to use it","title":"A Revival of the ClaNC Algorithm","text":"information basic usage, see vignette. case study, well optimize active parameter, see vignette.","code":"library(reclanc) lapply(synthetic_expression, head) # dummy data #> $expression #> sample1 sample2 sample3 sample4 sample5 sample6 sample7 sample8 #> gene1 8.097529 7.119188 7.304400 7.554689 7.953206 7.714925 7.512700 8.597547 #> gene2 8.641837 9.400416 8.500865 8.878687 8.318438 8.728683 7.812591 7.638167 #> gene3 3.436236 4.317915 3.435193 3.515755 3.024976 4.762209 5.048956 2.006646 #> gene4 4.368008 5.212750 4.618249 4.201365 3.195294 4.707750 5.126769 6.178658 #> gene5 2.423974 3.563816 4.062362 2.163278 2.021435 2.813873 0.000000 4.652358 #> gene6 5.371205 5.919809 4.366915 4.805534 4.834856 5.622157 3.883531 3.593082 #> sample9 sample10 sample11 sample12 #> gene1 6.475641 7.648858 8.637526 7.345038 #> gene2 8.110285 7.906104 7.424728 7.927039 #> gene3 2.739211 3.111668 3.161077 4.306611 #> gene4 5.170265 4.259578 5.872855 6.159023 #> gene5 1.532242 3.399823 3.691250 1.932937 #> gene6 4.246205 4.637316 3.575837 2.730452 #> #> $classes #> [1] A A A A A A #> Levels: A B centroids <- clanc( synthetic_expression$expression, classes = synthetic_expression$classes, active = 5 ) centroids #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene13 8.936462 0.3418472 5 0.5 #> 2 A gene21 7.379940 0.5279636 5 0.5 #> 3 A gene2 8.744821 0.3147537 5 0.5 #> 4 A gene74 4.028507 0.4940783 5 0.5 #> 5 A gene41 4.328516 0.6317005 5 0.5 #> 6 A gene66 6.124761 0.5883218 5 0.5 #> 7 A gene24 4.307301 0.7214700 5 0.5 #> 8 A gene95 6.288173 0.4462475 5 0.5 #> 9 A gene94 7.777318 0.5375914 5 0.5 #> 10 A gene52 3.743798 0.5173769 5 0.5 #> 11 B gene13 9.938137 0.3418472 5 0.5 #> 12 B gene2 8.273987 0.3147537 5 0.5 #> 13 B gene21 6.584681 0.5279636 5 0.5 #> 14 B gene41 5.518354 0.6317005 5 0.5 #> 15 B gene74 3.226598 0.4940783 5 0.5 #> 16 B gene24 3.370467 0.7214700 5 0.5 #> 17 B gene66 7.008174 0.5883218 5 0.5 #> 18 B gene94 8.422255 0.5375914 5 0.5 #> 19 B gene95 5.703161 0.4462475 5 0.5 #> 20 B gene52 2.438579 0.5173769 5 0.5"},{"path":"https://kaiaragaki.github.io/reclanc/index.html","id":"how-it-works","dir":"","previous_headings":"","what":"How it works","title":"A Revival of the ClaNC Algorithm","text":"can find gentle introduction reclanc works -depth statistically rigorous definition algorithm works original paper.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/index.html","id":"citation","dir":"","previous_headings":"","what":"Citation","title":"A Revival of the ClaNC Algorithm","text":"Citation original ClaNC paper: Alan R. Dabney, Classification microarrays nearest centroids, Bioinformatics, Volume 21, Issue 22, November 2005, Pages 4148–4154, https://doi.org/10.1093/bioinformatics/bti681","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/clanc.html","id":null,"dir":"Reference","previous_headings":"","what":"Calculate centroids from expression data with ClaNC — clanc","title":"Calculate centroids from expression data with ClaNC — clanc","text":"Calculate centroids expression data ClaNC","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/clanc.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Calculate centroids from expression data with ClaNC — clanc","text":"","code":"clanc(x, ...) # Default S3 method clanc(x, ...) # S3 method for class 'data.frame' clanc(x, classes, active, priors = \"equal\", ...) # S3 method for class 'matrix' clanc(x, classes, active, priors = \"equal\", ...) # S3 method for class 'SummarizedExperiment' clanc(x, classes, active, priors = \"equal\", assay = 1, ...) # S3 method for class 'ExpressionSet' clanc(x, classes, active, priors = \"equal\", ...) # S3 method for class 'formula' clanc(formula, data, active, priors = \"equal\", ...) # S3 method for class 'recipe' clanc(x, data, active, priors = \"equal\", ...)"},{"path":"https://kaiaragaki.github.io/reclanc/reference/clanc.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Calculate centroids from expression data with ClaNC — clanc","text":"x Depending context: data frame expression. matrix expression. recipe specifying set preprocessing steps created recipes::recipe(). ExpressionSet. SummarizedExperiment assay containing expression. Expression library-size corrected, scaled. supplying data frame, matrix, ExpressionSet, SummarizedExperiment, rows represent genes, columns represent samples (standard expression data). column names sample IDs, row names gene IDs. recipe provided, data genes columns (match formula provided recipe.) ... currently used, required extensibility. classes x data frame matrix, class contains class labels form either: data frame 1 factor column factor vector. x ExpressionSet SummarizedExperiment, class name column pData(x) colData(x) contains classes factor. active Either single number numeric vector equal length number unique class labels. Represents number class-specific genes selected centroid. Note different numbers genes can selected class. See details. x ExpressionSet SummarizedExperiment, active can additionally name column pData(x) colData(x) contains numeric vector priors Can take variety values: \"equal\" - class equal prior \"class\" - class prior equal frequency training set numeric vector length equal number classes x ExpressionSet SummarizedExperiment, active can additionally name column pData(x) colData(x) contains numeric vector assay SummarizedExperiment used, index name assay formula formula specifying classes left-hand side, predictor terms right-hand side. data recipe formula used, data specified : data frame containing expression classes, columns genes class, rows samples.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/clanc.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Calculate centroids from expression data with ClaNC — clanc","text":"clanc object.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/clanc.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"Calculate centroids from expression data with ClaNC — clanc","text":"original description ClaNC can found active sets number class-specific genes, centroid number genes. explain way example, active = 5 3 classes, centroid 15 genes, 5 genes particular given class. genes 'active' class, values mean class. genes active given class, values overall expression given gene across classes.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/clanc.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Calculate centroids from expression data with ClaNC — clanc","text":"","code":"expression_matrix <- synthetic_expression$expression head(expression_matrix) #> sample1 sample2 sample3 sample4 sample5 sample6 sample7 sample8 #> gene1 8.097529 7.119188 7.304400 7.554689 7.953206 7.714925 7.512700 8.597547 #> gene2 8.641837 9.400416 8.500865 8.878687 8.318438 8.728683 7.812591 7.638167 #> gene3 3.436236 4.317915 3.435193 3.515755 3.024976 4.762209 5.048956 2.006646 #> gene4 4.368008 5.212750 4.618249 4.201365 3.195294 4.707750 5.126769 6.178658 #> gene5 2.423974 3.563816 4.062362 2.163278 2.021435 2.813873 0.000000 4.652358 #> gene6 5.371205 5.919809 4.366915 4.805534 4.834856 5.622157 3.883531 3.593082 #> sample9 sample10 sample11 sample12 #> gene1 6.475641 7.648858 8.637526 7.345038 #> gene2 8.110285 7.906104 7.424728 7.927039 #> gene3 2.739211 3.111668 3.161077 4.306611 #> gene4 5.170265 4.259578 5.872855 6.159023 #> gene5 1.532242 3.399823 3.691250 1.932937 #> gene6 4.246205 4.637316 3.575837 2.730452 classes <- synthetic_expression$classes classes #> [1] A A A A A A B B B B B B #> Levels: A B # data.frame/tibble/matrix interface: clanc(expression_matrix, classes = classes, active = 5, priors = \"equal\") #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5 #> # Formula interface: # Data must have class included as a column # Genes must be *columns* and samples must be *rows* # Hence the data transposition. for_formula <- data.frame(class = classes, t(expression_matrix)) clanc(class ~ ., for_formula, active = 5, priors = \"equal\") #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5 #> # Recipes interface: rec <- recipes::recipe(class ~ ., data = for_formula) clanc(rec, for_formula, active = 5, priors = \"equal\") #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5 #> # SummarizedExperiment interface: se <- SummarizedExperiment::SummarizedExperiment( expression_matrix, colData = data.frame( class = classes, active = 5, prior = c(0.5, 0.5) ) ) clanc(se, classes = \"class\", active = \"active\", priors = \"equal\") #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5 #> # ExpressionSet interface: adf <- data.frame( row.names = colnames(expression_matrix), class = classes ) |> Biobase::AnnotatedDataFrame() es <- Biobase::ExpressionSet(expression_matrix, adf) clanc(es, classes = \"class\", active = 5, priors = 0.5) #> #> $centroids #> class gene expression pooled_sd active prior #> 1 A gene12 7.514718 0.4779155 5 0.5 #> 2 A gene2 8.744821 0.3147537 5 0.5 #> 3 A gene13 8.936462 0.3418472 5 0.5 #> 4 A gene21 6.584681 0.5279636 5 0.5 #> 5 A gene24 4.307301 0.7214700 5 0.5 #> 6 A gene74 4.028507 0.4940783 5 0.5 #> 7 A gene41 4.328516 0.6317005 5 0.5 #> 8 A gene95 6.873184 0.4462475 5 0.5 #> 9 A gene52 3.743798 0.5173769 5 0.5 #> 10 A gene66 7.008174 0.5883218 5 0.5 #> 11 B gene12 8.072284 0.4779155 5 0.5 #> 12 B gene13 9.938137 0.3418472 5 0.5 #> 13 B gene2 8.273987 0.3147537 5 0.5 #> 14 B gene24 3.370467 0.7214700 5 0.5 #> 15 B gene21 5.789423 0.5279636 5 0.5 #> 16 B gene41 5.518354 0.6317005 5 0.5 #> 17 B gene74 3.226598 0.4940783 5 0.5 #> 18 B gene52 2.438579 0.5173769 5 0.5 #> 19 B gene95 6.288173 0.4462475 5 0.5 #> 20 B gene66 7.891588 0.5883218 5 0.5 #>"},{"path":"https://kaiaragaki.github.io/reclanc/reference/predict.clanc.html","id":null,"dir":"Reference","previous_headings":"","what":"Predict from a clanc — predict.clanc","title":"Predict from a clanc — predict.clanc","text":"Predict clanc","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/predict.clanc.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Predict from a clanc — predict.clanc","text":"","code":"# S3 method for class 'clanc' predict(object, new_data, type, assay = NULL, format = c(\"wide\", \"tall\"), ...)"},{"path":"https://kaiaragaki.github.io/reclanc/reference/predict.clanc.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Predict from a clanc — predict.clanc","text":"object clanc object. new_data data frame matrix new predictors. type single character. type predictions generate. Valid options : \"numeric\" numeric predictions. assay object inherits SummarizedExperiment, index assay. format Character. data \"wide\" (default), genes columns, \"tall\", genes rows? ... used, required extensibility. method type numeric, method correlation","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/predict.clanc.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Predict from a clanc — predict.clanc","text":"tibble predictions. number rows tibble guaranteed number rows new_data.","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/synthetic_expression.html","id":null,"dir":"Reference","previous_headings":"","what":"Synthetic Expression of Two Distinct Classes — synthetic_expression","title":"Synthetic Expression of Two Distinct Classes — synthetic_expression","text":"Synthetic Expression Two Distinct Classes","code":""},{"path":"https://kaiaragaki.github.io/reclanc/reference/synthetic_expression.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Synthetic Expression of Two Distinct Classes — synthetic_expression","text":"","code":"synthetic_expression"},{"path":[]},{"path":"https://kaiaragaki.github.io/reclanc/reference/synthetic_expression.html","id":"synthetic-expression","dir":"Reference","previous_headings":"","what":"synthetic_expression","title":"Synthetic Expression of Two Distinct Classes — synthetic_expression","text":"list containing two items: expression Normalized log expression 12 samples across 100 genes classes factor vector classes 12 samples","code":""}]