diff --git a/CHANGELOG.md b/CHANGELOG.md index d29cc1e..9683b0b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Scaden Changelog +### Version 1.0.2 + +* General improvement of logging using the 'rich' library for colorized output +* Added verification check for '--train_datasets' parameter to notify user of + unavailable datasets + ### Version 1.0.1 * Made identification of datasets more robust to fix issue [#66](https://github.com/KevinMenden/scaden/issues/66) diff --git a/README.md b/README.md index 7e443cd..e1b0f11 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ ![Scaden](docs/img/scaden_logo.png) -![Scaden version](https://img.shields.io/badge/Scaden-v1.0.0-cyan) +![Scaden version](https://img.shields.io/badge/scaden-v1.0.2-cyan) + ![MIT](https://img.shields.io/badge/License-MIT-black) ![Install with pip](https://img.shields.io/badge/Install%20with-pip-blue) ![Install with Bioconda](https://img.shields.io/badge/Install%20with-conda-green) diff --git a/docs/changelog.md b/docs/changelog.md index 1ad49f1..9683b0b 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,11 @@ # Scaden Changelog +### Version 1.0.2 + +* General improvement of logging using the 'rich' library for colorized output +* Added verification check for '--train_datasets' parameter to notify user of + unavailable datasets + ### Version 1.0.1 * Made identification of datasets more robust to fix issue [#66](https://github.com/KevinMenden/scaden/issues/66) @@ -13,7 +19,6 @@ ### Version 0.9.6 - + fixed Dockerfile (switched to pip installation) + added better error messages to `simulate` command + cleaned up dependencies @@ -51,5 +56,4 @@ Commands: * `scaden process`: Process a training dataset for training * `scaden train`: Train a Scaden model -* `scaden predict`: Predict cell type compositions of a given sample - +* `scaden predict`: Predict cell type compositions of a given sample \ No newline at end of file diff --git a/scaden/__main__.py b/scaden/__main__.py index 580d27b..4200625 100644 --- a/scaden/__main__.py +++ b/scaden/__main__.py @@ -1,5 +1,8 @@ import click +import sys import scaden +import rich +import rich.logging import logging import os from scaden.train import training @@ -7,6 +10,7 @@ from scaden.process import processing from scaden.simulate import simulation from scaden.example import exampleData + """ author: Kevin Menden @@ -17,7 +21,16 @@ # Logging logger = logging.getLogger() logger.setLevel(logging.INFO) -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0' +logger.addHandler( + rich.logging.RichHandler( + level=logging.INFO, + console=rich.console.Console(file=sys.stderr), + show_time=False, + markup=True, + ) +) + +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" def main(): @@ -29,11 +42,11 @@ def main(): |____/ \___\__,_|\__,_|\___|_| |_| """ - click.echo(click.style(text, fg='blue')) + click.echo(click.style(text, fg="blue")) cli() -if __name__ == '__main__': +if __name__ == "__main__": main() """ Set up the command line client with different commands to execute @@ -52,34 +65,36 @@ def cli(): @cli.command() -@click.argument('data_path', - type=click.Path(exists=True), - required=True, - metavar='') +@click.argument( + "data_path", type=click.Path(exists=True), required=True, metavar="" +) +@click.option( + "--train_datasets", + default="", + help="Comma-separated list of datasets used for training. 
Uses all by default.", +) +@click.option("--model_dir", default="./", help="Path to store the model in") +@click.option( + "--batch_size", default=128, help="Batch size to use for training. Default: 128" +) @click.option( - '--train_datasets', - default='', - help= - 'Comma-separated list of datasets used for training. Uses all by default.') -@click.option('--model_dir', default="./", help='Path to store the model in') -@click.option('--batch_size', - default=128, - help='Batch size to use for training. Default: 128') -@click.option('--learning_rate', - default=0.0001, - help='Learning rate used for training. Default: 0.0001') -@click.option('--steps', default=5000, help='Number of training steps') -@click.option('--seed', default=0, help="Set random seed") -def train(data_path, train_datasets, model_dir, batch_size, learning_rate, - steps, seed): + "--learning_rate", + default=0.0001, + help="Learning rate used for training. Default: 0.0001", +) +@click.option("--steps", default=5000, help="Number of training steps") +@click.option("--seed", default=0, help="Set random seed") +def train(data_path, train_datasets, model_dir, batch_size, learning_rate, steps, seed): """ Train a Scaden model """ - training(data_path=data_path, - train_datasets=train_datasets, - model_dir=model_dir, - batch_size=batch_size, - learning_rate=learning_rate, - num_steps=steps, - seed=seed) + training( + data_path=data_path, + train_datasets=train_datasets, + model_dir=model_dir, + batch_size=batch_size, + learning_rate=learning_rate, + num_steps=steps, + seed=seed, + ) """ @@ -88,21 +103,20 @@ def train(data_path, train_datasets, model_dir, batch_size, learning_rate, @cli.command() -@click.argument('data_path', - type=click.Path(exists=True), - required=True, - metavar='') -@click.option('--model_dir', default="./", help='Path to trained model') -@click.option('--outname', - default="scaden_predictions.txt", - help='Name of predictions file.') -@click.option('--seed', default=0, help="Set random seed") +@click.argument( + "data_path", + type=click.Path(exists=True), + required=True, + metavar="", +) +@click.option("--model_dir", default="./", help="Path to trained model") +@click.option( + "--outname", default="scaden_predictions.txt", help="Name of predictions file." +) +@click.option("--seed", default=0, help="Set random seed") def predict(data_path, model_dir, outname, seed): """ Predict cell type composition using a trained Scaden model""" - prediction(model_dir=model_dir, - data_path=data_path, - out_name=outname, - seed=seed) + prediction(model_dir=model_dir, data_path=data_path, out_name=outname, seed=seed) """ @@ -111,29 +125,37 @@ def predict(data_path, model_dir, outname, seed): @cli.command() -@click.argument('data_path', - type=click.Path(exists=True), - required=True, - metavar='') -@click.argument('prediction_data', - type=click.Path(exists=True), - required=True, - metavar='') -@click.option('--processed_path', - default="processed.h5ad", - help='Path of processed file. Must end with .h5ad') +@click.argument( + "data_path", + type=click.Path(exists=True), + required=True, + metavar="", +) +@click.argument( + "prediction_data", + type=click.Path(exists=True), + required=True, + metavar="", +) @click.option( - '--var_cutoff', + "--processed_path", + default="processed.h5ad", + help="Path of processed file. Must end with .h5ad", +) +@click.option( + "--var_cutoff", default=0.1, - help= - 'Filter out genes with a variance less than the specified cutoff. 
A low cutoff is recommended,' - 'this should only remove genes that are obviously uninformative.') + help="Filter out genes with a variance less than the specified cutoff. A low cutoff is recommended," + "this should only remove genes that are obviously uninformative.", +) def process(data_path, prediction_data, processed_path, var_cutoff): """ Process a dataset for training """ - processing(data_path=prediction_data, - training_data=data_path, - processed_path=processed_path, - var_cutoff=var_cutoff) + processing( + data_path=prediction_data, + training_data=data_path, + processed_path=processed_path, + var_cutoff=var_cutoff, + ) """ @@ -142,44 +164,46 @@ def process(data_path, prediction_data, processed_path, var_cutoff): @cli.command() -@click.option('--out', - '-o', - default='./', - help="Directory to store output files in") -@click.option('--data', '-d', default='.', help="Path to scRNA-seq dataset(s)") -@click.option('--cells', - '-c', - default=100, - help="Number of cells per sample [default: 100]") -@click.option('--n_samples', - '-n', - default=1000, - help="Number of samples to simulate [default: 1000]") +@click.option("--out", "-o", default="./", help="Directory to store output files in") +@click.option("--data", "-d", default=".", help="Path to scRNA-seq dataset(s)") +@click.option( + "--cells", "-c", default=100, help="Number of cells per sample [default: 100]" +) +@click.option( + "--n_samples", + "-n", + default=1000, + help="Number of samples to simulate [default: 1000]", +) @click.option( - '--pattern', + "--pattern", default="*_counts.txt", - help="File pattern to recognize your processed scRNA-seq count files") + help="File pattern to recognize your processed scRNA-seq count files", +) @click.option( - '--unknown', - '-u', + "--unknown", + "-u", multiple=True, - default=['unknown'], - help= - "Specifiy cell types to merge into the unknown category. Specify this flag for every cell type you want to merge in unknown. [default: unknown]" -) -@click.option('--prefix', - '-p', - default="data", - help="Prefix to append to training .h5ad file [default: data]") + default=["unknown"], + help="Specifiy cell types to merge into the unknown category. Specify this flag for every cell type you want to merge in unknown. 
[default: unknown]", +) +@click.option( + "--prefix", + "-p", + default="data", + help="Prefix to append to training .h5ad file [default: data]", +) def simulate(out, data, cells, n_samples, pattern, unknown, prefix): """ Create artificial bulk RNA-seq data from scRNA-seq dataset(s)""" - simulation(simulate_dir=out, - data_dir=data, - sample_size=cells, - num_samples=n_samples, - pattern=pattern, - unknown_celltypes=unknown, - out_prefix=prefix) + simulation( + simulate_dir=out, + data_dir=data, + sample_size=cells, + num_samples=n_samples, + pattern=pattern, + unknown_celltypes=unknown, + out_prefix=prefix, + ) """ @@ -188,25 +212,12 @@ def simulate(out, data, cells, n_samples, pattern, unknown, prefix): @cli.command() -@click.option('--out', - '-o', - default='./', - help="Directory to store output files in") -@click.option('--cells', - '-c', - default=10, - help="Number of cells [default: 10]") -@click.option('--genes', - '-g', - default=100, - help="Number of genes [default: 100]") -@click.option('--out', - '-o', - default="./", - help="Output directory [default: ./]") -@click.option('--samples', - '-n', - default=10, - help="Number of bulk samples [default: 10]") +@click.option("--out", "-o", default="./", help="Directory to store output files in") +@click.option("--cells", "-c", default=10, help="Number of cells [default: 10]") +@click.option("--genes", "-g", default=100, help="Number of genes [default: 100]") +@click.option("--out", "-o", default="./", help="Output directory [default: ./]") +@click.option( + "--samples", "-n", default=10, help="Number of bulk samples [default: 10]" +) def example(cells, genes, samples, out): exampleData(n_cells=cells, n_genes=genes, n_samples=samples, out_dir=out) diff --git a/scaden/example.py b/scaden/example.py index 311773a..9e44bd5 100644 --- a/scaden/example.py +++ b/scaden/example.py @@ -22,30 +22,28 @@ def exampleData(n_cells=10, n_genes=100, n_samples=10, out_dir="./"): # Generate example scRNA-seq data counts = np.random.randint(low=0, high=1000, size=(n_cells, n_genes)) - gene_names = ['gene'] * n_genes + gene_names = ["gene"] * n_genes for i in range(len(gene_names)): gene_names[i] = gene_names[i] + str(i) df = pd.DataFrame(counts, columns=gene_names) # Generate example celltype labels - celltypes = ['celltype'] * np.random.randint(low=2, high=n_cells - 1) + celltypes = ["celltype"] * np.random.randint(low=2, high=n_cells - 1) for i in range(len(celltypes)): celltypes[i] = celltypes[i] + str(i) celltype_list = random.choices(celltypes, k=n_cells) - ct_df = pd.DataFrame(celltype_list, columns=['Celltype']) + ct_df = pd.DataFrame(celltype_list, columns=["Celltype"]) # Generate example bulk RNA-seq data bulk = np.random.randint(low=0, high=1000, size=(n_genes, n_samples)) - samples = ['sample'] * n_samples + samples = ["sample"] * n_samples for i in range(len(samples)): samples[i] = samples[i] + str(i) bulk_df = pd.DataFrame(bulk, columns=samples, index=gene_names) # Save the data df.to_csv(os.path.join(out_dir, "example_counts.txt"), sep="\t") - ct_df.to_csv(os.path.join(out_dir, "example_celltypes.txt"), - sep="\t", - index=False) + ct_df.to_csv(os.path.join(out_dir, "example_celltypes.txt"), sep="\t", index=False) bulk_df.to_csv(os.path.join(out_dir, "example_bulk_data.txt"), sep="\t") - logger.warn(f"Example data has been created in {out_dir}") + logger.info(f"Example data has been created in [cyan]{out_dir}") diff --git a/scaden/model/functions.py b/scaden/model/functions.py index fd0b2de..907f7fc 100644 --- a/scaden/model/functions.py 
+++ b/scaden/model/functions.py @@ -1,6 +1,7 @@ """ Functions used for the scaden model """ +import logging import collections from anndata import read_h5ad import numpy as np @@ -8,7 +9,7 @@ from sklearn import preprocessing as pp import pandas as pd - +logger = logging.getLogger(__name__) def dummy_labels(m, labels): @@ -21,6 +22,7 @@ def dummy_labels(m, labels): n_l = len(labels) return np.zeros((m, n_l), dtype="float32") + def sample_scaling(x, scaling_option): """ Apply scaling of data @@ -42,8 +44,9 @@ def sample_scaling(x, scaling_option): return x - -def preprocess_h5ad_data(raw_input_path, processed_path, scaling_option="log_min_max", sig_genes=None): +def preprocess_h5ad_data( + raw_input_path, processed_path, scaling_option="log_min_max", sig_genes=None +): """ Preprocess raw input data for the model :param raw_input_path: @@ -52,21 +55,21 @@ def preprocess_h5ad_data(raw_input_path, processed_path, scaling_option="log_min :param signature_genes: :return: """ - print("Pre-processing raw data ...") + logger.info("Pre-processing raw data ...") raw_input = read_h5ad(raw_input_path) - print("Subsetting genes ...") + logger.info("Subsetting genes ...") # Select features go use raw_input = raw_input[:, sig_genes] - print("Scaling using " + str(scaling_option)) + logger.info("Scaling using " + str(scaling_option)) # Scaling raw_input.X = sample_scaling(raw_input.X, scaling_option) - print("Writing to disk ...") - # Write processed data to disk + logger.info("Writing to disk ...") raw_input.write(processed_path) - print("Data pre-processing done.") + logger.info("Data pre-processing done.") + logger.info(f"Created processed file: [cyan]{processed_path}[/]") def get_signature_genes(input_path, sig_genes_complete, var_cutoff=0.1): @@ -82,9 +85,5 @@ def get_signature_genes(input_path, sig_genes_complete, var_cutoff=0.1): available_genes = list(data.index) new_sig_genes = list(set(available_genes).intersection(sig_genes_complete)) n_sig_genes = len(new_sig_genes) - print(f"Found {n_sig_genes} common genes.") + logger.info(f"Found [cyan]{n_sig_genes}[/cyan] common genes.") return new_sig_genes - - - - diff --git a/scaden/model/scaden.py b/scaden/model/scaden.py index cbdffe4..03b1d4c 100644 --- a/scaden/model/scaden.py +++ b/scaden/model/scaden.py @@ -20,15 +20,18 @@ class Scaden(object): """ scaden class """ - def __init__(self, - model_dir, - model_name, - batch_size=128, - learning_rate=0.0001, - num_steps=1000, - seed=0, - hidden_units=[256, 128, 64, 32], - do_rates=[0, 0, 0, 0]): + + def __init__( + self, + model_dir, + model_name, + batch_size=128, + learning_rate=0.0001, + num_steps=1000, + seed=0, + hidden_units=[256, 128, 64, 32], + do_rates=[0, 0, 0, 0], + ): self.model_dir = model_dir self.batch_size = batch_size @@ -50,24 +53,20 @@ def __init__(self, # Set seeds for reproducibility tf.random.set_seed(seed) - os.environ['TF_DETERMINISTIC_OPS'] = '1' + os.environ["TF_DETERMINISTIC_OPS"] = "1" np.random.seed(seed) def scaden_model(self, n_classes): """Create the Scaden model""" model = tf.keras.Sequential() - model.add( - tf.keras.layers.Dense(self.hidden_units[0], activation=tf.nn.relu)) + model.add(tf.keras.layers.Dense(self.hidden_units[0], activation=tf.nn.relu)) model.add(tf.keras.layers.Dropout(self.do_rates[0])) - model.add( - tf.keras.layers.Dense(self.hidden_units[1], activation=tf.nn.relu)) + model.add(tf.keras.layers.Dense(self.hidden_units[1], activation=tf.nn.relu)) model.add(tf.keras.layers.Dropout(self.do_rates[1])) - model.add( - 
tf.keras.layers.Dense(self.hidden_units[2], activation=tf.nn.relu)) + model.add(tf.keras.layers.Dense(self.hidden_units[2], activation=tf.nn.relu)) model.add(tf.keras.layers.Dropout(self.do_rates[2])) - model.add( - tf.keras.layers.Dense(self.hidden_units[3], activation=tf.nn.relu)) + model.add(tf.keras.layers.Dense(self.hidden_units[3], activation=tf.nn.relu)) model.add(tf.keras.layers.Dropout(self.do_rates[3])) model.add(tf.keras.layers.Dense(n_classes, activation=tf.nn.softmax)) @@ -91,7 +90,8 @@ def compute_accuracy(self, logits, targets, pct_cut=0.05): :return: """ equality = tf.less_equal( - tf.math.abs(tf.math.subtract(logits, targets)), pct_cut) + tf.math.abs(tf.math.subtract(logits, targets)), pct_cut + ) accuracy = tf.reduce_mean(input_tensor=tf.cast(equality, tf.float32)) return accuracy @@ -107,8 +107,11 @@ def correlation_coefficient(self, logits, targets): xm, ym = logits - mx, targets - my r_num = tf.reduce_sum(input_tensor=tf.multiply(xm, ym)) r_den = tf.sqrt( - tf.multiply(tf.reduce_sum(input_tensor=tf.square(xm)), - tf.reduce_sum(input_tensor=tf.square(ym)))) + tf.multiply( + tf.reduce_sum(input_tensor=tf.square(xm)), + tf.reduce_sum(input_tensor=tf.square(ym)), + ) + ) r = tf.divide(r_num, r_den) r = tf.maximum(tf.minimum(r, 1.0), -1.0) return r @@ -127,38 +130,44 @@ def visualization(self, logits, targets, classes): for i in range(logits.shape[1]): eval_metrics[ - "mre_" + - str(classes[i])] = tf.compat.v1.metrics.mean_relative_error( - targets[:, i], logits[:, i], targets[:, i])[0] + "mre_" + str(classes[i]) + ] = tf.compat.v1.metrics.mean_relative_error( + targets[:, i], logits[:, i], targets[:, i] + )[ + 0 + ] eval_metrics[ - "mae_" + - str(classes[i])] = tf.compat.v1.metrics.mean_absolute_error( - targets[:, i], logits[:, i], targets[:, i])[0] - eval_metrics["pcor_" + - str(classes[i])] = self.correlation_coefficient( - targets[:, i], logits[:, i]) + "mae_" + str(classes[i]) + ] = tf.compat.v1.metrics.mean_absolute_error( + targets[:, i], logits[:, i], targets[:, i] + )[ + 0 + ] + eval_metrics["pcor_" + str(classes[i])] = self.correlation_coefficient( + targets[:, i], logits[:, i] + ) eval_metrics["mre_total"] = tf.compat.v1.metrics.mean_relative_error( - targets, logits, targets)[1] + targets, logits, targets + )[1] eval_metrics["mae_total"] = tf.compat.v1.metrics.mean_relative_error( - targets, logits, targets)[1] - - eval_metrics["accuracy01"] = self.compute_accuracy(logits, - targets, - pct_cut=0.01) - eval_metrics["accuracy05"] = self.compute_accuracy(logits, - targets, - pct_cut=0.05) - eval_metrics["accuracy1"] = self.compute_accuracy(logits, - targets, - pct_cut=0.1) + targets, logits, targets + )[1] + + eval_metrics["accuracy01"] = self.compute_accuracy( + logits, targets, pct_cut=0.01 + ) + eval_metrics["accuracy05"] = self.compute_accuracy( + logits, targets, pct_cut=0.05 + ) + eval_metrics["accuracy1"] = self.compute_accuracy(logits, targets, pct_cut=0.1) # Create summary scalars for key, value in eval_metrics.items(): tf.compat.v1.summary.scalar(key, value) - tf.compat.v1.summary.scalar('loss', self.loss) + tf.compat.v1.summary.scalar("loss", self.loss) merged_summary_op = tf.compat.v1.summary.merge_all() @@ -180,34 +189,34 @@ def load_h5ad_file(self, input_path, batch_size, datasets=[]): ) sys.exit() - # Subset dataset + # Subset dataset if --train_datasets is given if len(datasets) > 0: - all_ds = collections.Counter(raw_input.obs['ds']) + all_ds = collections.Counter(raw_input.obs["ds"]) + + # Check that given datasets are all actually available + 
for ds in datasets: + if not ds in all_ds: + logger.warn( + f"The dataset '[cyan]{ds}[/cyan]' could not be found in the training data! Is the name correct?" + ) + for ds in all_ds: if ds not in datasets: - raw_input = raw_input[raw_input.obs['ds'] != ds].copy() + raw_input = raw_input[raw_input.obs["ds"] != ds].copy() # Create training dataset - ratios = [ - raw_input.obs[ctype] for ctype in raw_input.uns['cell_types'] - ] + ratios = [raw_input.obs[ctype] for ctype in raw_input.uns["cell_types"]] self.x_data = raw_input.X.astype(np.float32) self.y_data = np.array(ratios, dtype=np.float32).transpose() - self.data = tf.data.Dataset.from_tensor_slices( - (self.x_data, self.y_data)) - self.data = self.data.shuffle(1000).repeat().batch( - batch_size=batch_size) + self.data = tf.data.Dataset.from_tensor_slices((self.x_data, self.y_data)) + self.data = self.data.shuffle(1000).repeat().batch(batch_size=batch_size) self.data_iter = iter(self.data) # Extract celltype and feature info - self.labels = raw_input.uns['cell_types'] + self.labels = raw_input.uns["cell_types"] self.sig_genes = list(raw_input.var_names) - def load_prediction_file(self, - input_path, - sig_genes, - labels, - scaling=None): + def load_prediction_file(self, input_path, sig_genes, labels, scaling=None): """ Load a file to perform prediction on it :param input_path: path to input file @@ -222,10 +231,10 @@ def load_prediction_file(self, # check for duplicates data_index = list(data.index) if not (len(data_index) == len(set(data_index))): - print( + logger.warn( "Scaden Warning: Your mixture file conatins duplicate genes! The first occuring gene will be used for every duplicate." ) - data = data.loc[~data.index.duplicated(keep='first')] + data = data.loc[~data.index.duplicated(keep="first")] data = data.loc[sig_genes] data = data.T @@ -244,13 +253,15 @@ def build_model(self, input_path, train_datasets, mode="train"): :param reuse: :return: """ - self.global_step = tf.Variable(0, name='global_step', trainable=False) + self.global_step = tf.Variable(0, name="global_step", trainable=False) # Load training data if mode == "train": - self.load_h5ad_file(input_path=input_path, - batch_size=self.batch_size, - datasets=train_datasets) + self.load_h5ad_file( + input_path=input_path, + batch_size=self.batch_size, + datasets=train_datasets, + ) # Load prediction data if mode == "predict": @@ -258,15 +269,15 @@ def build_model(self, input_path, train_datasets, mode="train"): input_path=input_path, sig_genes=self.sig_genes, labels=self.labels, - scaling=self.scaling) + scaling=self.scaling, + ) # Build the model or load if available self.n_classes = len(self.labels) try: - self.model = tf.keras.models.load_model(self.model_dir, - compile=False) - logger.info("Loaded pre-trained model") + self.model = tf.keras.models.load_model(self.model_dir, compile=False) + logger.info(f"Loaded pre-trained model: [cyan]{self.model_name}") except: self.model = self.scaden_model(n_classes=self.n_classes) @@ -281,9 +292,9 @@ def train(self, input_path, train_datasets): optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate) # Build model graph - self.build_model(input_path=input_path, - train_datasets=train_datasets, - mode="train") + self.build_model( + input_path=input_path, train_datasets=train_datasets, mode="train" + ) # Training loop pbar = tqdm(range(self.num_steps)) @@ -299,7 +310,7 @@ def train(self, input_path, train_datasets): optimizer.apply_gradients(zip(grads, self.model.trainable_weights)) - desc = (f"Step: {step}, Loss: 
{loss:.4f}") + desc = f"Step: {step}, Loss: {loss:.4f}" pbar.set_description(desc=desc) # Collect garbage after 100 steps - otherwise runs out of memory @@ -308,13 +319,12 @@ def train(self, input_path, train_datasets): # Save the trained model self.model.save(self.model_dir) - pd.DataFrame(self.labels).to_csv(os.path.join(self.model_dir, - "celltypes.txt"), - sep="\t") - pd.DataFrame(self.sig_genes).to_csv(os.path.join( - self.model_dir, "genes.txt"), - sep="\t") - + pd.DataFrame(self.labels).to_csv( + os.path.join(self.model_dir, "celltypes.txt"), sep="\t" + ) + pd.DataFrame(self.sig_genes).to_csv( + os.path.join(self.model_dir, "genes.txt"), sep="\t" + ) def predict(self, input_path, out_name="scaden_predictions.txt"): """ @@ -325,18 +335,16 @@ def predict(self, input_path, out_name="scaden_predictions.txt"): """ # Load signature genes and celltype labels sig_genes = pd.read_table(self.model_dir + "/genes.txt", index_col=0) - self.sig_genes = list(sig_genes['0']) + self.sig_genes = list(sig_genes["0"]) labels = pd.read_table(self.model_dir + "/celltypes.txt", index_col=0) - self.labels = list(labels['0']) + self.labels = list(labels["0"]) # Build model graph - self.build_model(input_path=input_path, - train_datasets=[], - mode="predict") + self.build_model(input_path=input_path, train_datasets=[], mode="predict") predictions = self.model.predict(self.data) - pred_df = pd.DataFrame(predictions, - columns=self.labels, - index=self.sample_names) + pred_df = pd.DataFrame( + predictions, columns=self.labels, index=self.sample_names + ) return pred_df \ No newline at end of file diff --git a/scaden/predict.py b/scaden/predict.py index c2d26dd..b0f567d 100644 --- a/scaden/predict.py +++ b/scaden/predict.py @@ -9,22 +9,26 @@ """ # Imports +import logging import tensorflow as tf from anndata import read_h5ad from scaden.model.architectures import architectures from scaden.model.scaden import Scaden + +logger = logging.getLogger(__name__) + """ PARAMETERS """ # ==========================================# # Extract architectures -M256_HIDDEN_UNITS = architectures['m256'][0] -M512_HIDDEN_UNITS = architectures['m512'][0] -M1024_HIDDEN_UNITS = architectures['m1024'][0] -M256_DO_RATES = architectures['m256'][1] -M512_DO_RATES = architectures['m512'][1] -M1024_DO_RATES = architectures['m1024'][1] +M256_HIDDEN_UNITS = architectures["m256"][0] +M512_HIDDEN_UNITS = architectures["m512"][0] +M1024_HIDDEN_UNITS = architectures["m1024"][0] +M256_DO_RATES = architectures["m256"][1] +M512_DO_RATES = architectures["m512"][1] +M1024_DO_RATES = architectures["m1024"][1] # ==========================================# @@ -39,35 +43,45 @@ def prediction(model_dir, data_path, out_name, seed=0): """ # Small model predictions - cdn256 = Scaden(model_dir=model_dir + "/m256", - model_name='m256', - seed=seed, - hidden_units=M256_HIDDEN_UNITS, - do_rates=M256_DO_RATES) + cdn256 = Scaden( + model_dir=model_dir + "/m256", + model_name="m256", + seed=seed, + hidden_units=M256_HIDDEN_UNITS, + do_rates=M256_DO_RATES, + ) # Predict ratios - preds_256 = cdn256.predict(input_path=data_path, - out_name='scaden_predictions_m256.txt') + preds_256 = cdn256.predict( + input_path=data_path, out_name="scaden_predictions_m256.txt" + ) # Mid model predictions - cdn512 = Scaden(model_dir=model_dir + "/m512", - model_name='m512', - seed=seed, - hidden_units=M512_HIDDEN_UNITS, - do_rates=M512_DO_RATES) + cdn512 = Scaden( + model_dir=model_dir + "/m512", + model_name="m512", + seed=seed, + hidden_units=M512_HIDDEN_UNITS, + 
do_rates=M512_DO_RATES, + ) # Predict ratios - preds_512 = cdn512.predict(input_path=data_path, - out_name='scaden_predictions_m512.txt') + preds_512 = cdn512.predict( + input_path=data_path, out_name="scaden_predictions_m512.txt" + ) # Large model predictions - cdn1024 = Scaden(model_dir=model_dir + "/m1024", - model_name='m1024', - seed=seed, - hidden_units=M1024_HIDDEN_UNITS, - do_rates=M256_DO_RATES) + cdn1024 = Scaden( + model_dir=model_dir + "/m1024", + model_name="m1024", + seed=seed, + hidden_units=M1024_HIDDEN_UNITS, + do_rates=M256_DO_RATES, + ) # Predict ratios - preds_1024 = cdn1024.predict(input_path=data_path, - out_name='scaden_predictions_m1024.txt') + preds_1024 = cdn1024.predict( + input_path=data_path, out_name="scaden_predictions_m1024.txt" + ) # Average predictions preds = (preds_256 + preds_512 + preds_1024) / 3 preds.to_csv(out_name, sep="\t") + logger.info(f"[bold]Created cell composition predictions: [green]{out_name}[/]") diff --git a/scaden/preprocessing/bulk_simulation.py b/scaden/preprocessing/bulk_simulation.py index 08961ed..c5d80a6 100644 --- a/scaden/preprocessing/bulk_simulation.py +++ b/scaden/preprocessing/bulk_simulation.py @@ -334,7 +334,7 @@ def simulate_bulk( logging.error("No datasets found! Have you specified the pattern correctly?") sys.exit(1) - print("Datasets: " + str(datasets)) + logger.info("Datasets: [cyan]" + str(datasets) + "[/]") # Load datasets xs, ys = [], [] @@ -345,22 +345,23 @@ def simulate_bulk( # Get common gene list all_genes = get_common_genes(xs, type="intersection") - print("No. of common genes: " + str(len(all_genes))) + logger.info("No. of common genes: " + str(len(all_genes))) xs = [filter_matrix_signature(m, all_genes) for m in xs] # Merge unknown celltypes - print("Merging unknown cell types: " + str(unknown_celltypes)) + logger.info("Merging unknown cell types: " + str(unknown_celltypes)) for i in range(len(ys)): ys[i] = merge_unkown_celltypes(ys[i], unknown_celltypes) # Collect all available celltypes celltypes = collect_celltypes(ys) - print("Available celltypes: " + str(celltypes)) + logger.info("Available celltypes: " + str(celltypes)) pd.DataFrame(celltypes).to_csv(out_dir + "celltypes.txt", sep="\t") # Create datasets for i in range(len(xs)): - print("Subsampling " + datasets[i] + "...") + logger.info("Subsampling " + datasets[i] + "...") + tmpx, tmpy = create_subsample_dataset( xs[i], ys[i], sample_size, celltypes, num_samples ) @@ -368,4 +369,4 @@ def simulate_bulk( tmpy.to_csv(out_dir + datasets[i] + "_labels.txt", sep="\t", index=False) gc.collect() - print("Finished!") + logger.info("[bold green]Finished data simulation!") diff --git a/scaden/preprocessing/create_h5ad_file.py b/scaden/preprocessing/create_h5ad_file.py index bf447d8..906f324 100644 --- a/scaden/preprocessing/create_h5ad_file.py +++ b/scaden/preprocessing/create_h5ad_file.py @@ -67,7 +67,7 @@ def create_h5ad_file(data_dir, out_path, unknown, pattern="*_samples.txt"): """ Create h5ad file from simulated data """ - + logger.info("[bold]Creating h5ad file") # List available datasets files = glob.glob(data_dir + pattern) files = [os.path.basename(x) for x in files] @@ -75,8 +75,8 @@ def create_h5ad_file(data_dir, out_path, unknown, pattern="*_samples.txt"): # get celltypes celltypes = load_celltypes(data_dir) - print(f"Celltypes: {celltypes}") - print(f"Found datasets: {datasets}") + logger.info(f"Celltypes: {celltypes}") + logger.info(f"Found datasets: {datasets}") adata = [] me_dict = {} @@ -97,7 +97,7 @@ def create_h5ad_file(data_dir, out_path, 
unknown, pattern="*_samples.txt"): ratios = pd.DataFrame(y, columns=celltypes) ratios["ds"] = pd.Series(np.repeat(train_file, y.shape[0]), index=ratios.index) - print("Processing " + str(train_file)) + logger.info("Processing " + str(train_file)) x = pd.DataFrame(x) adata.append( anndata.AnnData( @@ -106,11 +106,11 @@ def create_h5ad_file(data_dir, out_path, unknown, pattern="*_samples.txt"): ) for i in range(1, len(adata)): - print("Concatenating " + str(i)) + logger.info("Concatenating " + str(i)) adata[0] = adata[0].concatenate(adata[1]) del adata[1] gc.collect() - print(len(adata)) + logger.info(len(adata)) adata = adata[0] # add cell types and signature genes @@ -119,3 +119,4 @@ def create_h5ad_file(data_dir, out_path, unknown, pattern="*_samples.txt"): # save data adata.write(out_path) + logger.info(f"Created h5ad file: {out_path}") diff --git a/scaden/train.py b/scaden/train.py index 1694b54..7f082c1 100644 --- a/scaden/train.py +++ b/scaden/train.py @@ -9,33 +9,34 @@ """ # Imports +import logging import tensorflow as tf from anndata import read_h5ad from scaden.model.architectures import architectures from scaden.model.scaden import Scaden + + +logger = logging.getLogger(__name__) + """ PARAMETERS """ # ==========================================# # Extract architectures -M256_HIDDEN_UNITS = architectures['m256'][0] -M512_HIDDEN_UNITS = architectures['m512'][0] -M1024_HIDDEN_UNITS = architectures['m1024'][0] -M256_DO_RATES = architectures['m256'][1] -M512_DO_RATES = architectures['m512'][1] -M1024_DO_RATES = architectures['m1024'][1] +M256_HIDDEN_UNITS = architectures["m256"][0] +M512_HIDDEN_UNITS = architectures["m512"][0] +M1024_HIDDEN_UNITS = architectures["m1024"][0] +M256_DO_RATES = architectures["m256"][1] +M512_DO_RATES = architectures["m512"][1] +M1024_DO_RATES = architectures["m1024"][1] # ==========================================# -def training(data_path, - train_datasets, - model_dir, - batch_size, - learning_rate, - num_steps, - seed=0): +def training( + data_path, train_datasets, model_dir, batch_size, learning_rate, num_steps, seed=0 +): """ Perform training of three a scaden model ensemble consisting of three different models :param model_dir: @@ -45,49 +46,55 @@ def training(data_path, :return: """ # Convert training datasets - if train_datasets == '': + if train_datasets == "": train_datasets = [] else: - train_datasets = train_datasets.split(',') - print(f"Training on: {train_datasets}") + train_datasets = train_datasets.split(",") + logger.info(f"Training on: [cyan]{train_datasets}") # Training of M256 model - print("Training M256 Model ...") - cdn256 = Scaden(model_dir=model_dir + "/m256", - model_name='m256', - batch_size=batch_size, - learning_rate=learning_rate, - num_steps=num_steps, - seed=seed, - hidden_units=M256_HIDDEN_UNITS, - do_rates=M512_DO_RATES) + logger.info("[cyan]Training M256 Model ... [/]") + cdn256 = Scaden( + model_dir=model_dir + "/m256", + model_name="m256", + batch_size=batch_size, + learning_rate=learning_rate, + num_steps=num_steps, + seed=seed, + hidden_units=M256_HIDDEN_UNITS, + do_rates=M512_DO_RATES, + ) cdn256.train(input_path=data_path, train_datasets=train_datasets) del cdn256 # Training of M512 model - print("Training M512 Model ...") - cdn512 = Scaden(model_dir=model_dir + "/m512", - model_name='m512', - batch_size=batch_size, - learning_rate=learning_rate, - num_steps=num_steps, - seed=seed, - hidden_units=M512_HIDDEN_UNITS, - do_rates=M512_DO_RATES) + logger.info("[cyan]Training M512 Model ... 
[/]") + cdn512 = Scaden( + model_dir=model_dir + "/m512", + model_name="m512", + batch_size=batch_size, + learning_rate=learning_rate, + num_steps=num_steps, + seed=seed, + hidden_units=M512_HIDDEN_UNITS, + do_rates=M512_DO_RATES, + ) cdn512.train(input_path=data_path, train_datasets=train_datasets) del cdn512 # Training of M1024 model - print("Training M1024 Model ...") - cdn1024 = Scaden(model_dir=model_dir + "/m1024", - model_name='m1024', - batch_size=batch_size, - learning_rate=learning_rate, - num_steps=num_steps, - seed=seed, - hidden_units=M1024_HIDDEN_UNITS, - do_rates=M1024_DO_RATES) + logger.info("[cyan]Training M1024 Model ... [/]") + cdn1024 = Scaden( + model_dir=model_dir + "/m1024", + model_name="m1024", + batch_size=batch_size, + learning_rate=learning_rate, + num_steps=num_steps, + seed=seed, + hidden_units=M1024_HIDDEN_UNITS, + do_rates=M1024_DO_RATES, + ) cdn1024.train(input_path=data_path, train_datasets=train_datasets) del cdn1024 - print("Training finished.") \ No newline at end of file + logger.info("[green]Training finished.") \ No newline at end of file diff --git a/setup.py b/setup.py index 0b8f036..004ccd9 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,8 @@ from setuptools import setup, find_packages -version = "1.0.1" +version = "1.0.2" + with open("README.md", "r", encoding="UTF-8") as fh: long_description = fh.read() @@ -40,5 +41,6 @@ "tqdm", "click", "h5py~=2.10.0", + "rich", ], )
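
For reference, a minimal standalone sketch of the rich-based logging setup this patch introduces in scaden/__main__.py (the handler arguments are taken verbatim from the diff; the logger name in the last line is illustrative):

import logging
import sys

import rich.console
import rich.logging

# Root logger configured as in scaden/__main__.py: INFO level, with a
# RichHandler that writes colorized, markup-aware output to stderr.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(
    rich.logging.RichHandler(
        level=logging.INFO,
        console=rich.console.Console(file=sys.stderr),
        show_time=False,
        markup=True,
    )
)

# Module-level loggers (e.g. the one created in scaden/train.py) then emit
# rich markup tags, which the handler renders as colors, as in the messages
# added by this patch:
logging.getLogger("scaden.train").info("[cyan]Training M256 Model ... [/]")

Because the handler is bound to a Console on sys.stderr, the colorized log output presumably stays separate from anything written to stdout; that reading is inferred from the diff rather than from documented Scaden behavior.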