From e7dc7b859bba6082bcff5c286958c78c6b5d550d Mon Sep 17 00:00:00 2001 From: Daniel Munro Date: Thu, 6 Jun 2024 22:06:49 -0500 Subject: [PATCH] Bug fixes for integer sample IDs etc --- Pheast/scripts/fusion_compute_weights.sh | 2 +- Pheast/steps/covariates.smk | 13 ++++++++----- Project/scripts/assemble_bed.py | 2 +- Project/steps/align.smk | 10 +++++----- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/Pheast/scripts/fusion_compute_weights.sh b/Pheast/scripts/fusion_compute_weights.sh index 566a74b..642e9f0 100644 --- a/Pheast/scripts/fusion_compute_weights.sh +++ b/Pheast/scripts/fusion_compute_weights.sh @@ -44,7 +44,7 @@ zcat < $BED | awk -vs=$B_START -ve=$B_END 'NR >= s + 1 && NR <= e + 1' | while r # Get the gene positions +/- 500kb CHR=`echo $PARAM | awk '{ print $1 }'` - P0=`echo $PARAM | awk '{ print $3 - 0.5e6 }'` + P0=`echo $PARAM | awk '{ p = $3 - 0.5e6; if (p < 1) p = 1; print p }'` P1=`echo $PARAM | awk '{ print $3 + 0.5e6 }'` GNAME=`echo $PARAM | awk '{ print $4 }'` diff --git a/Pheast/steps/covariates.smk b/Pheast/steps/covariates.smk index 174444d..ed44eb1 100644 --- a/Pheast/steps/covariates.smk +++ b/Pheast/steps/covariates.smk @@ -14,12 +14,14 @@ rule prune_for_covar: pruned_dir = interm_dir / 'covar', pruned_prefix = interm_dir / 'covar' / 'geno_pruned', shell: - # --geno 0.05 filters variants with >5% missing values (the rest will be imputed) + # --geno 0.05 filters variants with >5% missing values (the rest will be imputed). + # Default is 0 so that samples with many more genotyped variants than others don't + # result in other samples having mostly missing values after pruning. """ mkdir -p {params.pruned_dir} plink2 \ --bfile {params.geno_prefix} \ - --geno 0.05 \ + --geno 0.00 \ --maf 0.05 \ --indep-pairwise 200 100 0.1 \ --out {params.pruned_prefix} @@ -60,15 +62,16 @@ rule plink_covariates: output: plink = interm_dir / 'covar' / '{modality}.covar.plink.tsv', run: - covar = pd.read_csv(input.covar, sep='\t', index_col=0) + covar = pd.read_csv(input.covar, sep='\t', index_col=0, dtype=str) covar.index.name = None covar = covar.T covar.index.name = 'IID' covar = covar.reset_index() # covar.insert(0, 'FID', 0) # Family must match that in the genotype files. ## Get FIDs from genotypes: - fam = pd.read_csv(input.fam, sep='\t', header=None, index_col=1) - fam = fam.to_dict()[0] + fam = pd.read_csv(input.fam, sep='\t', header=None, dtype=str) + # In some versions, dtype doesn't apply to index, so set index later: + fam = fam.set_index(1).to_dict()[0] ## Insert FIDs as first column of covar, joining by IID: covar.insert(0, 'FID', covar['IID'].map(fam)) covar.to_csv(output.plink, sep='\t', index=False, header=False) diff --git a/Project/scripts/assemble_bed.py b/Project/scripts/assemble_bed.py index d256f17..4085dd7 100644 --- a/Project/scripts/assemble_bed.py +++ b/Project/scripts/assemble_bed.py @@ -228,7 +228,7 @@ def assemble_stability(sample_ids: list, stab_dir: Path, ref_anno: Path, bed: Pa args = parser.parse_args() if args.samples is not None: - samples = pd.read_csv(args.samples, sep='\t', header=None)[0].tolist() + samples = pd.read_csv(args.samples, sep='\t', header=None, dtype=str)[0].tolist() if args.type == 'alt_TSS_polyA': assemble_alt_TSS_polyA(samples, args.input_dir, args.input_dir2, 'tpm', args.ref_anno, args.output, min_frac=0.05, max_frac=0.95) diff --git a/Project/steps/align.smk b/Project/steps/align.smk index 1bf2eda..af7e0bb 100644 --- a/Project/steps/align.smk +++ b/Project/steps/align.smk @@ -5,10 +5,10 @@ rule star_index: ref_anno = ref_anno, output: # Among other generated files: - ref_dir / 'star_index' / 'SAindex', - ref_dir / 'star_index' / 'chrNameLength.txt', + ref_dir / f'star_index_{read_length}' / 'SAindex', + ref_dir / f'star_index_{read_length}' / 'chrNameLength.txt', params: - index_dir = ref_dir / 'star_index', + index_dir = ref_dir / f'star_index_{read_length}', overhang = read_length - 1, genomeSAindexNbases = int(np.log2(genome_size) / 2 - 1), threads: 16 @@ -44,12 +44,12 @@ rule star_align: """Align RNA-Seq reads for a sample using STAR.""" input: fastq = fastq_inputs, - index = ref_dir / 'star_index' / 'SAindex', + index = ref_dir / f'star_index_{read_length}' / 'SAindex', output: interm_dir / 'bam' / '{sample_id}.Aligned.sortedByCoord.out.bam', params: fastq_list = fastq_star_param, - index_dir = ref_dir / 'star_index', + index_dir = ref_dir / f'star_index_{read_length}', bam_dir = interm_dir / 'bam', prefix = str(interm_dir / 'bam' / '{sample_id}.'), threads: 16