From e7dc7b859bba6082bcff5c286958c78c6b5d550d Mon Sep 17 00:00:00 2001
From: Daniel Munro <danbmunro@gmail.com>
Date: Thu, 6 Jun 2024 22:06:49 -0500
Subject: [PATCH] Bug fixes for integer sample IDs etc

---
 Pheast/scripts/fusion_compute_weights.sh |  2 +-
 Pheast/steps/covariates.smk              | 13 ++++++++-----
 Project/scripts/assemble_bed.py          |  2 +-
 Project/steps/align.smk                  | 10 +++++-----
 4 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/Pheast/scripts/fusion_compute_weights.sh b/Pheast/scripts/fusion_compute_weights.sh
index 566a74b..642e9f0 100644
--- a/Pheast/scripts/fusion_compute_weights.sh
+++ b/Pheast/scripts/fusion_compute_weights.sh
@@ -44,7 +44,7 @@ zcat < $BED | awk -vs=$B_START -ve=$B_END 'NR >= s + 1 && NR <= e + 1' | while r
 
 	# Get the gene positions +/- 500kb
 	CHR=`echo $PARAM | awk '{ print $1 }'`
-	P0=`echo $PARAM | awk '{ print $3 - 0.5e6 }'`
+	P0=`echo $PARAM | awk '{ p = $3 - 0.5e6; if (p < 1) p = 1; print p }'`
 	P1=`echo $PARAM | awk '{ print $3 + 0.5e6 }'`
 	GNAME=`echo $PARAM | awk '{ print $4 }'`
 
diff --git a/Pheast/steps/covariates.smk b/Pheast/steps/covariates.smk
index 174444d..ed44eb1 100644
--- a/Pheast/steps/covariates.smk
+++ b/Pheast/steps/covariates.smk
@@ -14,12 +14,14 @@ rule prune_for_covar:
         pruned_dir = interm_dir / 'covar',
         pruned_prefix = interm_dir / 'covar' / 'geno_pruned',
     shell:
-        # --geno 0.05 filters variants with >5% missing values (the rest will be imputed)
+        # --geno 0.05 filters variants with >5% missing values (the rest will be imputed).
+        # Default is 0 so that samples with many more genotyped variants than others don't
+        # result in other samples having mostly missing values after pruning.
         """
         mkdir -p {params.pruned_dir}
         plink2 \
             --bfile {params.geno_prefix} \
-            --geno 0.05 \
+            --geno 0.00 \
             --maf 0.05 \
             --indep-pairwise 200 100 0.1 \
             --out {params.pruned_prefix}
@@ -60,15 +62,16 @@ rule plink_covariates:
     output:
         plink = interm_dir / 'covar' / '{modality}.covar.plink.tsv',
     run:
-        covar = pd.read_csv(input.covar, sep='\t', index_col=0)
+        covar = pd.read_csv(input.covar, sep='\t', index_col=0, dtype=str)
         covar.index.name = None
         covar = covar.T
         covar.index.name = 'IID'
         covar = covar.reset_index()
         # covar.insert(0, 'FID', 0)  # Family must match that in the genotype files.
         ## Get FIDs from genotypes:
-        fam = pd.read_csv(input.fam, sep='\t', header=None, index_col=1)
-        fam = fam.to_dict()[0]
+        fam = pd.read_csv(input.fam, sep='\t', header=None, dtype=str)
+        # In some versions, dtype doesn't apply to index, so set index later:
+        fam = fam.set_index(1).to_dict()[0]
         ## Insert FIDs as first column of covar, joining by IID:
         covar.insert(0, 'FID', covar['IID'].map(fam))
         covar.to_csv(output.plink, sep='\t', index=False, header=False)
diff --git a/Project/scripts/assemble_bed.py b/Project/scripts/assemble_bed.py
index d256f17..4085dd7 100644
--- a/Project/scripts/assemble_bed.py
+++ b/Project/scripts/assemble_bed.py
@@ -228,7 +228,7 @@ def assemble_stability(sample_ids: list, stab_dir: Path, ref_anno: Path, bed: Pa
 args = parser.parse_args()
 
 if args.samples is not None:
-    samples = pd.read_csv(args.samples, sep='\t', header=None)[0].tolist()
+    samples = pd.read_csv(args.samples, sep='\t', header=None, dtype=str)[0].tolist()
 
 if args.type == 'alt_TSS_polyA':
     assemble_alt_TSS_polyA(samples, args.input_dir, args.input_dir2, 'tpm', args.ref_anno, args.output, min_frac=0.05, max_frac=0.95)
diff --git a/Project/steps/align.smk b/Project/steps/align.smk
index 1bf2eda..af7e0bb 100644
--- a/Project/steps/align.smk
+++ b/Project/steps/align.smk
@@ -5,10 +5,10 @@ rule star_index:
         ref_anno = ref_anno,
     output:
         # Among other generated files:
-        ref_dir / 'star_index' / 'SAindex',
-        ref_dir / 'star_index' / 'chrNameLength.txt',
+        ref_dir / f'star_index_{read_length}' / 'SAindex',
+        ref_dir / f'star_index_{read_length}' / 'chrNameLength.txt',
     params:
-        index_dir = ref_dir / 'star_index',
+        index_dir = ref_dir / f'star_index_{read_length}',
         overhang = read_length - 1,
         genomeSAindexNbases = int(np.log2(genome_size) / 2 - 1),
     threads: 16
@@ -44,12 +44,12 @@ rule star_align:
     """Align RNA-Seq reads for a sample using STAR."""
     input:
         fastq = fastq_inputs,
-        index = ref_dir / 'star_index' / 'SAindex',
+        index = ref_dir / f'star_index_{read_length}' / 'SAindex',
     output:
         interm_dir / 'bam' / '{sample_id}.Aligned.sortedByCoord.out.bam',
     params:
         fastq_list = fastq_star_param,
-        index_dir = ref_dir / 'star_index',
+        index_dir = ref_dir / f'star_index_{read_length}',
         bam_dir = interm_dir / 'bam',
         prefix = str(interm_dir / 'bam' / '{sample_id}.'),
     threads: 16