-
Notifications
You must be signed in to change notification settings - Fork 1
/
pipeline.py
executable file
·110 lines (91 loc) · 5.17 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python3
"""Hi-C phasing pipeline.

Clusters homopolymer-compressed unitigs using Hi-C links, feeds the resulting
node colors to rukki to extract haplotype paths, and — when trio colors are
present in the input dir — evaluates the rukki paths against them.

Usage: pipeline.py <input_dir> <output_dir> [eval_report_name]
"""
import os
import shutil
import sys

import cluster
import evaluate_rukki


def main():
    """Run clustering, rukki, and (optionally) trio-based evaluation."""
    if len(sys.argv) < 3:
        print(f'Usage: {sys.argv[0]} <input_dir> <output_dir>')
        sys.exit(1)  # usage error: exit non-zero (original exit() returned 0)

    input_dir = sys.argv[1]
    output_dir = sys.argv[2]
    os.makedirs(output_dir, exist_ok=True)

    # Optional third argument: name of the evaluation report, created inside
    # output_dir. Empty string means "write the report to stdout".
    eval_file = ""
    if len(sys.argv) > 3:
        eval_file = os.path.join(output_dir, sys.argv[3])

    # here should be mashmap running and parsing
    # TODO
    # mashmap -r unitig-popped-unitig-normal-connected-tip.homopolymer-compressed.fasta -q unitig-popped-unitig-normal-connected-tip.homopolymer-compressed.fasta -t 8 -f none --pi 95 -s 10000
    # cat mashmap.out |awk '{if ($NF > 99 && $4-$3 > 500000 && $1 != $6) print $1"\t"$6}'|sort |uniq > unitig-popped-unitig-normal-connected-tip.homopolymer-compressed.matches
    matches_file = os.path.join(input_dir, "unitigs.matches")

    # Prefer the raw per-read Hi-C mapping; fall back to a pre-compressed one
    # in input_dir, and let a compressed file already present in output_dir
    # (e.g. from a previous run) win over both.
    hic_file = os.path.join(input_dir, "hic_mapping.byread.output")
    if not os.path.exists(hic_file):
        hic_file = os.path.join(input_dir, "hic.byread.compressed")
    compressed_hic = os.path.join(output_dir, "hic.byread.compressed")
    if os.path.exists(compressed_hic):
        hic_file = compressed_hic

    noseq_gfa = os.path.join(input_dir, "unitigs.hpc.noseq.gfa")
    cluster.run_clustering(noseq_gfa, matches_file, hic_file, output_dir)

    csv_output = os.path.join(output_dir, "hicverkko.colors.tsv")
    # BandageNG accepts tsv but only with tsv extension!
    csv_copy = os.path.join(output_dir, "hicverkko.colors.csv")
    shutil.copy(csv_output, csv_copy)

    # hi-c gfa(noseq) trio_colors
    rukki_output_tsv = os.path.join(output_dir, "unitig-popped-unitig-normal-connected-tip.paths.tsv")
    rukki_output_gaf = os.path.join(output_dir, "unitig-popped-unitig-normal-connected-tip.paths.gaf")
    rukki_line = f'rukki trio --graph {noseq_gfa} --markers {csv_output}'
    rukki_line += " --issue-len 200000 --marker-ratio 5. --issue-ratio 3. --issue-cnt 100 "
    # what about rukki options?
    rukki_line += f'--init-assign {os.path.join(output_dir, "out_init_ann.csv")} --refined-assign {os.path.join(output_dir, "out_refined_ann.csv")} --final-assign {os.path.join(output_dir, "out_final_ann.csv")}'
    rukki_line += " --marker-sparsity 5000 --issue-sparsity 1000 "
    rukki_line += " --try-fill-bubbles --fillable-bubble-diff 1000 --fillable-bubble-len 500000 --assign-tangles --tangle-allow-deadend --solid-homozygous-cov-coeff 1.1 --solid-ratio 1.5 --hap-names haplotype1,haplotype2"
    # Run rukki twice: once for the tsv paths, once gaf-formatted.
    # NOTE(review): exit statuses are not checked, matching prior behavior;
    # consider subprocess.run([...], check=True) once callers can handle it.
    os.system(rukki_line + f' -p {rukki_output_tsv}')
    os.system(rukki_line + f' --gaf-format -p {rukki_output_gaf}')

    # Trio-based ground truth; evaluation is skipped when it is absent.
    trio_file = os.path.join(input_dir, "unitig-popped-unitig-normal-connected-tip.trio.colors.csv")
    print(rukki_output_tsv)
    print(trio_file)
    if os.path.exists(trio_file):
        report = open(eval_file, 'w') if eval_file else sys.stdout
        try:
            report.write("Evaluating using all edges (including not phased with hi-c)\n")
            evaluate_rukki.evaluate_rukki(rukki_output_tsv, trio_file, set(), report)
            report.write("\n\nEvaluating using only long (hi-c-phased) edges\n")
            evaluate_rukki.evaluate_rukki(rukki_output_tsv, trio_file,
                                          evaluate_rukki.get_phased_edges(csv_output), report)
        finally:
            # Close only a real report file, never stdout.
            if report is not sys.stdout:
                report.close()


if __name__ == "__main__":
    main()