-
Notifications
You must be signed in to change notification settings - Fork 0
/
maf_query_majorAllele.py
60 lines (53 loc) · 2.16 KB
/
maf_query_majorAllele.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 11 09:17:27 2019
@author: YudongCai
@Email: [email protected]
"""
import click
import numpy as np
from Bio import AlignIO
from collections import Counter
@click.command()
@click.option('--maffile')
@click.option('--query', help='query基因组的名字')
@click.option('--outfile')
def main(maffile, outfile):
"""
输出maf比对文件
输出每个位点的major allele类型(只包含在ref和query基因组中都存在的位点)
MAF http://genome.ucsc.edu/FAQ/FAQformat.html#format5
input start is zero-based
output pos is one-based
maf中染色体命名格式为 基因组名.染色体号
"""
query_genome = 'panTro2'
with open(outfile, 'w') as f:
f.write('refGenome\trefChrom\trefLoc\trefStrand\trefNuc\tqueryGenome\tqueryChom\tqueryLoc\tqueryStrand\tqueryNuc\tMajorAllele\n')
for multiple_alignment in AlignIO.parse(maffile, "maf"):
starts = []
seqs = []
genomes = []
strands = []
chroms = []
for seqrec in multiple_alignment:
starts.append((seqrec.annotations["start"]))
seqs.append(list(str(seqrec.seq.upper())))
genomes.append(seqrec.id.split('.')[0]) # 物种.染色体
chroms.append(seqrec.id.split('.')[-1])
strands.append(seqrec.annotations['strand'])
if query_genome in genomes:
seqs = np.array(seqs)
ref_loc = starts[0]
print(genomes)
query_index = genomes.index(query_genome)
query_loc = starts[query_index]
for seq in seqs.T:
if seq[0] != '-':
major = Counter(seq).most_common()[0][0]
ref_loc += strands[0] # 1正链,-1负链
if seq[query_index] != '-':
query_loc += strands[query_index]
f.write(f'{genomes[0]}\t{chroms[0]}\t{ref_loc + 1}\t{strands[0]}\t{seq[0]}\t{genomes[query_index]}\t{chroms[query_index]}\t{query_loc+1}\t{strands[query_index]}\t{seq[query_index]}\t{major}\n')
if __name__ == '__main__':
main()