-
Notifications
You must be signed in to change notification settings - Fork 0
/
stat_Sprime.py
73 lines (56 loc) · 2.1 KB
/
stat_Sprime.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 24 10:15:42 2020
@author: YudongCai
@Email: [email protected]
"""
import os
import click
CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
import allel
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from itertools import cycle
from pathlib import Path
def sites2regions(df):
"""
df 是直接sprime的.score输出文件转成的dataframe
"""
mdf = []
for i in range(df['SEGMENT'].max()+1):
tdf = df.loc[df['SEGMENT']==i, :]
chrom = tdf['CHROM'].values[0]
start = tdf['POS'].min()
end = tdf['POS'].max()
length = end - start + 1
score = tdf['SCORE'].max()
n_snps = tdf.shape[0]
mdf.append([chrom, start, end, length, i, score, n_snps])
mdf = pd.DataFrame(mdf, columns=['chrom', 'start', 'end', 'length', 'segment_index', 'score', 'n_snps'])
return mdf
@click.command()
@click.option('--scorefile', help='sprime的后缀为.score的输出文件')
@click.option('--outdir', help='输出文件目录(空目录或未创建的目录)')
def main(scorefile, outdir):
"""合并.score为区域,并进行基本统计"""
sdf = pd.read_csv(scorefile, sep='\t', dtype={'chrom': 'category'})
# 合并位点为区域
mdf = sites2regions(sdf)
outdir = Path(outdir)
outfile = outdir / 'sprime_regions.tsv.gz'
mdf.to_csv(outfile, sep='\t', index=False, compression='gzip')
# 区域分数分布情况
outfile = outdir / 'sprime_regions_stat.tsv.gz'
mdf[['length', 'n_snps', 'score']].describe().reset_index().to_csv(outfile, sep='\t', float_format='%.0f', index=False)
# 区域分数分布图
outfile = outdir / 'sprime_regions_pairplot.jpg'
g = sns.pairplot(sdf[['length', 'n_snps', 'score']], diag_kind="kde")
g.map_lower(sns.kdeplot, levels=4, color=".2")
g.savefig(outfile, dpi=300)
if __name__ == '__main__':
main()