From f3973fe556fd8542e2ee6020be9e07619e5368a8 Mon Sep 17 00:00:00 2001 From: Ali Ghaffaari Date: Thu, 7 Jun 2018 23:10:39 +0200 Subject: [PATCH 1/3] Add ksnper script --- vcfy/ksnper.py | 120 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 vcfy/ksnper.py diff --git a/vcfy/ksnper.py b/vcfy/ksnper.py new file mode 100644 index 0000000..e4c4dce --- /dev/null +++ b/vcfy/ksnper.py @@ -0,0 +1,120 @@ +# coding=utf-8 + +""" + vcfy.ksnper + ~~~~~~~~~~~ + + Report the number of SNPs in all k-mers. + + :copyright: (c) 2018 by Ali Ghaffaari. + :license: MIT, see LICENSE for more details. +""" + +import csv + +import click +import vcf +from Bio import SeqIO +from BitVector import BitVector + + +def reflen(ref_file): + """Get the length of reference genome sequence length. + + Args: + ref_file : file-like object or str + The reference genome file or file path. + """ + return len(next(SeqIO.parse(ref_file, 'fasta')).seq) + + +def compute_snpbv(vcf_reader, length): + """Compute SNP bitvector. The SNP bitvector is a bitvector of the length of + the reference genome where SNP loci are set. + + Args: + vcf_reader : vcf.Reader + VCF reader. + length : int + The length of reference genome. + + Return: + A BitVector.BitVector instance of size `length` where all SNP loci are + set. + """ + cnt = BitVector(size=length) + for record in vcf_reader: + cnt[record.POS-1] = 1 + return cnt + + +def ksnpcounts(snpbv, k): + """Generate number of SNPs in all k-mers in the reference genome. + + Args: + snpbv : BitVector.BitVector + The SNP bitvector. + k : int + The length of the k-mer. + + Return: + The number of SNPs in a k-mer. It yields this for all k-mers in the + reference genome. + """ + kcount = 0 + for i in range(k): + kcount += snpbv[i] + yield kcount + for i in range(k, len(snpbv)): + kcount += snpbv[i] + kcount -= snpbv[i-k] + yield kcount + + +def write_csv(output, vcf_file, ref_file, k, dialect='unix'): + """Write CSV file. + + Args: + output : file-like object + The output CSV file. + vcf_file : file-like object or str + The input VCF file or file path. + ref_file : file-like object or str + The input reference genome. If it is set to `None`, the reference + genome path will be inferred from VCF metadata (header). + k : int + The length of the k-mer. + dialect : str + This string specifies the dialect of the output CSV file. + """ + csv_writer = csv.DictWriter(output, + fieldnames=['k', 'count'], + dialect=dialect, + quoting=csv.QUOTE_NONE) + csv_writer.writeheader() + + vcf_reader = vcf.Reader(vcf_file) + if ref_file is None: + ref_file = open(vcf_reader.metadata['reference'], 'r') + bv = compute_snpbv(vcf_reader, reflen(ref_file)) + for count in ksnpcounts(bv, k): + csv_writer.writerow(dict(k=k, count=count)) + + +@click.command() +@click.argument('vcf', type=click.File('r'), default="-") +@click.option('-o', '--output', type=click.File('w'), default="-", + help="Write to this file instead of standard output.") +@click.option('-r', '--reference', type=click.File('r'), default=None, + help=("Reference genome FASTA file. It will be inferred from VCF " + "header, if not specified.")) +@click.option('-k', type=int, required=True, help="The value of k.") +@click.option('-d', '--dialect', type=click.Choice(csv.list_dialects()), + default='unix', show_default=True, + help="Use this CSV dialect.") +def cli(**kwargs): + write_csv(kwargs.pop('output'), + kwargs.pop('vcf'), + kwargs.pop('reference'), + kwargs.pop('k'), + kwargs.pop('dialect')) From afec230f98a5689a5a3c9ef2818a7e5d8ca2722f Mon Sep 17 00:00:00 2001 From: Ali Ghaffaari Date: Thu, 7 Jun 2018 23:11:39 +0200 Subject: [PATCH 2/3] Update package setup - Add new script dependencies. - Add `ksnper` to the list of console scripts. --- vcfy/release.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vcfy/release.py b/vcfy/release.py index d917e99..23609e3 100644 --- a/vcfy/release.py +++ b/vcfy/release.py @@ -71,6 +71,7 @@ 'PyVCF>=0.6.8', 'biopython>=1.71', 'numpy>=1.14.3', + 'BitVector>=3.4.8', 'click>=6.7', ] __tests_require__ = [] @@ -81,4 +82,5 @@ __entry_points__ = ''' [console_scripts] vcfy=vcfy.cli:cli +ksnper=vcfy.ksnper:cli ''' From d7b9e884c26bf09a95d44adedc9a76503f3b931b Mon Sep 17 00:00:00 2001 From: Ali Ghaffaari Date: Thu, 7 Jun 2018 23:15:52 +0200 Subject: [PATCH 3/3] Bump to version number v0.0.5 --- vcfy/release.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcfy/release.py b/vcfy/release.py index 23609e3..0632bae 100644 --- a/vcfy/release.py +++ b/vcfy/release.py @@ -38,7 +38,7 @@ __license__ = 'MIT' # Release -__version__ = '0.0.4' +__version__ = '0.0.5' __status__ = DS_PREALPHA # Package data