From feaeb69e9329f9b3538db5f3decef427519f6e53 Mon Sep 17 00:00:00 2001 From: Daniel Cameron Date: Tue, 14 Jul 2020 14:33:03 +1000 Subject: [PATCH] Rebased SAM and VCF strict specifications drafts --- Makefile | 22 +-- SAMstrict.tex | 397 ++++++++++++++++++++++++++++++++++++++++++++++++++ VCFstrict.tex | 254 ++++++++++++++++++++++++++++++++ 3 files changed, 664 insertions(+), 9 deletions(-) create mode 100644 SAMstrict.tex create mode 100644 VCFstrict.tex diff --git a/Makefile b/Makefile index 5692de00c..86e479b64 100644 --- a/Makefile +++ b/Makefile @@ -8,6 +8,8 @@ PDFS = BCFv1_qref.pdf \ CSIv1.pdf \ SAMv1.pdf \ SAMtags.pdf \ + SAMstrict.pdf \ + VCFstrict.pdf \ tabix.pdf \ VCFv4.1.pdf \ VCFv4.2.pdf \ @@ -19,15 +21,17 @@ pdf: $(PDFS:%=new/%) %.pdf: new/%.pdf cp $^ $@ -new/CRAMv2.1.pdf diff/CRAMv2.1.pdf: CRAMv2.1.tex new/CRAMv2.1.ver -new/CRAMv3.pdf diff/CRAMv3.pdf: CRAMv3.tex new/CRAMv3.ver -new/crypt4gh.pdf diff/crypt4gh.pdf: crypt4gh.tex new/crypt4gh.ver -new/SAMv1.pdf diff/SAMv1.pdf: SAMv1.tex new/SAMv1.ver -new/SAMtags.pdf diff/SAMtags.pdf: SAMtags.tex new/SAMtags.ver -new/VCFv4.1.pdf diff/VCFv4.1.pdf: VCFv4.1.tex new/VCFv4.1.ver -new/VCFv4.2.pdf diff/VCFv4.2.pdf: VCFv4.2.tex new/VCFv4.2.ver -new/VCFv4.3.pdf diff/VCFv4.3.pdf: VCFv4.3.tex new/VCFv4.3.ver -new/VCFv4.4.pdf diff/VCFv4.4.pdf: VCFv4.4.tex new/VCFv4.4.ver +new/CRAMv2.1.pdf diff/CRAMv2.1.pdf: CRAMv2.1.tex new/CRAMv2.1.ver +new/CRAMv3.pdf diff/CRAMv3.pdf: CRAMv3.tex new/CRAMv3.ver +new/crypt4gh.pdf diff/crypt4gh.pdf: crypt4gh.tex new/crypt4gh.ver +new/SAMv1.pdf diff/SAMv1.pdf: SAMv1.tex new/SAMv1.ver +new/SAMtags.pdf diff/SAMtags.pdf: SAMtags.tex new/SAMtags.ver +new/SAMstrict.pdf diff/SAMstrict.pdf: SAMstrict.tex new/SAMstrict.ver +new/VCFstrict.pdf diff/VCFstrict.pdf: VCFstrict.tex new/VCFstrict.ver +new/VCFv4.1.pdf diff/VCFv4.1.pdf: VCFv4.1.tex new/VCFv4.1.ver +new/VCFv4.2.pdf diff/VCFv4.2.pdf: VCFv4.2.tex new/VCFv4.2.ver +new/VCFv4.3.pdf diff/VCFv4.3.pdf: VCFv4.3.tex new/VCFv4.3.ver +new/VCFv4.4.pdf
diff/VCFv4.4.pdf: VCFv4.4.tex new/VCFv4.4.ver PDFLATEX = pdflatex diff --git a/SAMstrict.tex b/SAMstrict.tex new file mode 100644 index 000000000..67c4ca40b --- /dev/null +++ b/SAMstrict.tex @@ -0,0 +1,397 @@ +\documentclass[10pt]{article} +\usepackage[margin=1in]{geometry} +\usepackage{longtable} +\usepackage[pdfborder={0 0 0},hyperfootnotes=false]{hyperref} +\usepackage[title]{appendix} + +\newcommand{\rulename}[1]{\texttt{#1}} +\newcommand{\rulecategory}[1]{\texttt{#1}} +\newcommand{\samrule}{\texttt{SAM}} +\newcommand{\vonefive}{\texttt{v1.5}} +\newcommand{\vonefivebestpractice}{\texttt{SAMv1.5 best practice}} +\newcommand{\vcffourthree}{\texttt{VCFv4.3}} +% #1: error message +% #2: rule description +% #3: categories +\newcommand{\samstrictrule}[3]{ + \paragraph{} {#3} + % error message formatting + {\tt #1} + % + #2 +} +% #1: header +% #2: categories +\newcommand{\headerrequired}[2]{ + \samstrictrule{Missing #1 header}{A #1 header must be present.}{#2} +} +\newcommand{\headerunique}[2]{ + \samstrictrule{Only one #1 header may be present}{Multiple #1 headers must not be present.}{#2} +} +% #1: header +% #2: tag +% #3: categories +\newcommand{\headertagrequired}[3]{ + \samstrictrule{Missing #1 header #2 tag}{The #1 header #2 tag must be present.}{#3} +} +% #1: header +% #2: tag +% #3: categories +\newcommand{\headertagunique}[3]{ + \samstrictrule{Duplicate #1 header #2 tags.}{Each #1 header #2 tag must be unique.}{#3} +} +% #1: header +% #2: tag +% #3: regex +% #4: categories +\newcommand{\headertagregex}[4]{ + \samstrictrule{Malformed #1 header #2 tag}{The #1 header #2 tag must conform to the regex {\tt #3}}{#4} +} +% #1: header +% #2: tag +% #3: possible tag values +% #4: categories +\newcommand{\headertagvalues}[4]{ + \samstrictrule{Malformed #1 header #2 tag}{The #1 header #2 tag must contain one of #3}{#4} +} +% #1: header +% #2: tag +% #3: categories +\newcommand{\headertagmatchsamspecs}[3]{ + \samstrictrule{Malformed #1 header #2 tag}{The #1 header #2 tag must match the format
defined in the SAM specifications.}{#3} +} + +\begin{document} + +\input{SAMstrict.ver} +\title{SAM Strict Specification} +\author{Daniel L Cameron} +\date{\headdate} +\maketitle +\begin{quote}\small +The master version of this document can be found at +\url{https://github.com/samtools/hts-specs}.\\ +This printing is version~\commitdesc\ from that repository, +last modified on the date shown above. +\end{quote} +\vspace*{1em} + +\noindent +This document is a companion to the {\sl Sequence Alignment/Map Format +Specification} that defines the SAM file format.\footnote{See +\href{http://samtools.github.io/hts-specs/SAMv1.pdf}{\tt SAMv1.pdf} at \url{https://github.com/samtools/hts-specs}.} +The SAM file format defines the syntax required for a file to be +a valid SAM file. It does not require such files to be semantically +valid and internally consistent. +This document describes a set of additional semantic restrictions; +the subset of syntactically valid SAM files that comply +with these restrictions can be described as \textit{SAM strict +compliant}. + +\renewcommand{\abstractname}{Introduction} +\begin{abstract} + +The SAM specifications have been instrumental in standardising +the file formats used for sequence alignment. A large ecosystem of +bioinformatics tools is now capable of reading and/or writing +SAM files. Unfortunately, many tools that read SAM files are tightly +coupled to a particular upstream tool +and fail to correctly execute on valid SAM files written by other +tools. In part, this is due to the lack of semantic restrictions +inherent in the SAM file format. A syntactically valid SAM file +can be both internally inconsistent and semantically nonsensical. + +The purpose of this document is to provide a baseline of semantic +validity with which tools should comply when outputting SAM +files, and which tools that input SAM files can safely assume when +they require input files to be \textit{SAM strict compliant}.
+ +\end{abstract} + +\section{Definitions} + +\paragraph{} + +The first segment is considered to be the ``next'' segment of the final segment in a template as per the SAM specifications. + + +\section{Headers} +\samstrictrule{Undefined reserved header present}{Upper-case header record type codes that are not defined in the SAM specifications must not be used.}{} +\samstrictrule{Undefined header tag present}{Upper-case header tags not defined in the SAM specifications must not be used.}{} +\samstrictrule{Tag present as both lowercase and uppercase}{A file should not contain the same tag in both upper-case and lowercase format. See the SAM specifications header tags best practice footnote.}{} +\samstrictrule{Malformed header}{Header lines must conform to either the {\tt + /\char94@[A-Z][A-Z](\char92t[A-Za-z][A-Za-z0-9]:[ + -\char126]+)+\$/} or {\tt /\char94@CO\char92t.*/} regex.}{\samrule} + +\subsection{HD} +\headerrequired{HD}{\vonefivebestpractice} +\headerunique{HD}{} +\headertagrequired{HD}{VN}{\samrule} +\samstrictrule{File does not start with HD header.}{The first header defined must be the HD header.}{\vonefivebestpractice} +\headertagregex{HD}{VN}{/\char94[0-9]+\char92.[0-9]+\$/}{\samrule} +\samstrictrule{Unknown SAM version}{The HD header VN tag version number must match a published version of the SAM specifications.}{} +\headertagvalues{HD}{SO}{{\tt unknown}, {\tt unsorted}, {\tt queryname} and {\tt coordinate}}{\samrule} +\headertagvalues{HD}{GO}{{\tt none}, {\tt query}, {\tt reference}}{\samrule} +\samstrictrule{Inconsistent HD header SO and GO tags}{The record orderings defined in the HD header SO and GO tags must be consistent.}{} + +\subsection{SQ} +\samstrictrule{Missing SQ header}{The SQ header must be present if any reads have been mapped.}{\vonefivebestpractice} +\headertagrequired{SQ}{SN}{\samrule} +\headertagregex{SQ}{SN}{[!-)+-\char60\char62-\char126][!-\char126]*}{\samrule} +\headertagunique{SQ}{SN}{}
+\headertagrequired{SQ}{LN}{\samrule} +\samstrictrule{Malformed SQ header LN tag}{The SQ header LN tag value must be an integer.}{\samrule} +\samstrictrule{Unsupported reference sequence length}{The SQ header LN tag value must be greater than zero and less than 2147483648.}{\samrule} +\headertagrequired{SQ}{M5}{\samrule} +\headertagmatchsamspecs{SQ}{AH}{\samrule} +\samstrictrule{Alternate locus references unknown reference sequence name}{Sequence names in the SQ header AH tag must match an SQ header SN reference sequence name.}{\samrule} +\headertagmatchsamspecs{SQ}{AN}{\samrule} +\samstrictrule{Duplicate alternative reference sequence names.}{Alternative reference sequence names defined in SQ header AN tags must be unique. A single tag cannot define the same alternative reference sequence name multiple times and multiple SQ headers cannot define the same alternative reference sequence name.}{\samrule} +\samstrictrule{Invalid alternative reference sequence names.}{Sequence names in the SQ header AN tag must not match any SQ header SN reference sequence names.
}{\samrule} +\headertagregex{SQ}{M5}{[a-f0-9]\{32\}}{} + +\subsection{RG} +\headertagrequired{RG}{ID}{\samrule} +\headertagunique{RG}{ID}{\samrule} +\samstrictrule{RG header DT tag is not ISO8601}{RG header DT tag must contain a valid date in ISO8601 format.}{\samrule} +\headertagregex{RG}{FO}{/\char92*|[ACMGRSVTWYHKDBN]+/}{\samrule} +\samstrictrule{Malformed RG header PI tag}{The RG header PI tag value must be a floating point value.}{\samrule} +\headertagvalues{RG}{PL}{{\tt CAPILLARY}, {\tt LS454}, {\tt ILLUMINA}, {\tt SOLID}, {\tt HELICOS}, {\tt IONTORRENT}, {\tt ONT}, and {\tt PACBIO}}{\samrule} +\samstrictrule{Invalid RG program group}{The RG header PG tag must contain one of the program groups specified in an ID tag of a PG header.}{} + +\subsection{PG} +\headertagrequired{PG}{ID}{\samrule} +\headertagunique{PG}{ID}{\samrule} +\samstrictrule{Invalid PG header PP tag}{The PG header PP tag must contain one of the program groups specified in an ID tag of a PG header.}{} + +\section{General} + +\subsection{File Format} + +\samstrictrule{File is not UTF-8}{The file must use UTF-8 encoding.}{\samrule} +\samstrictrule{Inconsistent line terminators}{All lines must be separated with the same new line character(s).}{} +\samstrictrule{Malformed floating point value}{All floating point values must conform to the regex {\tt [-+]?[0-9]*\char92.?[0-9]+([eE][-+]?[0-9]+)?}}{} +\samstrictrule{Malformed integer value}{All integer values must conform to the regex {\tt [-+]?[0-9]+}}{} + +\subsection{Ordering} + +\samstrictrule{Record ordering does not match HD header SO tag}{The order of records must be consistent with the HD header SO tag.}{} +\samstrictrule{Record ordering does not match HD header GO tag}{The order of records must be consistent with the HD header GO tag.}{} +\samstrictrule{Orphaned unmapped read}{If a read is unmapped, RNAME and POS must either be * and 0, or the RNAME and POS of another read from the same template.}{} + +\section{Records} + +\subsection{QNAME}
+\samstrictrule{Empty QNAME}{QNAME fields must be at least one character in length.}{\samrule} +\samstrictrule{QNAME too long}{QNAME fields must be less than 255 characters in length.}{\samrule} +\samstrictrule{Invalid character in QNAME}{QNAME fields must conform to the regex {\tt [!-?A-\char126]}}{\samrule} + +\subsection{FLAG} +\samstrictrule{Incorrect FLAG 0x1}{All templates with multiple segments must have FLAG 0x1 set.}{} +\samstrictrule{Incorrect FLAG 0x1}{All templates with a single segment must not have FLAG 0x1 set.}{} +\samstrictrule{Inconsistent FLAG 0x1}{All records with the same QNAME must have the same FLAG 0x1 value.}{} +\samstrictrule{Inconsistent FLAG 0x2}{All primary records with the same QNAME must have the same FLAG 0x2 value.}{} +\samstrictrule{Missing primary alignment record}{No supplementary or secondary alignments may exist for reads with an unmapped primary alignment.}{\samrule} +\samstrictrule{Inconsistent FLAGs 0x1 0x2}{The 0x2 FLAG must not be set if the 0x1 FLAG is not.}{} +\samstrictrule{Inconsistent FLAGs 0x1 0x8}{The 0x8 FLAG must not be set if the 0x1 FLAG is not.}{} +\samstrictrule{Inconsistent FLAGs 0x1 0x20}{The 0x20 FLAG must not be set if the 0x1 FLAG is not.}{} +\samstrictrule{Inconsistent FLAGs 0x1 0x40}{The 0x40 FLAG must not be set if the 0x1 FLAG is not.}{} +\samstrictrule{Inconsistent FLAGs 0x1 0x80}{The 0x80 FLAG must not be set if the 0x1 FLAG is not.}{} +\samstrictrule{Inconsistent FLAGs 0x2 0x4}{The 0x2 FLAG must not be set if 0x4 is set in any primary alignments in the template.}{} +\samstrictrule{Inconsistent FLAGs 0x4 0x8}{The 0x8 FLAG for the primary record of each segment must match the 0x4 FLAG of the primary record for the next segment in the template.}{} +\samstrictrule{Inconsistent FLAGs 0x10 0x20}{The 0x20 FLAG must match the 0x10 FLAG for the primary alignment of the next segment in the template.}{} +\samstrictrule{FLAG 0x10 set on unmapped read}{The 0x10 FLAG must not be set if the 0x4 FLAG is set.}{} +\samstrictrule{Duplicate first
segment primary records}{Of the primary records with the same QNAME, at most one can have FLAG 0x40 set.}{} +\samstrictrule{Missing first segment primary record}{Of the records with the same QNAME with 0x1 FLAG, at least one record must have FLAG 0x40 set.}{} +\samstrictrule{Duplicate last segment primary records}{Of the primary records with the same QNAME, at most one can have FLAG 0x80 set.}{} +\samstrictrule{Missing last segment primary record}{Of the records with the same QNAME with 0x1 FLAG, at least one record must have FLAG 0x80 set.}{} +\samstrictrule{Multiple primary alignment records}{Each segment must have at most one record with FLAG 0x100 and 0x800 not set.}{\samrule} +\samstrictrule{Missing primary alignment record}{Each segment must have at least one record with FLAG 0x100 and 0x800 not set.}{\samrule} +\samstrictrule{Unknown FLAG bit set}{FLAG bits higher than 0x800 must not be set.}{\samrule} +\samstrictrule{Unmapped reads should not have FLAG 0x10 set}{Unmapped reads should be stored in the orientation in which they came off the sequencing machine and have 0x10 unset.}{\vonefivebestpractice} + +\subsection{RNAME} +\samstrictrule{Malformed RNAME}{RNAME must conform to the regex {\tt \char92*|[!-()+-\char60\char62-\char126][!-\char126]*}}{\samrule} +\samstrictrule{RNAME not present in reference}{RNAME must be equal to the value of one of the SQ SN values defined in the header.}{} +\samstrictrule{RNAME contains character not supported by VCFv4.3}{RNAME must not contain any of the following characters: {\tt \char60\char62\char91\char93\char58\char42}}{\vcffourthree} +\samstrictrule{RNAME not supported by VCFv4.3}{RNAME must not be one of {\tt DEL}, {\tt INS}, {\tt DUP}, {\tt INV}, {\tt CNV}, or {\tt BND}.}{\vcffourthree} +\samstrictrule{RNAME does not match mate}{For an unmapped paired-end or mate-pair read whose mate is mapped, the unmapped read should have RNAME identical to its mate.}{\vonefivebestpractice} +\samstrictrule{RNAME specified for unmapped
template}{If all segments in a template are unmapped, their RNAME should be set as *.}{\vonefivebestpractice} + + +\subsection{POS} +\samstrictrule{Record placed outside of reference sequence}{If FLAG 0x4 is not set, POS must be between 1 and the length of the RNAME reference sequence inclusive. The length of the RNAME reference sequence can be found in the SQ header LN tag value for the SQ header with an SN tag matching the RNAME.}{\vonefivebestpractice} +\samstrictrule{Invalid POS}{POS cannot be 0 if FLAG 0x4 is not set.}{} +\samstrictrule{Invalid POS}{POS cannot be negative.}{\samrule} +\samstrictrule{Invalid POS}{POS cannot be greater than 2147483647.}{\samrule} +\samstrictrule{POS specified without RNAME}{If RNAME is *, POS must be 0.}{} +\samstrictrule{POS does not match mate.}{For an unmapped paired-end or mate-pair read whose mate is mapped, the unmapped read should have POS identical to its mate.}{\vonefivebestpractice} +\samstrictrule{POS specified for unmapped template}{If all segments in a template are unmapped, their POS should be set as 0.}{\vonefivebestpractice} + +\subsection{MAPQ} +\samstrictrule{Invalid MAPQ}{MAPQ must be between 0 and 255 inclusive.}{\samrule} +\samstrictrule{Missing MAPQ}{Aligned reads should not have a MAPQ of 255.}{\vonefivebestpractice} + +\subsection{CIGAR} +\samstrictrule{Invalid CIGAR}{All CIGAR strings must conform to the regex {\tt \char92*|([0-9]+[MIDNSHPX=])+}}{\samrule} +\samstrictrule{Empty CIGAR}{All CIGAR strings must have at least one CIGAR operator.}{} +\samstrictrule{Zero length CIGAR operator}{All CIGAR operators must have a non-zero positive length.}{} +\samstrictrule{CIGAR contains operator repeat}{All adjacent CIGAR operators must be different.}{\vonefivebestpractice} +\samstrictrule{CIGAR does not contain any mapped bases}{All CIGARs must include a CIGAR operator that consumes a reference base.}{\samrule} + {\tt Should we allow alignments with zero mapped bases?
Seven Bridges has a graph-based aligner that will +emit CIGARs such as 100I for alignments that align to a known insertion that is not included in the reference. Useful for local assembly but technically violates the SAM specifications.} +\samstrictrule{Incorrect CIGAR length}{Sum of lengths of the M/I/S/=/X operations must equal the length of SEQ when both CIGAR and SEQ are available.}{\samrule} +\samstrictrule{Invalid CIGAR hard clip}{H must only be present as the first and/or last operation.}{\samrule} +\samstrictrule{Invalid CIGAR soft clip}{S operations must only have H operations between them and the ends of the CIGAR string.}{\samrule} +\samstrictrule{CIGAR overhangs reference sequence}{POS plus the number of reference bases consumed by the CIGAR, minus one, must not exceed the length of the RNAME reference sequence.}{\vonefivebestpractice} +\samstrictrule{Inconsistent CIGAR read lengths}{All mapped alignments for a given segment must have matching read lengths. That is, the sum of lengths of the M/I/S/=/X/H operations must be equal.}{} + +\samstrictrule{TODO: Unusual indel positioning}{TODO: should we disallow I/D operators at the ends of reads?
There was some ambiguity as to how deletions interacted with POS but I think the spec has been updated in favour of the BWA interpretation since that discussion.}{} + +\subsection{RNEXT} +\samstrictrule{Invalid RNEXT}{If the template contains one segment, RNEXT must be *.}{} +\samstrictrule{RNEXT not present in reference}{RNEXT must be equal to the value of one of the SQ SN values defined in the header.}{} +\samstrictrule{Invalid RNEXT}{If the primary alignment of the next read in the template is mapped, RNEXT must not be {\tt *}}{\samrule} +\samstrictrule{RNEXT not using =}{If the primary alignment of the next read in the template is mapped to the same reference sequence, RNEXT must be {\tt =}}{\samrule} +\samstrictrule{Incorrect RNEXT}{If this read is unmapped or the primary alignment of the next read in the template is mapped to a different reference sequence, RNEXT must match the RNAME of the next read.}{} + +\subsection{PNEXT} +\samstrictrule{Invalid PNEXT}{If the template contains one segment, PNEXT must be 0.}{} +\samstrictrule{Incorrect PNEXT}{If the primary alignment of the next read in the template is unmapped, PNEXT must be 0.}{} +\samstrictrule{Incorrect PNEXT}{If the primary alignment of the next read in the template is mapped, PNEXT must match the POS of that record.}{} + +\subsection{TLEN} +\samstrictrule{TLEN out of range}{TLEN cannot be greater than 2147483647.}{\samrule} +\samstrictrule{TLEN out of range}{TLEN cannot be less than -2147483647.}{\samrule} +\samstrictrule{Invalid TLEN}{TLEN must be 0 if flag 0x1 is not set.}{\samrule} + +\subsection{SEQ} +\samstrictrule{Inconsistent SEQ read lengths}{All alignments of a given segment must have consistent SEQ lengths. That is, for all non-* SEQ, SEQ + length of CIGAR hard clip must be equal. }{} +\samstrictrule{Inconsistent SEQ sequences}{All alignments of a given segment with non-* SEQ must have consistent base calls. A base cannot be called an A in one record, but a T in another.
Note that to determine the read base, both the 0x10 FLAG and any hard clipping CIGAR operators need to be taken into account.}{} +\samstrictrule{SEQ of secondary alignments specified.}{SEQ of secondary alignments (0x100 FLAG set) should be set to * to reduce the file size.}{\vonefivebestpractice} +\samstrictrule{Invalid sequence base}{Unless SEQ is {\tt *}, SEQ read bases must be one of the following characters: {\tt acmgrsvtwyhkdbnACMGRSVTWYHKDBN}}{} +\samstrictrule{SEQ does not match reference when CIGAR indicates match.}{Unless SEQ is {\tt *}, read bases with CIGAR operator = must match the reference base. Bases are considered to match if the overlap between the possible read and reference bases (based on their IUPAC codes) is non-zero.}{} +\samstrictrule{SEQ matches reference when CIGAR indicates mismatch.}{Unless SEQ is {\tt *}, read bases with CIGAR operator X must not match the reference base. Bases are considered to match only if, when ignoring case, the reference and read bases are the same character and the character is one of the following characters: {\tt acgtACGT}.}{} + +\subsection{QUAL} +\samstrictrule{QUAL specified without SEQ}{QUAL must be * if SEQ is *.}{\samrule} +\samstrictrule{SEQ QUAL length mismatch.}{The length of a non-* QUAL must match the length of SEQ.}{\samrule} +\samstrictrule{Invalid QUAL}{The ASCII value of all QUAL bases must be at least 33.}{} +\samstrictrule{QUAL of secondary alignments specified.}{QUAL of secondary alignments (0x100 FLAG set) should be set to * to reduce the file size.}{\vonefivebestpractice} +\samstrictrule{TODO: QUAL edge case}{What should we do when a read is length 1 and the QUAL encodes to {\tt *}?}{\samrule} +\samstrictrule{Inconsistent QUAL scores}{All alignments with non-* QUAL of a given segment must have consistent base quality scores.
Note that to determine the base quality, both the 0x10 FLAG and any hard clipping CIGAR operators need to be taken into account.}{} + +\section{SAM Tags} +\subsection{Tag format} +\samstrictrule{Malformed tag}{Tags must be a two-character string conforming to the following regex: {\tt /[A-Za-z][A-Za-z0-9]/}.}{\samrule} +\samstrictrule{Invalid tag type}{Tag type must be one of {\tt AifZHB}.}{\samrule} +\samstrictrule{Malformed A tag}{A tags must conform to the regex {\tt [!-\char126]}}{\samrule} +\samstrictrule{Malformed i tag}{i tags must conform to the regex {\tt [-+]?[0-9]+}}{\samrule} +\samstrictrule{Malformed f tag}{f tags must conform to the regex {\tt [-+]?[0-9]*\char92.?[0-9]+([eE][-+]?[0-9]+)?}}{\samrule} +\samstrictrule{Malformed Z tag}{Z tags must conform to the regex {\tt [\,\,\,!-\char126]*}}{\samrule} +\samstrictrule{Malformed H tag}{H tags must conform to the regex {\tt ([0-9A-F][0-9A-F])*}}{\samrule} +\samstrictrule{Malformed B tag}{B tags must conform to the regex {\tt [cCsSiIf](,[-+]?[0-9]*\char92.?[0-9]+([eE][-+]?[0-9]+)?)+}}{\samrule} +\samstrictrule{Non-integer value in integer array}{Type B tags starting with one of {\tt cCsSiI} must contain integer values.}{} +\samstrictrule{Tag array value out of bounds}{Type B tags must not contain values that are greater than the maximum or less than the minimum value representable by the specified prefix.}{} +\samstrictrule{Tag integer out of bounds for BAM representation}{Type i tag values must be within the range {\tt ~$[-2^{31},2^{32})$}}{\texttt{BAM}} +\samstrictrule{Unknown reserved tag}{ + No record can include any reserved tags not defined in the + {\sl Sequence Alignment/Map Optional Fields Specification}. + Non-standard tags must start with X, Y, Z or a lowercase letter as per the SAM specifications.
+}{} +\samstrictrule{Incorrect tag type}{The \textit{type} of all \textit{standard tags} must match the type +defined in the {\sl Sequence Alignment/Map Optional Fields Specification}.}{} +\samstrictrule{Duplicate tag}{A record must not contain the same tag more than once.}{\samrule} +\samstrictrule{Invalid tag MAPQ}{All mapping quality scores, including those defined in tags, must be within the range [0, 255]. +A value of 255 indicates that the mapping quality is not available and must only be used if the +mapping quality field is required. For example, a mapping quality field value is required for the mapq portion of the SA tag, but as the AM tag is optional, a mapping quality +field value is not required and the AM tag should be omitted entirely if a mapping quality is +not available.}{} + + +TODO: Add validators for all standard tags (basic SAM validation done). + + + + + + + +\subsection{RG} +When an RG tag appears anywhere in the alignment section, there should be a single corresponding +@RG line with matching ID tag in the header. {\vonefivebestpractice} +\subsection{PG} +When a PG tag appears anywhere in the alignment section, there should be a single corresponding +@PG line with matching ID tag in the header. {\vonefivebestpractice} + + +\paragraph{} + +All tag values must be consistent with the format +defined in the {\sl Sequence Alignment/Map Optional Fields Specification}. + +\subsection{SA} + +For the purpose of this section, an \textit{SA record set} is a set of SAM records +from a single \textit{read} which collectively represent a single \textit{chimeric alignment}. + +\samstrictrule{Missing SA tag}{All records with FLAG 0x800 set must have an SA tag defined.}{\samrule} +\samstrictrule{Missing non-supplementary chimeric alignment record}{Each chimeric alignment must have a record with FLAG 0x800 not set.}{} + +\paragraph{} + +All SA tag values must satisfy the SA tag regular expression +defined in the {\sl Sequence Alignment/Map Optional Fields Specification}.
+ +\paragraph{} + +All records referenced in the SA tag of a given record must have a SA tag defined. + +\paragraph{} + +All records referenced in the SA tag of a given record must include the given +record in their SA tag. + +\paragraph{} + +All records in a \textit{SA record set} with a FI tag defined must have the same FI tag value. + +TODO: which other tags? + +\paragraph{} + +All records referenced in the SA tag must exist with matching \textit{rname, pos, strand, CIGAR}. + +\paragraph{} + +The SA \textit{mapq} of all references to a given record in a \textit{SA record set} must +match the record mapping quality. + +\paragraph{} + +The SA \textit{NM} of all references to a given record in a \textit{SA record set} must +match the record \textit{NM} tag value. + +\paragraph{} + +All records except 1 in a \textit{SA record set} must have the 0x800 (supplementary alignment) FLAG bit set. + +\paragraph{} + +The first SA record in all supplementary alignment records must be the canonical non-supplementary alignment. + +\paragraph{} + +All records in a \textit{SA record set} must have the same 0x100 (secondary alignment) FLAG value. + +\paragraph{} + +All records in a \textit{SA record set} must have FLAG bit 0x4 (segment unmapped) not set. + +\paragraph{} + +All records in a \textit{SA record set} must align at least one read base that does not +overlap with any other alignments in the \textit{SA record set}. +That is, a chimeric alignment cannot contain superfluous alignment records. 
+ +\end{document} diff --git a/VCFstrict.tex b/VCFstrict.tex new file mode 100644 index 000000000..89db1a08c --- /dev/null +++ b/VCFstrict.tex @@ -0,0 +1,254 @@ +\documentclass[10pt]{article} +\usepackage[margin=1in]{geometry} +\usepackage{longtable} +\usepackage[pdfborder={0 0 0},hyperfootnotes=false]{hyperref} +\usepackage[title]{appendix} + +% #1: short error code +% #2: short error description +% #3: rule description +% #4: categories +% #5: additional explanatory text +\newcommand{\vcfstrictrule}[5]{ + \paragraph{#1} \emph{#2} #3 {#4} + #5 + \par +} +% Rule is part of the base VCF specifications +\newcommand{\vcfspec}{\texttt{VCF}} +\newcommand{\SPECISSUE}[1]{\paragraph{} #1} +\newcommand{\TODO}[1]{\paragraph{TODO} complete this section: #1} +% #1: Meta-information key +% #2: Missing field +\newcommand{\structuredheadermissingfield}[2]{ + \vcfstrictrule{mi.#1.#2.missing}{Missing meta-information \texttt{#1} \texttt{#2} key}{Missing \texttt{#2} field for \texttt{#1} meta-information line.}{\vcfspec}{} +} +\newcommand{\externalfilevalidation}[1]{ + \TODO{\texttt{#1}: Should checking the external URL be included? My preference is that it is not, and the validation should be entirely self-contained in the VCF.} +} +\newcommand{\phredoob}[2]{ + \vcfstrictrule{#1}{#2}{Phred-scaled fields must be greater than or equal to zero.}{}{} +} + +% Rule categories +% "global": multi-record validations +% "sv": +% "external": relies on an external file + +\begin{document} + +\input{VCFstrict.ver} +\title{VCF Strict Specification} +\author{Daniel L Cameron} +\date{\headdate} +\maketitle +\begin{quote}\small +The master version of this document can be found at +\url{https://github.com/samtools/hts-specs}.\\ +This printing is version~\commitdesc\ from that repository, +last modified on the date shown above. +\end{quote} +\vspace*{1em} + +\noindent +This document is a companion to the {\sl Variant Call Format Specification} that defines the VCF file format.
+\footnote{See \href{http://samtools.github.io/hts-specs/VCFv4.3.pdf}{\tt VCFv4.3.pdf} at \url{https://github.com/samtools/hts-specs}.} +The VCF file format defines the syntax required for a file to be a valid VCF file. +It does not require such files to be semantically valid and internally consistent. +This document describes a set of additional semantic restrictions; the subset of syntactically valid VCF files that comply with these restrictions can be described as \textit{VCF strict compliant}. + +\renewcommand{\abstractname}{Introduction} +\begin{abstract} + +The VCF specifications have been instrumental in standardising the file formats used for variant calling. +A large ecosystem of bioinformatics tools is now capable of reading and/or writing VCF files. +Unfortunately, many tools that read VCF files are tightly coupled to a particular upstream tool and fail to correctly execute on valid VCF files written by other tools. +In part, this is due to the lack of semantic restrictions inherent in the VCF file format. +A syntactically valid VCF file can be both internally inconsistent and semantically nonsensical. + +The purpose of this document is to provide a baseline of semantic validity with which tools should comply when outputting VCF files, and which tools that input VCF files can safely assume when they require input files to be \textit{VCF strict compliant}.
+ +\end{abstract} + +\section{Format} + +\vcfstrictrule{file.encoding}{Invalid file encoding}{File is not a valid UTF-8 file.}{\vcfspec}{} +\vcfstrictrule{file.newlines}{Inconsistent newlines}{File mixes LF and CR+LF line terminators.}{}{} +\vcfstrictrule{file.blankline}{Blank line}{File contains a blank line.}{}{} + +\section{Meta-information Lines} + + +\vcfstrictrule{mi.keyvalue.malformed}{Malformed Meta-information line}{Meta-information line is not of the form key=value.}{\vcfspec}{} +\vcfstrictrule{mi.key.malformed}{Invalid Meta-information key}{Meta-information line key must conform to the regex \texttt{[[:alpha:]]+}}{}{} + +\subsection{Structured fields} + +\vcfstrictrule{mi.structured.value.malformed}{Malformed structured meta-information}{Structured meta-information line value does not start with \texttt{<} and end with \texttt{>}.}{\vcfspec}{} +\vcfstrictrule{mi.structured.extrafield.position}{Incorrectly placed structured meta-information extra field}{Structured meta-information extra field located before a default field.}{\vcfspec}{} +\vcfstrictrule{mi.structured.extrafield.malformed}{Incorrectly typed structured meta-information extra field}{Structured meta-information extra field does not start and end with \texttt{"}.}{\vcfspec}{} +\vcfstrictrule{mi.structured.duplicated}{Duplicate structured meta-information line}{Multiple meta-information lines with the same key and \texttt{ID} found.}{\vcfspec}{} + +\SPECISSUE{What's the point of quotes in structured header fields?
Just so they can contain commas?} + +\subsection{fileformat} +\vcfstrictrule{mi.fileformat.missing}{Missing fileformat}{fileformat meta-information line is missing.}{\vcfspec}{} +\vcfstrictrule{mi.fileformat.position}{fileformat not first}{fileformat meta-information line is not the first line.}{\vcfspec}{} +\vcfstrictrule{mi.fileformat.invalid}{Malformed fileformat}{fileformat value is not one of \texttt{VCFv4.1}}{\texttt{VCFv4.1}}{} +\vcfstrictrule{mi.fileformat.invalid}{Malformed fileformat}{fileformat value is not one of \texttt{VCFv4.2}}{\texttt{VCFv4.2}}{} +\vcfstrictrule{mi.fileformat.invalid}{Malformed fileformat}{fileformat value is not one of \texttt{VCFv4.3}}{\texttt{VCFv4.3}}{} +\vcfstrictrule{mi.fileformat.invalid}{Malformed fileformat}{fileformat value is not one of \texttt{VCFv4.4}}{\texttt{VCFv4.4}}{} +\vcfstrictrule{old.rule}{Malformed fileformat}{fileformat value is not one of \texttt{VCFv4.4}}{\texttt{VCFv4.1}, \texttt{VCFv4.2}}{} + + +\subsection{INFO} + +\structuredheadermissingfield{INFO}{ID} +\structuredheadermissingfield{INFO}{Number} +\structuredheadermissingfield{INFO}{Type} +\structuredheadermissingfield{INFO}{Description} + +\vcfstrictrule{mi.INFO.ID.malformed}{Malformed meta-information \texttt{INFO} \texttt{ID} field}{INFO ID field does not match the regex \texttt{\char94([A-Za-z\_][0-9A-Za-z\_.]*|1000G)\$}}{\vcfspec}{} +\vcfstrictrule{mi.INFO.Number.malformed}{Malformed meta-information \texttt{INFO} \texttt{Number} field}{INFO Number field is not a non-negative integer, \texttt{A}, \texttt{R}, \texttt{G}, or \texttt{.}.}{\vcfspec}{} +\TODO{Check fields are valid for each VCF version} +\vcfstrictrule{mi.INFO.Type.malformed}{Malformed meta-information \texttt{INFO} \texttt{Type} field}{INFO Type field is not one of \texttt{Integer}, \texttt{Float}, \texttt{Flag}, \texttt{Character}, \texttt{String}.}{\vcfspec}{} + +\subsection{FILTER} + +\structuredheadermissingfield{FILTER}{ID} +\structuredheadermissingfield{FILTER}{Description} + +\subsection{FORMAT} + +\structuredheadermissingfield{FORMAT}{ID} +\structuredheadermissingfield{FORMAT}{Number} +\structuredheadermissingfield{FORMAT}{Type}
+\structuredheadermissingfield{FORMAT}{Description} + +\vcfstrictrule{mi.FORMAT.ID.malformed}{Malformed meta-information \tt{FORMAT} \tt{ID} field}{FORMAT ID field does not match the regex \tt{\^[A-Za-z\_][0-9A-Za-z\_.]*\$}}{\vcfspec}{} +\vcfstrictrule{mi.FORMAT.Number.malformed}{Malformed meta-information \tt{FORMAT} \tt{Number} field}{FORMAT Number field is not a positive integer, \tt{A}, \tt{R}, \tt{G}, or \tt{.}.}{\vcfspec}{} +\TODO{Check fields are valid for each VCF version} +\vcfstrictrule{mi.FORMAT.Type.malformed}{Malformed meta-information \tt{FORMAT} \tt{Type} field}{FORMAT Type field is not one of \tt{Integer}, \tt{Float}, \tt{Character}, \tt{String}.}{\vcfspec}{} + + +\subsection{ALT} + +\structuredheadermissingfield{ALT}{ID} +\structuredheadermissingfield{ALT}{Description} + +\SPECISSUE{CNV, BND are valid 3-base IUPAC codes. Very bad. DUP also problematic for RNA} +\SPECISSUE{Why are IUPAC codes here? Seems like a bad idea to have to define every possible IUPAC indel used} +\SPECISSUE{BND is not actually a valid ALT allele.} +\SPECISSUE{DUP/DEL is defined as SVCLAIM=CN} + +\subsection{assembly} +\TODO{Should checking the URL be included? My preference is that it is not and the validation should be entirely self-contained in the VCF} +\externalfilevalidation{assembly.missingfile} + +\SPECISSUE{What happens if there are multiple assembly files specified?} +\SPECISSUE{Why must the assembly file be a fasta file? GRIDSS uses a BAM file for breakpoint assembly contigs.} +\SPECISSUE{This is defined as a breakpoint assembly file, but 1.6.1.1 refers to it directly. Is this an inconsistent double-use of this header field?} + +\vcfstrictrule{mi.assembly.contig.reserved}{Assembly contig name is reserved.}{ The assembly file contains a reserved contig name.}{\vcfspec}{ +Reserved contig names are contigs named, or containing a colon and starting with, any of \tt{DEL}, \tt{DUP}, \tt{INV}, \tt{INS}, \tt{CNV}, \tt{*}. 
+} + +\subsection{contig} + +\structuredheadermissingfield{contig}{ID} +\structuredheadermissingfield{contig}{length} % Should not be in \vcfspec category since it's not required by the specs +\vcfstrictrule{mi.contig.ID.malformed}{Malformed meta-information \tt{contig} \tt{ID} field}{contig ID field does not match the regex \tt{[0-9A-Za-z!\#\$\%\&+./:;?@\^\_|\~-][0-9A-Za-z!\#\$\%\&*+./:;=?@\^\_|\~-]*}.}{\vcfspec}{} +\vcfstrictrule{mi.contig.length.malformed}{Malformed meta-information \tt{contig} \tt{length} field}{contig length field is not an integer.}{\vcfspec}{} +\vcfstrictrule{mi.contig.length.outofbounds}{Meta-information \tt{contig} \tt{length} field out of bounds.}{Out of bounds contig length field. Minimum value is 0. Maximum value is 2,147,483,647 ($2^{31}-1$). }{\bcf}{ +BCF encodes position using a signed 32 bit integer. +} +\externalfilevalidation{mi.contig.url} + +\subsection{SAMPLE / META / PEDIGREE } + +\SPECISSUE{These aren't specified nearly well enough.} +\SPECISSUE{META and SAMPLE are not defined as structured fields in s1.4.0} + +\subsection{pedigreeDB } + +\externalfilevalidation{mi.pedigreeDB} + +\section{Header} + +\vcfstrictrule{header.sampleID.duplicate}{Duplicate sample ID}{Duplicated sample ID found.}{\vcfspec}{} +\vcfstrictrule{header.sampleID.empty}{Empty sample ID}{Header sample ID must be at least 1 character in length.}{}{} + +\section{Data lines} + +\vcfstrictrule{line.length.mismatch}{Mismatching sample count}{The number of samples for which genotype information has been provided does not match the number of samples defined in the header.}{}{} +\vcfstrictrule{line.CHROM.grouped}{CHROM ungrouped}{Records are not grouped by CHROM.}{\vcfspec}{} +\vcfstrictrule{line.POS.outoforder}{POS unsorted}{Records grouped by CHROM are not in ascending order by POS.}{}{} +\vcfstrictrule{line.CHROM.outoforder}{CHROM unsorted}{CHROM ordering does not match the order of the meta-information contig records.}{}{} + +\subsection{CHROM} + 
+\vcfstrictrule{CHROM.missing}{Missing contig}{ No \#\#contig meta-information line found for this record. Does not apply to angle-bracketed ID Strings. }{}{} +\vcfstrictrule{CHROM.assembly.assembly.missing}{Missing assembly file}{ No assembly file specified using \#\#assembly. Applies only to angle-bracketed ID Strings. }{\vcfspec}{} +\vcfstrictrule{CHROM.assembly.missing}{Missing assembly contig}{ Assembly file contig identifier not found in the assembly file. Applies only to angle-bracketed ID Strings. }{\vcfspec}{} + +\subsection{POS} + +\vcfstrictrule{POS.outofbounds}{POS out of bounds }{ Value out of representable bounds. Minimum value is 0. Maximum value is 2,147,483,647 ($2^{31}-1$). }{\vcfspec}{} +\vcfstrictrule{POS.contig.outofbounds}{POS exceeds contig length. }{ POS must be less than, or equal to, the contig length + 1 }{}{} +\vcfstrictrule{POS.telomere.nonbnd}{Telomeric records must be BND }{ Telomeric records with POS of 0 or contig length + 1 must be BND symbolic alleles.}{}{} + +\subsection{ID} + +\vcfstrictrule{ID.duplicate}{ Duplicate ID }{ One or more of the semi-colon separated IDs in this field is not unique. }{\vcfspec}{} +\SPECISSUE{VCF merging problematic if different VCFs have different INFO.} + +\subsection{REF} + +\vcfstrictrule{REF.malformed}{ Invalid REF }{ REF does not match the regex [ACGTNacgtn]+. }{\vcfspec}{} +\vcfstrictrule{REF.reference.mismatch}{REF does not match reference }{ REF does not match reference genome sequence. }{}{} + +\subsection{ALT} + +\vcfstrictrule{ALT.malformed}{ Invalid ALT }{ ALT does not match the regex [ACGTNacgtn]+, \tt{.}, \tt{*}, a breakpoint string, a single breakend string, or a symbolic allele. }{\vcfspec}{} +\SPECISSUE{Is "ACT,." a valid ALT? The definition of QUAL could be read to mean that . must be the only ALT if it is supplied.} +\SPECISSUE{Single breakends are not explicitly required to have at least one base - "." 
could be interpreted as a single breakend.} +\vcfstrictrule{ALT.duplicate}{ Duplicated ALT }{ ALT alleles are not unique within this record. }{}{} +\vcfstrictrule{ALT.breakpoint.POS.outofbounds }{ The breakpoint is out of bounds. }{ The position of the other side of the breakpoint is greater than the relevant contig length + 1}{}{} +\vcfstrictrule{ALT.breakpoint.CHROM.missing }{ The breakpoint contig is not valid. }{ No \#\#contig meta-information line found for this record, and, if angle bracketed, is not found in the assembly file. }{}{} +\vcfstrictrule{ALT.breakpoint.telomere.orientation }{The breakpoint orientation is invalid.}{ If the breakpoint position is 0 or contig length + 1, the breakpoint orientation must be towards the telomere. }{}{} + +\subsection{QUAL} + +\phredoob{QUAL.outofbounds}{QUAL out of bounds} + +\subsection{FILTER} + +\vcfstrictrule{FILTER.malformed}{Malformed FILTER}{ Filter cannot be \tt{0}.}{\vcfspec}{} +\vcfstrictrule{FILTER.empty}{Empty FILTER}{ Filter cannot be the empty string.}{}{} +\vcfstrictrule{FILTER.missing}{Invalid FILTER}{ Missing FILTER meta-information line. }{}{} +\vcfstrictrule{FILTER.missingvalue.notsolo}{FILTER MISSING value must be the only value.}{ No other values can be present if the MISSING value is present.}{}{} +\vcfstrictrule{FILTER.duplicate}{Duplicated FILTER}{ FILTER is not unique within this record. 
}{}{} + +\subsection{INFO} + +- Duplicate tags +- + +\section{Semantics} + +\subsection { Breakpoints } +\vcfstrictrule{breakpoint.MATEID.missing}{Breakpoint MATEID required}{Breakpoint record must have MATEID specified.}{}{} +\vcfstrictrule{breakpoint.MATEID.malformed}{Malformed MATEID}{MATEID cannot be the MISSING value.}{}{} +\vcfstrictrule{breakpoint.ID.missing}{Breakpoint ID missing}{Breakpoint ID cannot be the MISSING value \tt{.}.}{}{} +\vcfstrictrule{breakpoint.mate.missing}{Missing breakpoint mate}{A breakpoint record with ID matching MATEID must exist.}{}{} +\vcfstrictrule{breakpoint.POS.mismatch}{Breakpoint POS mismatch}{The POS of the matching breakpoint record does not match the position in the ALT field}{}{} +\vcfstrictrule{breakpoint.CHROM.mismatch}{Breakpoint CHROM mismatch}{The CHROM of the matching breakpoint record does not match the contig in the ALT field}{}{} +\vcfstrictrule{breakpoint.HOMLEN.mismatch}{Breakpoint HOMLEN mismatch}{The HOMLEN of the matching breakpoint record does not match the HOMLEN of this record.}{}{} +\vcfstrictrule{breakpoint.HOMSEQ.mismatch}{Breakpoint HOMSEQ mismatch}{After adjusting for breakend orientations, the HOMSEQ of the matching breakpoint record does not match the HOMSEQ of this record.}{}{} +\vcfstrictrule{breakpoint.CIPOS.mismatch}{Breakpoint CIPOS mismatch}{After adjusting for breakend orientations, the CIPOS of the matching breakpoint record does not match the CIPOS of this record.}{}{ +Note: this rule only applies to events that are not IMPRECISE. +A breakpoint can validly have different confidence intervals on either side of a breakpoint. +For example, a breakpoint into a poly-A stretch of indeterminate length can have the position known exactly on one side, but a wide CIPOS on the side with a poly-A reference sequence. +} + +\end{document}