Skip to content

Commit

Permalink
Add MZ:i tag as a check for base modification validity. (samtools#1590)
Browse files Browse the repository at this point in the history
If a sequence is hard-clipped after calling the base modifications,
then the tool may, or may not, update the MM and ML tags accordingly.
We have no way of distinguishing these two cases.  While the base
modification parsing code already detects overflows where the
coordinates go beyond the sequence end, this isn't fool proof,
especially if the clipping is short.

So instead we have an (as yet unwritten) proposal of MZ:i tag holding
the sequence length, to be written at the same time as the MM and ML
tags.  This can then be used as a sanity check later on, to detect
cases where the sequence has changed length via a tool that is unaware
of base modifications.

TODO: as a separate PR, we should add a new API that can trim bases
off the start/end of MM/ML strings to make it trivial for tools that
are doing hard clipping via htslib.  (Indeed we don't even have an API
for SEQ/QUAL either, so it can do all together).  This would make it
far easier for people to keep everything in sync, and this code could
then also update MZ while it's at it.  That's new API though so it can
arrive as a separate commit.

See samtools/hts-specs#646
  • Loading branch information
jkbonfield authored and vasudeva8 committed May 30, 2023
1 parent 7016fd3 commit 7ee1324
Show file tree
Hide file tree
Showing 8 changed files with 49 additions and 12 deletions.
30 changes: 22 additions & 8 deletions sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -6230,14 +6230,24 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
if (!mm)
return 0;
if (mm[0] != 'Z') {
hts_log_error("MM tag is not of type Z");
hts_log_error("%s: MM tag is not of type Z", bam_get_qname(b));
return -1;
}

uint8_t *mi = bam_aux_get(b, "MZ");
if (mi && bam_aux2i(mi) != b->core.l_qseq) {
// bam_aux2i with set errno = EINVAL and return 0 if the tag
// isn't integer, but 0 will be a seq-length mismatch anyway so
// triggers an error here too.
hts_log_error("%s: MM/MZ data length is incompatible with"
" SEQ length", bam_get_qname(b));
return -1;
}

uint8_t *ml = bam_aux_get(b, "ML");
if (!ml) ml = bam_aux_get(b, "Ml");
if (ml && (ml[0] != 'B' || ml[1] != 'C')) {
hts_log_error("ML tag is not of type B,C");
hts_log_error("%s: ML tag is not of type B,C", bam_get_qname(b));
return -1;
}
uint8_t *ml_end = ml ? ml+6 + le_to_u32(ml+2) : NULL;
Expand Down Expand Up @@ -6321,7 +6331,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {

delta = strtol(cp, &cp_end, 10);
if (cp_end == cp) {
hts_log_error("Hit end of MM tag. Missing semicolon?");
hts_log_error("%s: Hit end of MM tag. Missing "
"semicolon?", bam_get_qname(b));
return -1;
}

Expand Down Expand Up @@ -6350,8 +6361,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
state->implicit [mod_num] = implicit;

if (delta < 0) {
hts_log_error("MM tag refers to bases beyond sequence "
"length");
hts_log_error("%s: MM tag refers to bases beyond sequence "
"length", bam_get_qname(b));
return -1;
}
state->MMcount [mod_num] = delta;
Expand All @@ -6366,7 +6377,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
}

if (++mod_num >= MAX_BASE_MOD) {
hts_log_error("Too many base modification types");
hts_log_error("%s: Too many base modification types",
bam_get_qname(b));
return -1;
}
ms++; n++;
Expand All @@ -6384,7 +6396,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
}
}
if (ml > ml_end) {
hts_log_error("Insufficient number of entries in ML tag");
hts_log_error("%s: Insufficient number of entries in ML "
"tag", bam_get_qname(b));
return -1;
}
} else {
Expand All @@ -6396,7 +6409,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
cp++;
}
if (!*cp) {
hts_log_error("Hit end of MM tag. Missing semicolon?");
hts_log_error("%s: Hit end of MM tag. Missing semicolon?",
bam_get_qname(b));
return -1;
}
}
Expand Down
5 changes: 5 additions & 0 deletions test/base_mods/MM-MZf1.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
@SQ SN:I LN:999
r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:37
r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215
r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240
r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:i:36
5 changes: 5 additions & 0 deletions test/base_mods/MM-MZf2.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
@SQ SN:I LN:999
r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36
r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215
r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240
r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:f:36
5 changes: 5 additions & 0 deletions test/base_mods/MM-MZp.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
@SQ SN:I LN:999
r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36
r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215
r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240
r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:i:36
2 changes: 1 addition & 1 deletion test/base_mods/MM-multi.sam
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
@CO r2 has them combined together, for example as produced by
@CO a joint basecaller which assigns probabilities to all
@CO trained events simultaneously.
r1 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240
r1 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36
r2 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+mh,2,2,0,0,4,1;N+n,15; Ml:B:C,77,159,103,133,128,108,154,82,179,57,204,31,240
1 change: 1 addition & 0 deletions test/base_mods/base-mods.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,6 @@ test_mod="../test_mod"
pileup_mod="../pileup_mod"

test_driver $@
rm _err.tmp _out.tmp

exit $?
6 changes: 6 additions & 0 deletions test/base_mods/base-mods.tst
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,9 @@ P MM-explicit-x.out $test_mod -x MM-explicit.sam
# Pileup testing
P MM-pileup.out $pileup_mod < MM-pileup.sam
P MM-pileup2.out $pileup_mod < MM-pileup2.sam

# Validation testing. We just care about exit status here, but the
# test data is a copy of MM-pileup.sam so that suffices too.
P MM-pileup.out $pileup_mod < MM-MZp.sam
F MM-pileup.out $pileup_mod < MM-MZf1.sam
F MM-pileup.out $pileup_mod < MM-MZf2.sam
7 changes: 4 additions & 3 deletions test/pileup_mod.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ void process_pileup(sam_hdr_t *h, const bam_pileup1_t *p,
// as each new read is added or removed from the pileups.
int pileup_cd_create(void *data, const bam1_t *b, bam_pileup_cd *cd) {
hts_base_mod_state *m = hts_base_mod_state_alloc();
bam_parse_basemod(b, m);
if (bam_parse_basemod(b, m) < 0)
return -1;
cd->p = m;
return 0;
}
Expand Down Expand Up @@ -201,7 +202,7 @@ int main(int argc, char **argv) {
bam_plp_destructor(iter, pileup_cd_destroy);

const bam_pileup1_t *p;
int tid, pos, n;
int tid, pos, n = 0;
while ((p = bam_plp_auto(iter, &tid, &pos, &n)) != 0) {
switch (compact) {
case 0:
Expand All @@ -221,5 +222,5 @@ int main(int argc, char **argv) {
bam_destroy1(b);
sam_hdr_destroy(h);

return 0;
return n != 0;
}

0 comments on commit 7ee1324

Please sign in to comment.