From 5f6bbdf39dd5f83209e965e17b2c77bbc8e9b1ce Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 31 May 2018 10:33:26 +0100 Subject: [PATCH 1/9] Work around colons in reference names (GRCh38 HLA). This extends the callback function used by the hts iterator code to the ref:start-end range parser, permitting the parser to validate whether the whole thing matches a reference name. This works around parsing conflicts when given range queries like "HLA-DRB1*12:17", which is either the whole of "HLA-DRB1*12:17" (this exists in GRCh38) or base 17 onwards of ""HLA-DRB1*12" (which does not). Note there are still some undecided questions here. Do we want to handle the special names used in the iterator at this level to? Eg "*" meaning unmapped data and "." meaning whole file? --- hts.c | 67 +++++++++++++++++++++++++++++++++++++--------------- htslib/hts.h | 19 +++++++++++++-- htslib/sam.h | 1 + sam.c | 5 ++++ 4 files changed, 71 insertions(+), 21 deletions(-) diff --git a/hts.c b/hts.c index bcacb4451..b648b11ba 100644 --- a/hts.c +++ b/hts.c @@ -2624,15 +2624,39 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) return (sign == '+')? n : -n; } -const char *hts_parse_reg(const char *s, int *beg, int *end) +/* + * A variant of hts_parse_reg which is reference-id aware. It uses + * the iterator name2id callbacks to validate the region tokenisation works. + * + * This is necessary due to GRCh38 HLA additions which have reference names + * like "HLA-DRB1*12:17". + * + * On success the end of the reference is returned (colon or end of string). + * On failure NULL is returned, and if tid/getid are supplied *tid will be -1. + */ +const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end, + hts_name2id_f getid, void *hdr) { - char *hyphen; + // FIXME: do we need to permit tid=-1 for reference "*" to indicate unmapped + // reads, and strictly have NULL as failure test? + int tid_, s_len = strlen(s); // int is sufficient given beg/end types + if (!tid) tid = &tid_; // simplifies code below + const char *colon = strrchr(s, ':'); if (colon == NULL) { *beg = 0; *end = INT_MAX; - return s + strlen(s); + *tid = getid ? getid(hdr, s) : 0; + return *tid >= 0 ? s + s_len : NULL; } + // Has a colon, but check whole name first + if (getid) { + *beg = 0; *end = INT_MAX; + if ((*tid = getid(hdr, s)) >= 0) + return s + s_len; + } + + char *hyphen; *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1; if (*beg < 0) *beg = 0; @@ -2641,37 +2665,42 @@ const char *hts_parse_reg(const char *s, int *beg, int *end) else return NULL; if (*beg >= *end) return NULL; + if (getid) { + kstring_t ks = { 0, 0, NULL }; + kputsn(s, colon-s, &ks); // convert to nul terminated string + if (!ks.s) { + *tid = -1; + return NULL; + } + *tid = getid(hdr, ks.s); + free(ks.s); + } else { + *tid = 0; + } return colon; } +const char *hts_parse_reg(const char *s, int *beg, int *end) +{ + return hts_parse_reg2(s, NULL, beg, end, NULL, NULL); +} + hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) { int tid, beg, end; - const char *q; if (strcmp(reg, ".") == 0) return itr_query(idx, HTS_IDX_START, 0, 0, readrec); else if (strcmp(reg, "*") == 0) return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec); - q = hts_parse_reg(reg, &beg, &end); - if (q) { - char tmp_a[1024], *tmp = tmp_a; - if (q - reg + 1 > 1024) - if (!(tmp = malloc(q - reg + 1))) - return NULL; - strncpy(tmp, reg, q - reg); - tmp[q - reg] = 0; - tid = getid(hdr, tmp); - if (tmp != tmp_a) - free(tmp); - } - else { - // not parsable as a region, but possibly a sequence named "foo:a" - tid = getid(hdr, reg); + if ((tid = getid(hdr, reg)) >= 0) { beg = 0; end = INT_MAX; + return itr_query(idx, tid, beg, end, readrec); } + hts_parse_reg2(reg, &tid, &beg, &end, getid, hdr); + if (tid < 0) return NULL; return itr_query(idx, tid, beg, end, readrec); } diff --git a/htslib/hts.h b/htslib/hts.h index c3c1c3e9f..07953f365 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -791,8 +791,25 @@ long long hts_parse_decimal(const char *str, char **strend, int flags); @return Pointer to the colon or '\0' after the reference sequence name, or NULL if @a str could not be parsed. */ + +typedef int (*hts_name2id_f)(void*, const char*); +typedef const char *(*hts_id2name_f)(void*, int); const char *hts_parse_reg(const char *str, int *beg, int *end); +/// Parse a "CHR:START-END"-style region string +/** @param str String to be parsed + @param tid Set on return (if not NULL) to be reference index (-1 if invalid) + @param beg Set on return to the 0-based start of the region + @param end Set on return to the 1-based end of the region + @param getid Function pointer. Called if not NULL to set tid. + @param hdr Caller data passed to getid. + @return Pointer to the colon or '\0' after the reference sequence name, + or NULL if @a str could not be parsed. +*/ +const char *hts_parse_reg2(const char *str, int *tid, int *beg, int *end, + hts_name2id_f getid, void *hdr); + + /////////////////////////////////////////////////////////// // Generic iterators // @@ -818,8 +835,6 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_re */ void hts_itr_destroy(hts_itr_t *iter); -typedef int (*hts_name2id_f)(void*, const char*); -typedef const char *(*hts_id2name_f)(void*, int); typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec); /// Create a single-region iterator from a text region specification diff --git a/htslib/sam.h b/htslib/sam.h index ea7baa909..1e54f31df 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -281,6 +281,7 @@ typedef struct { int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) HTS_RESULT_USED; void bam_hdr_destroy(bam_hdr_t *h); int bam_name2id(bam_hdr_t *h, const char *ref); + const char *sam_parse_reg(bam_hdr_t *h, const char *s, int *tid, int *beg, int *end); bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0); bam1_t *bam_init1(void); diff --git a/sam.c b/sam.c index 5bc6facc1..c81aaf9da 100644 --- a/sam.c +++ b/sam.c @@ -298,6 +298,11 @@ int bam_name2id(bam_hdr_t *h, const char *ref) return k == kh_end(d)? -1 : kh_val(d, k); } +const char *sam_parse_reg(bam_hdr_t *h, const char *s, int *tid, int *beg, int *end) { + // FIXME: do we need to also use cram_name2id? + return hts_parse_reg2(s, tid, beg, end, (hts_name2id_f)bam_name2id, h); +} + /************************* *** BAM alignment I/O *** *************************/ From 1bb085a6ce2f12936fe40f86e92dd01b9667104e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 1 Jun 2018 10:02:27 +0100 Subject: [PATCH 2/9] Now region parsing fails on ambiguous cases. Also improved return values so NULL is primary way of detecting failure rather than tid. This is more in line with the old hts_parse_reg code. See https://github.com/samtools/hts-specs/issues/124#issuecomment-393777566 for heuristic suggestions. --- hts.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/hts.c b/hts.c index b648b11ba..f13402420 100644 --- a/hts.c +++ b/hts.c @@ -2631,8 +2631,13 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) * This is necessary due to GRCh38 HLA additions which have reference names * like "HLA-DRB1*12:17". * - * On success the end of the reference is returned (colon or end of string). - * On failure NULL is returned, and if tid/getid are supplied *tid will be -1. + * getid is optional and may be passed in as NULL. If given it is used to + * validate the reference name exists and is unambiguously parseable. If not + * given the best guess will be made but no has guarantees in validity. + * + * On success the end of the reference is returned (colon or end of string) + * beg/end will be set, plus tid if getid has been supplied. + * On failure NULL is returned. */ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end, hts_name2id_f getid, void *hdr) @@ -2642,6 +2647,7 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end, int tid_, s_len = strlen(s); // int is sufficient given beg/end types if (!tid) tid = &tid_; // simplifies code below + // No colon implies entirety of the reference const char *colon = strrchr(s, ':'); if (colon == NULL) { *beg = 0; *end = INT_MAX; @@ -2649,11 +2655,29 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end, return *tid >= 0 ? s + s_len : NULL; } - // Has a colon, but check whole name first + // Has a colon, but check whole name first. if (getid) { *beg = 0; *end = INT_MAX; - if ((*tid = getid(hdr, s)) >= 0) + if ((*tid = getid(hdr, s)) >= 0) { + // Entire name matches, but also check this isn't + // ambiguous. eg we have ref chr1 and ref chr1:100-200 + // both present. + kstring_t ks = { 0, 0, NULL }; + kputsn(s, colon-s, &ks); // convert to nul terminated string + if (!ks.s) { + *tid = -1; + return NULL; + } + if (getid(hdr, ks.s) >= 0) { + free(ks.s); + *tid = -1; + hts_log_error("Range %s is ambiguous", s); + return NULL; + } + free(ks.s); + return s + s_len; + } } char *hyphen; @@ -2674,6 +2698,8 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end, } *tid = getid(hdr, ks.s); free(ks.s); + if (*tid < 0) + return NULL; } else { *tid = 0; } @@ -2694,14 +2720,9 @@ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f g else if (strcmp(reg, "*") == 0) return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec); - if ((tid = getid(hdr, reg)) >= 0) { - beg = 0; end = INT_MAX; - return itr_query(idx, tid, beg, end, readrec); - } - - hts_parse_reg2(reg, &tid, &beg, &end, getid, hdr); + if (!hts_parse_reg2(reg, &tid, &beg, &end, getid, hdr)) + return NULL; - if (tid < 0) return NULL; return itr_query(idx, tid, beg, end, readrec); } From f1a6a3f06383366c62817b0d18db7a75a409a7a7 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 1 Jun 2018 11:35:12 +0100 Subject: [PATCH 3/9] Added support for brace-quoting of reference names. Eg with contigs named "chr1" and "chr1:100-200" we can specify "{chr1}:100-200" and "{chr1:100-200}" to disambiguate. --- hts.c | 47 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/hts.c b/hts.c index f13402420..f451c1ee2 100644 --- a/hts.c +++ b/hts.c @@ -2635,6 +2635,10 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) * validate the reference name exists and is unambiguously parseable. If not * given the best guess will be made but no has guarantees in validity. * + * To work around these issues quoting is also permitted via {ref}:start-end. + * In this case, the return value will point to '}' and not the end of the + * reference (but this is a useful indication that it started with '{'). + * * On success the end of the reference is returned (colon or end of string) * beg/end will be set, plus tid if getid has been supplied. * On failure NULL is returned. @@ -2647,16 +2651,45 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end, int tid_, s_len = strlen(s); // int is sufficient given beg/end types if (!tid) tid = &tid_; // simplifies code below - // No colon implies entirety of the reference - const char *colon = strrchr(s, ':'); + const char *colon = NULL; + int quoted = 0; + + // Braced quoting of references is permitted to resolve ambiguities. + if (*s == '{') { + const char *close = strrchr(s, '}'); + if (!close) { + hts_log_error("Mismatching braces in \"%s\"", s); + return NULL; + } + s++; + s_len--; + if (close[1] == ':') + colon = close+1; + quoted = 1; // number of trailing characters to trim + } else { + colon = strrchr(s, ':'); + } + if (colon == NULL) { *beg = 0; *end = INT_MAX; - *tid = getid ? getid(hdr, s) : 0; + if (getid) { + kstring_t ks = { 0, 0, NULL }; + kputsn(s, s_len-quoted, &ks); // convert to nul terminated string + if (!ks.s) { + *tid = -1; + return NULL; + } + + *tid = getid(hdr, ks.s); + free(ks.s); + } else { + *tid = 0; + } return *tid >= 0 ? s + s_len : NULL; } // Has a colon, but check whole name first. - if (getid) { + if (!quoted && getid) { *beg = 0; *end = INT_MAX; if ((*tid = getid(hdr, s)) >= 0) { // Entire name matches, but also check this isn't @@ -2671,7 +2704,9 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end, if (getid(hdr, ks.s) >= 0) { free(ks.s); *tid = -1; - hts_log_error("Range %s is ambiguous", s); + hts_log_error("Range is ambiguous. " + "Use {%s} or {%.*s}%s instead", + s, (int)(colon-s), s, colon); return NULL; } free(ks.s); @@ -2691,7 +2726,7 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end, if (*beg >= *end) return NULL; if (getid) { kstring_t ks = { 0, 0, NULL }; - kputsn(s, colon-s, &ks); // convert to nul terminated string + kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string if (!ks.s) { *tid = -1; return NULL; From d224a135cb9d94567c36358640cf2fa83192e7db Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 7 Jun 2018 11:50:49 +0100 Subject: [PATCH 4/9] Major rewrite of hts_parse_region. The old function (hts_parse_reg) has been put back, meaning the new API can now have the getid func pointer as a mandatory requirement. Added tests. Tweaked sam_parse_region to have the flags parameter. This is still required as bcftools and samtools use different parameters (parsing style is tool based rather than file format based). --- Makefile | 8 +- hts.c | 224 +++++++++++++++++++++++++++++++----------- htslib/hts.h | 13 ++- htslib/sam.h | 2 +- sam.c | 5 +- test/colons.bam | Bin 0 -> 268 bytes test/colons.bam.bai | Bin 0 -> 424 bytes test/test-parse-reg.c | 204 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 392 insertions(+), 64 deletions(-) create mode 100644 test/colons.bam create mode 100644 test/colons.bam.bai create mode 100644 test/test-parse-reg.c diff --git a/Makefile b/Makefile index 19c0579ad..b3e72e848 100644 --- a/Makefile +++ b/Makefile @@ -82,7 +82,8 @@ BUILT_TEST_PROGRAMS = \ test/test-vcf-sweep \ test/test-bcf-sr \ test/fuzz/hts_open_fuzzer.o \ - test/test-bcf-translate + test/test-bcf-translate \ + test/test-parse-reg BUILT_THRASH_PROGRAMS = \ test/thrash_threads1 \ @@ -377,6 +378,7 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) test/fieldarith test/fieldarith.sam test/hfile test/test_bgzf test/bgziptest.txt + test/test-parse-reg -t test/colons.bam cd test/tabix && ./test-tabix.sh tabix.tst cd test/mpileup && ./test-pileup.sh mpileup.tst REF_PATH=: test/sam test/ce.fa test/faidx.fa test/fastqs.fq @@ -413,6 +415,9 @@ test/test_realn: test/test_realn.o libhts.a test/test-regidx: test/test-regidx.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test-regidx.o libhts.a $(LIBS) -lpthread +test/test-parse-reg: test/test-parse-reg.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test-parse-reg.o libhts.a $(LIBS) -lpthread + test/test_view: test/test_view.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LIBS) -lpthread @@ -439,6 +444,7 @@ test/pileup.o: test/pileup.c config.h $(htslib_sam_h) $(htslib_kstring_h) test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hfile_internal_h) test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h) +test/test-parse-reg.o: test/test-parse-reg.c $(htslib_hts_h) $(htslib_sam_h) test/test-realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) test/test-regidx.o: test/test-regidx.c config.h $(htslib_regidx_h) $(hts_internal_h) test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) diff --git a/hts.c b/hts.c index f451c1ee2..0aa6ec1e7 100644 --- a/hts.c +++ b/hts.c @@ -2607,6 +2607,12 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) if (esign == '-') e = -e; } + switch (*s) { + case 'k': case 'K': e += 3; s++; break; + case 'm': case 'M': e += 6; s++; break; + case 'g': case 'G': e += 9; s++; break; + } + e -= decimals; while (e > 0) n *= 10, e--; while (e < 0) lost += n % 10, n /= 10, e++; @@ -2618,12 +2624,24 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) if (strend) { *strend = (char *)s; } else if (*s) { - hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s); + if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ',')) + hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s); } return (sign == '+')? n : -n; } +static void *hts_memrchr(const void *s, int c, size_t n) { + size_t i; + unsigned char *u = (unsigned char *)s; + for (i = n; i > 0; i--) { + if (u[i-1] == c) + return u+i-1; + } + + return NULL; +} + /* * A variant of hts_parse_reg which is reference-id aware. It uses * the iterator name2id callbacks to validate the region tokenisation works. @@ -2631,32 +2649,66 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) * This is necessary due to GRCh38 HLA additions which have reference names * like "HLA-DRB1*12:17". * - * getid is optional and may be passed in as NULL. If given it is used to - * validate the reference name exists and is unambiguously parseable. If not - * given the best guess will be made but no has guarantees in validity. + * All parameters are mandatory. + * + * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" + * are reference names, we may quote using curly braces. + * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. + * + * Flags are used to control how parsing works, and can be one of the below. + * + * HTS_PARSE_LIST: + * If present, the region is assmed to be a comma separated list and + * position parsing will not contain commas (this implicitly + * clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal). + * On success the return pointer will be the start of the next region, ie + * the character after the comma. (If *ret != '\0' then the caller can + * assume another region is present in the list.) + * + * If not set then positions may contain commas. In this case the return + * value should point to the end of the string, or NULL on failure. * - * To work around these issues quoting is also permitted via {ref}:start-end. - * In this case, the return value will point to '}' and not the end of the - * reference (but this is a useful indication that it started with '{'). + * HTS_PARSE_ONE_COORD: + * If present, X:100 is treated as the single base pair region X:100-100. + * In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-. + * (This is the standard bcftools region convention.) * - * On success the end of the reference is returned (colon or end of string) - * beg/end will be set, plus tid if getid has been supplied. + * When not set X:100 is considered to be X:100- where is + * the end of chromosome X (set to INT_MAX here). X:100- and X:-100 are + * invalid. + * (This is the standard samtools region convention.) + * + * Note the supplied string expects 1 based inclusive coordinates, but the + * returned coordinates start from 0 and are half open, so pos0 is valid + * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}" + * + * On success a pointer to the byte after the end of the entire region + * specifier is returned (plus any trailing comma), and tid, + * beg & end will be set. * On failure NULL is returned. */ -const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end, - hts_name2id_f getid, void *hdr) +const char *hts_parse_region(const char *s, int *tid, int *beg, int *end, + hts_name2id_f getid, void *hdr, int flags) { - // FIXME: do we need to permit tid=-1 for reference "*" to indicate unmapped - // reads, and strictly have NULL as failure test? - int tid_, s_len = strlen(s); // int is sufficient given beg/end types - if (!tid) tid = &tid_; // simplifies code below + if (!s || !tid || !beg || !end || !getid) + return NULL; - const char *colon = NULL; + int s_len = strlen(s); // int is sufficient given beg/end types + kstring_t ks = { 0, 0, NULL }; + + const char *colon = NULL, *comma = NULL; int quoted = 0; + if (flags & HTS_PARSE_LIST) + flags &= ~HTS_PARSE_THOUSANDS_SEP; + else + flags |= HTS_PARSE_THOUSANDS_SEP; + + const char *s_end = s + s_len; + // Braced quoting of references is permitted to resolve ambiguities. if (*s == '{') { - const char *close = strrchr(s, '}'); + const char *close = memchr(s, '}', s_len); if (!close) { hts_log_error("Mismatching braces in \"%s\"", s); return NULL; @@ -2666,36 +2718,56 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end, if (close[1] == ':') colon = close+1; quoted = 1; // number of trailing characters to trim + + // Truncate to this item only, if appropriate. + if (flags & HTS_PARSE_LIST) { + comma = strchr(close, ','); + if (comma) { + s_len = comma-s; + s_end = comma+1; + } + } } else { - colon = strrchr(s, ':'); + // Truncate to this item only, if appropriate. + if (flags & HTS_PARSE_LIST) { + comma = strchr(s, ','); + if (comma) { + s_len = comma-s; + s_end = comma+1; + } + } + + colon = hts_memrchr(s, ':', s_len); } + // No colon is simplest case; just check and return. if (colon == NULL) { *beg = 0; *end = INT_MAX; - if (getid) { - kstring_t ks = { 0, 0, NULL }; - kputsn(s, s_len-quoted, &ks); // convert to nul terminated string - if (!ks.s) { - *tid = -1; - return NULL; - } - - *tid = getid(hdr, ks.s); - free(ks.s); - } else { - *tid = 0; + kputsn(s, s_len-quoted, &ks); // convert to nul terminated string + if (!ks.s) { + *tid = -1; + return NULL; } - return *tid >= 0 ? s + s_len : NULL; + + *tid = getid(hdr, ks.s); + free(ks.s); + + return *tid >= 0 ? s_end : NULL; } // Has a colon, but check whole name first. - if (!quoted && getid) { + if (!quoted) { *beg = 0; *end = INT_MAX; - if ((*tid = getid(hdr, s)) >= 0) { + kputsn(s, s_len, &ks); // convert to nul terminated string + if (!ks.s) { + *tid = -1; + return NULL; + } + if ((*tid = getid(hdr, ks.s)) >= 0) { // Entire name matches, but also check this isn't // ambiguous. eg we have ref chr1 and ref chr1:100-200 // both present. - kstring_t ks = { 0, 0, NULL }; + ks.l = 0; kputsn(s, colon-s, &ks); // convert to nul terminated string if (!ks.s) { *tid = -1; @@ -2711,39 +2783,81 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end, } free(ks.s); - return s + s_len; + return s_end; } } - char *hyphen; - *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1; - if (*beg < 0) *beg = 0; - - if (*hyphen == '\0') *end = INT_MAX; - else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP); - else return NULL; + // Quoted, or unquoted and whole string isn't a name. + // Check the pre-colon part is valid. + ks.l = 0; + kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string + if (!ks.s) { + *tid = -1; + return NULL; + } + *tid = getid(hdr, ks.s); + free(ks.s); + if (*tid < 0) + return NULL; - if (*beg >= *end) return NULL; - if (getid) { - kstring_t ks = { 0, 0, NULL }; - kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string - if (!ks.s) { - *tid = -1; + // Finally parse the post-colon coordinates + char *hyphen; + *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1; + if (*beg < 0) { + if (isdigit(*hyphen) || *hyphen == '\0' || *hyphen == ',') { + // interpret chr:-100 as chr:1-100 + *end = *beg==-1 ? INT_MAX : -(*beg+1); + *beg = 0; + return s_end; + } else if (*hyphen == '-') { + *beg = 0; + } else { + hts_log_error("Unexpected string \"%s\" after region", hyphen); return NULL; } - *tid = getid(hdr, ks.s); - free(ks.s); - if (*tid < 0) + } + + if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) { + *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : INT_MAX; + } else if (*hyphen == '-') { + *end = hts_parse_decimal(hyphen+1, &hyphen, flags); + if (*hyphen != '\0' && *hyphen != ',') { + hts_log_error("Unexpected string \"%s\" after region", hyphen); return NULL; + } } else { - *tid = 0; + hts_log_error("Unexpected string \"%s\" after region", hyphen); + return NULL; } - return colon; + + if (*end == 0) + *end = INT_MAX; // interpret chr:100- as chr:100- + + if (*beg >= *end) return NULL; + + return s_end; } +// Next release we should mark this as deprecated? +// Use hts_parse_region above instead. const char *hts_parse_reg(const char *s, int *beg, int *end) { - return hts_parse_reg2(s, NULL, beg, end, NULL, NULL); + char *hyphen; + const char *colon = strrchr(s, ':'); + if (colon == NULL) { + *beg = 0; *end = INT_MAX; + return s + strlen(s); + } + + *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1; + if (*beg < 0) *beg = 0; + + if (*hyphen == '\0') *end = INT_MAX; + else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP); + else return NULL; + + if (*beg >= *end) return NULL; + return colon; } hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) @@ -2755,7 +2869,7 @@ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f g else if (strcmp(reg, "*") == 0) return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec); - if (!hts_parse_reg2(reg, &tid, &beg, &end, getid, hdr)) + if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP)) return NULL; return itr_query(idx, tid, beg, end, readrec); diff --git a/htslib/hts.h b/htslib/hts.h index 07953f365..2f6c613be 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -769,6 +769,8 @@ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx); // Region parsing #define HTS_PARSE_THOUSANDS_SEP 1 ///< Ignore ',' separators within numbers +#define HTS_PARSE_ONE_COORD 2 ///< chr:pos means chr:pos-pos and not chr:pos-end +#define HTS_PARSE_LIST 4 ///< Expect a comma separated list of regions. (Disables HTS_PARSE_THOUSANDS_SEP) /// Parse a numeric string /** The number may be expressed in scientific notation, and optionally may @@ -794,6 +796,7 @@ long long hts_parse_decimal(const char *str, char **strend, int flags); typedef int (*hts_name2id_f)(void*, const char*); typedef const char *(*hts_id2name_f)(void*, int); + const char *hts_parse_reg(const char *str, int *beg, int *end); /// Parse a "CHR:START-END"-style region string @@ -803,11 +806,13 @@ const char *hts_parse_reg(const char *str, int *beg, int *end); @param end Set on return to the 1-based end of the region @param getid Function pointer. Called if not NULL to set tid. @param hdr Caller data passed to getid. - @return Pointer to the colon or '\0' after the reference sequence name, - or NULL if @a str could not be parsed. + @param flags Bitwise HTS_PARSE_* flags listed above. + @return Pointer to the byte after the end of the entire region + specifier (including any trailing comma) on success, + or NULL if @a str could not be parsed. */ -const char *hts_parse_reg2(const char *str, int *tid, int *beg, int *end, - hts_name2id_f getid, void *hdr); +const char *hts_parse_region(const char *str, int *tid, int *beg, int *end, + hts_name2id_f getid, void *hdr, int flags); /////////////////////////////////////////////////////////// diff --git a/htslib/sam.h b/htslib/sam.h index 1e54f31df..8065991f3 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -281,7 +281,7 @@ typedef struct { int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) HTS_RESULT_USED; void bam_hdr_destroy(bam_hdr_t *h); int bam_name2id(bam_hdr_t *h, const char *ref); - const char *sam_parse_reg(bam_hdr_t *h, const char *s, int *tid, int *beg, int *end); + const char *sam_parse_region(bam_hdr_t *h, const char *s, int *tid, int *beg, int *end, int flags); bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0); bam1_t *bam_init1(void); diff --git a/sam.c b/sam.c index c81aaf9da..45d63c997 100644 --- a/sam.c +++ b/sam.c @@ -298,9 +298,8 @@ int bam_name2id(bam_hdr_t *h, const char *ref) return k == kh_end(d)? -1 : kh_val(d, k); } -const char *sam_parse_reg(bam_hdr_t *h, const char *s, int *tid, int *beg, int *end) { - // FIXME: do we need to also use cram_name2id? - return hts_parse_reg2(s, tid, beg, end, (hts_name2id_f)bam_name2id, h); +const char *sam_parse_region(bam_hdr_t *h, const char *s, int *tid, int *beg, int *end, int flags) { + return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags); } /************************* diff --git a/test/colons.bam b/test/colons.bam new file mode 100644 index 0000000000000000000000000000000000000000..53b60313074e247143592d15c7282375894ee314 GIT binary patch literal 268 zcmb2|=3rp}f&Xj_PR>jW#SFzoUsC6sIB?*BfXoTWH|bO71g4yL`8Yi_BO&2Hs|uUY z0o8TcEurT)`sWy$Z7vB=xzW&<#F{jL=P-8~8=Id7y9%$^16Lu2o6R~IElM1w$0g1@ zY|6Ua*v-ID+Zc6+g^__l9?i~bhRq2M3}I~kif#&LS&#f*$e=P0Wa|co69OScMK6Ew zvrh}Ct#mlk=AJKPcGk>6aY9>)!m}fbZ=SkvfMIqW`vL*s5PN;bzq{OaoJ~;6X%ncku+I; literal 0 HcmV?d00001 diff --git a/test/colons.bam.bai b/test/colons.bam.bai new file mode 100644 index 0000000000000000000000000000000000000000..71dbd1a8959fbaca13aa8fe0624624d4c87ef80d GIT binary patch literal 424 zcmZwC%ME}a5CqU!4;ElYc*A + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/* + Test region description parser. + Usage: test-parse-reg [-c] file.bam region + test-parse-reg [-c] -m file.bam region,region... + test-parse-reg -t + + -c is chr:pos is a single base coordinate, ie chr:pos-pos, + otherwise it is chr:pos- + -m is multi-region list. + -t runs built-in tests + + ./test/test-parse-reg -c -m test/colons.bam "{chr1:100-200},{chr1}:100-200,{chr1:100-200}:100,{chr1,chr3},chr1:" +*/ + +#include +#include +#include +#include +#include + +#include +#include + +void reg_expected(bam_hdr_t *hdr, const char *reg, int flags, + char *reg_exp, int tid_exp, int beg_exp, int end_exp) { + const char *reg_out; + int tid_out = -1, beg_out = -1, end_out = -1; + + reg_out = sam_parse_region(hdr, reg, &tid_out, &beg_out, &end_out, flags); + + if ((reg_out != NULL) != (reg_exp != NULL) || + (reg_out && reg_exp && strcmp(reg_out, reg_exp) != 0) || + (reg_exp && tid_out != tid_exp) || + (reg_exp && beg_out != beg_exp) || + (reg_exp && end_out != end_exp)) { + fprintf(stderr, "Parsing \"%s\" expected return \"%s\", %d:%d-%d, " + "but got \"%s\", %d:%d-%d\n", + reg, + reg_exp?reg_exp:"(null)", tid_exp, beg_exp, end_exp, + reg_out?reg_out:"(null)", tid_out, beg_out, end_out); + exit(1); +// } else { +// fprintf(stderr, "%s parsed as expected result: \"%s\" %d:%d-%d\n", +// reg, +// reg_out?reg_out:"(null)", tid_out, beg_out, end_out); + } +} + +int reg_test(char *fn) { + samFile *fp; + bam_hdr_t *hdr; + + if (!(fp = sam_open(fn, "r"))) + return 1; + + if (!(hdr = sam_hdr_read(fp))) + return 1; + + // 0 chr1 + // 1 chr1:100 + // 2 chr1:100-200 + // 3 chr2:100-200 + // 4 chr3 + // 5 chr1,chr3 + + // Check range extensions. + reg_expected(hdr, "chr1", 0, "", 0, 0, INT_MAX); + reg_expected(hdr, "chr1:50", 0, "", 0, 49, INT_MAX); + reg_expected(hdr, "chr1:50", HTS_PARSE_ONE_COORD, "", 0, 49, 50); + reg_expected(hdr, "chr1:50-100", 0, "", 0, 49, 100); + reg_expected(hdr, "chr1:50-", 0, "", 0, 49, INT_MAX); + reg_expected(hdr, "chr1:-50", 0, "", 0, 0, 50); + + // Check quoting + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:100-200", 0, NULL, 0, 0, 0); // ambiguous + reg_expected(hdr, "{chr1}:100-200", 0, "", 0, 99, 200); + reg_expected(hdr, "{chr1:100-200}", 0, "", 2, 0, INT_MAX); + reg_expected(hdr, "{chr1:100-200}:100-200", 0, "", 2, 99, 200); + reg_expected(hdr, "{chr2:100-200}:100-200", 0, "", 3, 99, 200); + reg_expected(hdr, "chr2:100-200:100-200", 0, "", 3, 99, 200); + reg_expected(hdr, "chr2:100-200", 0, "", 3, 0, INT_MAX); + + // Check numerics + reg_expected(hdr, "chr3", 0, "", 4, 0, INT_MAX); + reg_expected(hdr, "chr3:", 0, "", 4, 0, INT_MAX); + reg_expected(hdr, "chr3:1000-1500", 0, "", 4, 999, 1500); + reg_expected(hdr, "chr3:1,000-1,500", 0, "", 4, 999, 1500); + reg_expected(hdr, "chr3:1k-1.5K", 0, "", 4, 999, 1500); + reg_expected(hdr, "chr3:1e3-1.5e3", 0, "", 4, 999, 1500); + reg_expected(hdr, "chr3:1e3-15e2", 0, "", 4, 999, 1500); + + // Check list mode + reg_expected(hdr, "chr1,chr3", HTS_PARSE_LIST, "chr3", 0, 0, INT_MAX); + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:100-200,chr3", HTS_PARSE_LIST, NULL, 0, 0, 0); // ambiguous + reg_expected(hdr, "{chr1,chr3}", HTS_PARSE_LIST, "", 5, 0, INT_MAX); + reg_expected(hdr, "{chr1,chr3},chr1", HTS_PARSE_LIST, "chr1", 5, 0, INT_MAX); + // incorrect usage; first reg is valid (but not what user expects). + reg_expected(hdr, "chr3:1,000-1,500", HTS_PARSE_LIST | HTS_PARSE_ONE_COORD, "000-1,500", 4, 0, 1); + + // More expected failures + reg_expected(hdr, "chr2", 0, NULL, 0, 0, 0); + reg_expected(hdr, "chr1,", 0, NULL, 0, 0, 0); + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "{chr1", 0, NULL, 0, 0, 0); + reg_expected(hdr, "chr1:10-10", 0, "", 0, 9, 10); // OK + reg_expected(hdr, "chr1:10-9", 0, NULL, 0, 0, 0); // Issue#353 + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:x", 0, NULL, 0, 0, 0); + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:1-y", 0, NULL, 0, 0, 0); + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:1,chr3", 0, NULL, 0, 0, 0); + + bam_hdr_destroy(hdr); + sam_close(fp); + + exit(0); +} + +int main(int argc, char **argv) { + bam_hdr_t *hdr; + samFile *fp; + int flags = 0; + + while (argc > 1) { + if (strcmp(argv[1], "-m") == 0) { + flags |= HTS_PARSE_LIST; + argc--; argv++; + continue; + } + + if (strcmp(argv[1], "-c") == 0) { + flags |= HTS_PARSE_ONE_COORD; + argc--; argv++; + continue; + } + + // Automatic mode for test harness + if (strcmp(argv[1], "-t") == 0) + reg_test(argv[2]); + + break; + } + + // Interactive mode for debugging + if (argc != 3) { + fprintf(stderr, "Usage: test-parse-reg [-m] [-c] region[,region]...\n"); + exit(1); + } + + if (!(fp = sam_open(argv[1], "r"))) { + perror(argv[1]); + exit(1); + } + + if (!(hdr = sam_hdr_read(fp))) { + fprintf(stderr, "Couldn't read header\n"); + exit(1); + } + + const char *reg = argv[2]; + while (*reg) { + int tid, beg, end; + reg = sam_parse_region(hdr, reg, &tid, &beg, &end, flags); + if (!reg) { + fprintf(stderr, "Failed to parse region\n"); + exit(1); + } + printf("%-20s %12d %12d\n", + tid == -1 ? "*" : hdr->target_name[tid], + beg, end); + } + + bam_hdr_destroy(hdr); + sam_close(fp); + + return 0; +} From 878cec33946cc6584ce204bea59bedf02ba23260 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 7 Jun 2018 15:03:30 +0100 Subject: [PATCH 5/9] Changes faidx to use the standard region parser. Previously it had its own custom parser which nearly, but not quite, matched the old hts_parse_reg code in functionality. Note this changes the fasta coordinate type from long to int, which is a backwards step. However it is expected this will subsequently change to hts_pos_t in another PR. --- faidx.c | 88 +++++++++++++------------------------------------- htslib/faidx.h | 11 +++++++ 2 files changed, 34 insertions(+), 65 deletions(-) diff --git a/faidx.c b/faidx.c index 547d1c3b1..eb668471e 100644 --- a/faidx.c +++ b/faidx.c @@ -43,6 +43,7 @@ DEALINGS IN THE SOFTWARE. */ #include "hts_internal.h" typedef struct { + int id; // faidx_t->name[id] is for this struct. uint32_t line_len, line_blen; uint64_t len; uint64_t seq_offset; @@ -62,6 +63,13 @@ struct __faidx_t { #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif +static int fai_name2id(void *v, const char *ref) +{ + faidx_t *fai = (faidx_t *)v; + khint_t k = kh_get(s, fai->hash, ref); + return k == kh_end(fai->hash) ? -1 : kh_val(fai->hash, k).id; +} + static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, uint32_t line_len, uint32_t line_blen, uint64_t seq_offset, uint64_t qual_offset) { if (!name) { @@ -89,6 +97,7 @@ static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, } idx->name = tmp; } + v->id = idx->n; idx->name[idx->n++] = name_key; v->len = len; v->line_len = line_len; @@ -721,85 +730,30 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, return s; } - static int fai_get_val(const faidx_t *fai, const char *str, int *len, faidx1_t *val, long *fbeg, long *fend) { - char *s, *ep; - size_t i, l, k, name_end; khiter_t iter; khash_t(s) *h; - long beg, end; + int id, beg, end; - beg = end = -1; - h = fai->hash; - name_end = l = strlen(str); - s = (char*)malloc(l+1); - if (!s) { - *len = -1; + if (!fai_parse_region(fai, str, &id, &beg, &end, 0)) { + hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str); + *len = -2; return 1; } - // remove space - for (i = k = 0; i < l; ++i) - if (!isspace_c(str[i])) s[k++] = str[i]; - s[k] = 0; - name_end = l = k; - // determine the sequence name - for (i = l; i > 0; --i) if (s[i - 1] == ':') break; // look for colon from the end - if (i > 0) name_end = i - 1; - if (name_end < l) { // check if this is really the end - int n_hyphen = 0; - for (i = name_end + 1; i < l; ++i) { - if (s[i] == '-') ++n_hyphen; - else if (!isdigit_c(s[i]) && s[i] != ',') break; - } - if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name - s[name_end] = 0; - iter = kh_get(s, h, s); - if (iter == kh_end(h)) { // cannot find the sequence name - iter = kh_get(s, h, str); // try str as the name - if (iter != kh_end(h)) { - s[name_end] = ':'; - name_end = l; - } - } - } else iter = kh_get(s, h, str); - if(iter == kh_end(h)) { - hts_log_warning("Reference %s not found in file, returning empty sequence", str); - free(s); + h = fai->hash; + iter = kh_get(s, h, faidx_iseq(fai, id)); + if (!iter) { + // should have already been caught above + abort(); *len = -2; return 1; } *val = kh_value(h, iter); - // parse the interval - if (name_end < l) { - int save_errno = errno; - errno = 0; - for (i = k = name_end + 1; i < l; ++i) - if (s[i] != ',') s[k++] = s[i]; - s[k] = 0; - if (s[name_end + 1] == '-') { - beg = 0; - i = name_end + 2; - } else { - beg = strtol(s + name_end + 1, &ep, 10); - for (i = ep - s; i < k;) if (s[i++] == '-') break; - } - end = i < k? strtol(s + i, &ep, 10) : val->len; - if (beg > 0) --beg; - // Check for out of range numbers. Only going to be a problem on - // 32-bit platforms with >2Gb sequence length. - if (errno == ERANGE && (uint64_t) val->len > LONG_MAX) { - hts_log_error("Positions in range %s are too large for this platform", s); - free(s); - *len = -3; - return 1; - } - errno = save_errno; - } else beg = 0, end = val->len; + if (beg >= val->len) beg = val->len; if (end >= val->len) end = val->len; if (beg > end) beg = end; - free(s); *fbeg = beg; *fend = end; @@ -924,3 +878,7 @@ int faidx_has_seq(const faidx_t *fai, const char *seq) return 1; } +const char *fai_parse_region(const faidx_t *fai, const char *s, int *tid, int *beg, int *end, int flags) +{ + return hts_parse_region(s, tid, beg, end, (hts_name2id_f)fai_name2id, (void *)fai, flags); +} diff --git a/htslib/faidx.h b/htslib/faidx.h index e83143a9c..883b19e7c 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -225,6 +225,17 @@ const char *faidx_iseq(const faidx_t *fai, int i); /// Return sequence length, -1 if not present int faidx_seq_len(const faidx_t *fai, const char *seq); +/// Parses a region string. +/** @param fai Pointer to the faidx_t struct + @param s Region string + @param tid Returns which i-th sequence is described in the region. + @param beg Returns the start of the region (0 based) + @param end Returns the one past last of the region (0 based) + @param flags Parsing method, see HTS_PARSE_* in hts.h. + @return pointer to end of parsed s if successs, NULL if not. +*/ +const char *fai_parse_region(const faidx_t *fai, const char *s, int *tid, int *beg, int *end, int flags); + #ifdef __cplusplus } #endif From 0ce670254ef271eb99b7d6cb1a06149722673a96 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 29 May 2019 11:37:04 +0100 Subject: [PATCH 6/9] Changed *_parse_region functions to use 64-bit coords. --- faidx.c | 31 +++++++++++++++++++++---------- hts.c | 19 +++++++++++-------- htslib/faidx.h | 3 ++- htslib/hts.h | 2 +- htslib/sam.h | 2 +- sam.c | 2 +- test/test-parse-reg.c | 40 ++++++++++++++++++++-------------------- 7 files changed, 57 insertions(+), 42 deletions(-) diff --git a/faidx.c b/faidx.c index eb668471e..839e05939 100644 --- a/faidx.c +++ b/faidx.c @@ -693,14 +693,22 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) { static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, - uint64_t offset, long beg, long end, int *len) { + uint64_t offset, int64_t beg, int64_t end, int *len) { char *s; size_t l; int c = 0; - int ret = bgzf_useek(fai->bgzf, - offset - + beg / val->line_blen * val->line_len - + beg % val->line_blen, SEEK_SET); + int ret; + + if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) { + hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end); + *len = -1; + return NULL; + } + + ret = bgzf_useek(fai->bgzf, + offset + + beg / val->line_blen * val->line_len + + beg % val->line_blen, SEEK_SET); if (ret < 0) { *len = -1; @@ -730,10 +738,12 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, return s; } -static int fai_get_val(const faidx_t *fai, const char *str, int *len, faidx1_t *val, long *fbeg, long *fend) { +static int fai_get_val(const faidx_t *fai, const char *str, + int *len, faidx1_t *val, int64_t *fbeg, int64_t *fend) { khiter_t iter; khash_t(s) *h; - int id, beg, end; + int id; + int64_t beg, end; if (!fai_parse_region(fai, str, &id, &beg, &end, 0)) { hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str); @@ -765,7 +775,7 @@ static int fai_get_val(const faidx_t *fai, const char *str, int *len, faidx1_t * char *fai_fetch(const faidx_t *fai, const char *str, int *len) { faidx1_t val; - long beg, end; + int64_t beg, end; if (fai_get_val(fai, str, len, &val, &beg, &end)) { return NULL; @@ -778,7 +788,7 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len) char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { faidx1_t val; - long beg, end; + int64_t beg, end; if (fai_get_val(fai, str, len, &val, &beg, &end)) { return NULL; @@ -878,7 +888,8 @@ int faidx_has_seq(const faidx_t *fai, const char *seq) return 1; } -const char *fai_parse_region(const faidx_t *fai, const char *s, int *tid, int *beg, int *end, int flags) +const char *fai_parse_region(const faidx_t *fai, const char *s, + int *tid, int64_t *beg, int64_t *end, int flags) { return hts_parse_region(s, tid, beg, end, (hts_name2id_f)fai_name2id, (void *)fai, flags); } diff --git a/hts.c b/hts.c index 0aa6ec1e7..05aaa26be 100644 --- a/hts.c +++ b/hts.c @@ -2687,13 +2687,13 @@ static void *hts_memrchr(const void *s, int c, size_t n) { * beg & end will be set. * On failure NULL is returned. */ -const char *hts_parse_region(const char *s, int *tid, int *beg, int *end, +const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end, hts_name2id_f getid, void *hdr, int flags) { if (!s || !tid || !beg || !end || !getid) return NULL; - int s_len = strlen(s); // int is sufficient given beg/end types + size_t s_len = strlen(s); kstring_t ks = { 0, 0, NULL }; const char *colon = NULL, *comma = NULL; @@ -2742,7 +2742,7 @@ const char *hts_parse_region(const char *s, int *tid, int *beg, int *end, // No colon is simplest case; just check and return. if (colon == NULL) { - *beg = 0; *end = INT_MAX; + *beg = 0; *end = INT64_MAX; kputsn(s, s_len-quoted, &ks); // convert to nul terminated string if (!ks.s) { *tid = -1; @@ -2757,7 +2757,7 @@ const char *hts_parse_region(const char *s, int *tid, int *beg, int *end, // Has a colon, but check whole name first. if (!quoted) { - *beg = 0; *end = INT_MAX; + *beg = 0; *end = INT64_MAX; kputsn(s, s_len, &ks); // convert to nul terminated string if (!ks.s) { *tid = -1; @@ -2806,7 +2806,7 @@ const char *hts_parse_region(const char *s, int *tid, int *beg, int *end, if (*beg < 0) { if (isdigit(*hyphen) || *hyphen == '\0' || *hyphen == ',') { // interpret chr:-100 as chr:1-100 - *end = *beg==-1 ? INT_MAX : -(*beg+1); + *end = *beg==-1 ? INT64_MAX : -(*beg+1); *beg = 0; return s_end; } else if (*hyphen == '-') { @@ -2818,7 +2818,7 @@ const char *hts_parse_region(const char *s, int *tid, int *beg, int *end, } if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) { - *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : INT_MAX; + *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : INT64_MAX; } else if (*hyphen == '-') { *end = hts_parse_decimal(hyphen+1, &hyphen, flags); if (*hyphen != '\0' && *hyphen != ',') { @@ -2831,7 +2831,7 @@ const char *hts_parse_region(const char *s, int *tid, int *beg, int *end, } if (*end == 0) - *end = INT_MAX; // interpret chr:100- as chr:100- + *end = INT64_MAX; // interpret chr:100- as chr:100- if (*beg >= *end) return NULL; @@ -2862,7 +2862,8 @@ const char *hts_parse_reg(const char *s, int *beg, int *end) hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) { - int tid, beg, end; + int tid; + int64_t beg, end; if (strcmp(reg, ".") == 0) return itr_query(idx, HTS_IDX_START, 0, 0, readrec); @@ -2872,6 +2873,8 @@ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f g if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP)) return NULL; + if (end > INT_MAX) end = INT_MAX; // Remove when fully 64-bit compliant + return itr_query(idx, tid, beg, end, readrec); } diff --git a/htslib/faidx.h b/htslib/faidx.h index 883b19e7c..032f3377e 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -29,6 +29,7 @@ #ifndef HTSLIB_FAIDX_H #define HTSLIB_FAIDX_H +#include #include "hts_defs.h" #ifdef __cplusplus @@ -234,7 +235,7 @@ int faidx_seq_len(const faidx_t *fai, const char *seq); @param flags Parsing method, see HTS_PARSE_* in hts.h. @return pointer to end of parsed s if successs, NULL if not. */ -const char *fai_parse_region(const faidx_t *fai, const char *s, int *tid, int *beg, int *end, int flags); +const char *fai_parse_region(const faidx_t *fai, const char *s, int *tid, int64_t *beg, int64_t *end, int flags); #ifdef __cplusplus } diff --git a/htslib/hts.h b/htslib/hts.h index 2f6c613be..d353431c4 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -811,7 +811,7 @@ const char *hts_parse_reg(const char *str, int *beg, int *end); specifier (including any trailing comma) on success, or NULL if @a str could not be parsed. */ -const char *hts_parse_region(const char *str, int *tid, int *beg, int *end, +const char *hts_parse_region(const char *str, int *tid, int64_t *beg, int64_t *end, hts_name2id_f getid, void *hdr, int flags); diff --git a/htslib/sam.h b/htslib/sam.h index 8065991f3..002f894dc 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -281,7 +281,7 @@ typedef struct { int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) HTS_RESULT_USED; void bam_hdr_destroy(bam_hdr_t *h); int bam_name2id(bam_hdr_t *h, const char *ref); - const char *sam_parse_region(bam_hdr_t *h, const char *s, int *tid, int *beg, int *end, int flags); + const char *sam_parse_region(bam_hdr_t *h, const char *s, int *tid, int64_t *beg, int64_t *end, int flags); bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0); bam1_t *bam_init1(void); diff --git a/sam.c b/sam.c index 45d63c997..a1e9172e8 100644 --- a/sam.c +++ b/sam.c @@ -298,7 +298,7 @@ int bam_name2id(bam_hdr_t *h, const char *ref) return k == kh_end(d)? -1 : kh_val(d, k); } -const char *sam_parse_region(bam_hdr_t *h, const char *s, int *tid, int *beg, int *end, int flags) { +const char *sam_parse_region(bam_hdr_t *h, const char *s, int *tid, int64_t *beg, int64_t *end, int flags) { return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags); } diff --git a/test/test-parse-reg.c b/test/test-parse-reg.c index 44b9f3679..4c64cb2fe 100644 --- a/test/test-parse-reg.c +++ b/test/test-parse-reg.c @@ -41,14 +41,17 @@ #include #include #include +#include +#include #include #include void reg_expected(bam_hdr_t *hdr, const char *reg, int flags, - char *reg_exp, int tid_exp, int beg_exp, int end_exp) { + char *reg_exp, int tid_exp, int64_t beg_exp, int64_t end_exp) { const char *reg_out; - int tid_out = -1, beg_out = -1, end_out = -1; + int tid_out = -1; + int64_t beg_out = -1, end_out = -1; reg_out = sam_parse_region(hdr, reg, &tid_out, &beg_out, &end_out, flags); @@ -57,16 +60,12 @@ void reg_expected(bam_hdr_t *hdr, const char *reg, int flags, (reg_exp && tid_out != tid_exp) || (reg_exp && beg_out != beg_exp) || (reg_exp && end_out != end_exp)) { - fprintf(stderr, "Parsing \"%s\" expected return \"%s\", %d:%d-%d, " - "but got \"%s\", %d:%d-%d\n", + fprintf(stderr, "Parsing \"%s\" expected return \"%s\", %d:%"PRId64"-%"PRId64", " + "but got \"%s\", %d:%"PRId64"-%"PRId64"\n", reg, reg_exp?reg_exp:"(null)", tid_exp, beg_exp, end_exp, reg_out?reg_out:"(null)", tid_out, beg_out, end_out); exit(1); -// } else { -// fprintf(stderr, "%s parsed as expected result: \"%s\" %d:%d-%d\n", -// reg, -// reg_out?reg_out:"(null)", tid_out, beg_out, end_out); } } @@ -88,26 +87,26 @@ int reg_test(char *fn) { // 5 chr1,chr3 // Check range extensions. - reg_expected(hdr, "chr1", 0, "", 0, 0, INT_MAX); - reg_expected(hdr, "chr1:50", 0, "", 0, 49, INT_MAX); + reg_expected(hdr, "chr1", 0, "", 0, 0, INT64_MAX); + reg_expected(hdr, "chr1:50", 0, "", 0, 49, INT64_MAX); reg_expected(hdr, "chr1:50", HTS_PARSE_ONE_COORD, "", 0, 49, 50); reg_expected(hdr, "chr1:50-100", 0, "", 0, 49, 100); - reg_expected(hdr, "chr1:50-", 0, "", 0, 49, INT_MAX); + reg_expected(hdr, "chr1:50-", 0, "", 0, 49, INT64_MAX); reg_expected(hdr, "chr1:-50", 0, "", 0, 0, 50); // Check quoting fprintf(stderr, "Expected error: "); reg_expected(hdr, "chr1:100-200", 0, NULL, 0, 0, 0); // ambiguous reg_expected(hdr, "{chr1}:100-200", 0, "", 0, 99, 200); - reg_expected(hdr, "{chr1:100-200}", 0, "", 2, 0, INT_MAX); + reg_expected(hdr, "{chr1:100-200}", 0, "", 2, 0, INT64_MAX); reg_expected(hdr, "{chr1:100-200}:100-200", 0, "", 2, 99, 200); reg_expected(hdr, "{chr2:100-200}:100-200", 0, "", 3, 99, 200); reg_expected(hdr, "chr2:100-200:100-200", 0, "", 3, 99, 200); - reg_expected(hdr, "chr2:100-200", 0, "", 3, 0, INT_MAX); + reg_expected(hdr, "chr2:100-200", 0, "", 3, 0, INT64_MAX); // Check numerics - reg_expected(hdr, "chr3", 0, "", 4, 0, INT_MAX); - reg_expected(hdr, "chr3:", 0, "", 4, 0, INT_MAX); + reg_expected(hdr, "chr3", 0, "", 4, 0, INT64_MAX); + reg_expected(hdr, "chr3:", 0, "", 4, 0, INT64_MAX); reg_expected(hdr, "chr3:1000-1500", 0, "", 4, 999, 1500); reg_expected(hdr, "chr3:1,000-1,500", 0, "", 4, 999, 1500); reg_expected(hdr, "chr3:1k-1.5K", 0, "", 4, 999, 1500); @@ -115,11 +114,11 @@ int reg_test(char *fn) { reg_expected(hdr, "chr3:1e3-15e2", 0, "", 4, 999, 1500); // Check list mode - reg_expected(hdr, "chr1,chr3", HTS_PARSE_LIST, "chr3", 0, 0, INT_MAX); + reg_expected(hdr, "chr1,chr3", HTS_PARSE_LIST, "chr3", 0, 0, INT64_MAX); fprintf(stderr, "Expected error: "); reg_expected(hdr, "chr1:100-200,chr3", HTS_PARSE_LIST, NULL, 0, 0, 0); // ambiguous - reg_expected(hdr, "{chr1,chr3}", HTS_PARSE_LIST, "", 5, 0, INT_MAX); - reg_expected(hdr, "{chr1,chr3},chr1", HTS_PARSE_LIST, "chr1", 5, 0, INT_MAX); + reg_expected(hdr, "{chr1,chr3}", HTS_PARSE_LIST, "", 5, 0, INT64_MAX); + reg_expected(hdr, "{chr1,chr3},chr1", HTS_PARSE_LIST, "chr1", 5, 0, INT64_MAX); // incorrect usage; first reg is valid (but not what user expects). reg_expected(hdr, "chr3:1,000-1,500", HTS_PARSE_LIST | HTS_PARSE_ONE_COORD, "000-1,500", 4, 0, 1); @@ -186,13 +185,14 @@ int main(int argc, char **argv) { const char *reg = argv[2]; while (*reg) { - int tid, beg, end; + int tid; + int64_t beg, end; reg = sam_parse_region(hdr, reg, &tid, &beg, &end, flags); if (!reg) { fprintf(stderr, "Failed to parse region\n"); exit(1); } - printf("%-20s %12d %12d\n", + printf("%-20s %12"PRId64" %12"PRId64"\n", tid == -1 ? "*" : hdr->target_name[tid], beg, end); } From 0eac47dd7589125edc0c2f718aead2910e253530 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 29 May 2019 14:30:51 +0100 Subject: [PATCH 7/9] Avoid warning from gcc8 about memcpy overflow --- htslib/kstring.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/htslib/kstring.h b/htslib/kstring.h index ad8e280c0..c76a50878 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -156,6 +156,9 @@ static inline char *ks_release(kstring_t *s) static inline int kputsn(const char *p, size_t l, kstring_t *s) { + if (l > SIZE_MAX-2) + return EOF; + size_t new_sz = s->l + l + 2; if (new_sz < s->l || ks_resize(s, new_sz) < 0) return EOF; From 73eee10fa443e207c881434760b0ef6269ed3bdb Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 31 May 2019 10:18:09 +0100 Subject: [PATCH 8/9] Expand hts_parse_region documentation in the header file. Mainly copied from the useful (but not so visible) comment before the function definition in hts.c. --- htslib/faidx.h | 14 ++++++++++++-- htslib/hts.h | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/htslib/faidx.h b/htslib/faidx.h index 032f3377e..d18bff792 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -164,6 +164,10 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format); The returned sequence is allocated by `malloc()` family and should be destroyed by end users by calling `free()` on it. + +To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" +are reference names, quote using curly braces. +Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. */ char *fai_fetch(const faidx_t *fai, const char *reg, int *len); @@ -173,8 +177,10 @@ char *fai_fetch(const faidx_t *fai, const char *reg, int *len); @param len Length of the region; -2 if seq not present, -1 general error @return Pointer to the quality string; null on failure -The returned quality string is allocated by `malloc()` family and should be destroyed -by end users by calling `free()` on it. +The returned quality string is allocated by `malloc()` family and should be +destroyed by end users by calling `free()` on it. + +Region names can be quoted with curly braces, as for fai_fetch(). */ char *fai_fetchqual(const faidx_t *fai, const char *reg, int *len); @@ -234,6 +240,10 @@ int faidx_seq_len(const faidx_t *fai, const char *seq); @param end Returns the one past last of the region (0 based) @param flags Parsing method, see HTS_PARSE_* in hts.h. @return pointer to end of parsed s if successs, NULL if not. + + To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" + are reference names, quote using curly braces. + Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. */ const char *fai_parse_region(const faidx_t *fai, const char *s, int *tid, int64_t *beg, int64_t *end, int flags); diff --git a/htslib/hts.h b/htslib/hts.h index d353431c4..fb400e5a4 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -810,6 +810,47 @@ const char *hts_parse_reg(const char *str, int *beg, int *end); @return Pointer to the byte after the end of the entire region specifier (including any trailing comma) on success, or NULL if @a str could not be parsed. + + A variant of hts_parse_reg which is reference-id aware. It uses + the iterator name2id callbacks to validate the region tokenisation works. + + This is necessary due to GRCh38 HLA additions which have reference names + like "HLA-DRB1*12:17". + + To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" + are reference names, quote using curly braces. + Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. + + Flags are used to control how parsing works, and can be one of the below. + + HTS_PARSE_THOUSANDS_SEP: + Ignore commas in numbers. For example with this flag 1,234,567 + is interpreted as 1234567. + + HTS_PARSE_LIST: + If present, the region is assmed to be a comma separated list and + position parsing will not contain commas (this implicitly + clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal). + On success the return pointer will be the start of the next region, ie + the character after the comma. (If *ret != '\0' then the caller can + assume another region is present in the list.) + + If not set then positions may contain commas. In this case the return + value should point to the end of the string, or NULL on failure. + + HTS_PARSE_ONE_COORD: + If present, X:100 is treated as the single base pair region X:100-100. + In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-. + (This is the standard bcftools region convention.) + + When not set X:100 is considered to be X:100- where is + the end of chromosome X (set to INT_MAX here). X:100- and X:-100 are + invalid. + (This is the standard samtools region convention.) + + Note the supplied string expects 1 based inclusive coordinates, but the + returned coordinates start from 0 and are half open, so pos0 is valid + for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}" */ const char *hts_parse_region(const char *str, int *tid, int64_t *beg, int64_t *end, hts_name2id_f getid, void *hdr, int flags); From d26300ef473847db0df8f0c1f13658d97180af42 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 31 May 2019 14:05:01 +0100 Subject: [PATCH 9/9] Update hts_reglist_create() to use hts_parse_region() Hash table can now use tid (cast to khash32_t, which is unsigned) as key instead of the region name, avoiding some string copying. --- region.c | 82 ++++++++++++++++++++++++-------------------------------- 1 file changed, 35 insertions(+), 47 deletions(-) diff --git a/region.c b/region.c index 51593cd32..c2043334d 100644 --- a/region.c +++ b/region.c @@ -31,9 +31,10 @@ typedef struct reglist { uint32_t n, m; uint64_t *a; + int tid; } reglist_t; -KHASH_MAP_INIT_STR(reg, reglist_t) +KHASH_MAP_INIT_INT(reg, reglist_t) typedef kh_reg_t reghash_t; static int compare_uint64 (const void * a, const void * b) @@ -52,7 +53,7 @@ static void reg_print(reghash_t *h) { reglist_t *p; khint_t k; uint32_t i; - const char *reg; + khint32_t key; uint32_t beg, end; if (!h) { @@ -61,8 +62,8 @@ static void reg_print(reghash_t *h) { } for (k = kh_begin(h); k < kh_end(h); k++) { if (kh_exist(h,k)) { - reg = kh_key(h,k); - fprintf(stderr, "Region: '%s'\n", reg); + key = kh_key(h,k); + fprintf(stderr, "Region: key %u tid %d\n", key, p->tid); if ((p = &kh_val(h,k)) != NULL && p->n > 0) { for (i=0; in; i++) { beg = (uint32_t)(p->a[i]>>32); @@ -70,7 +71,7 @@ static void reg_print(reghash_t *h) { fprintf(stderr, "\tinterval[%d]: %d-%d\n", i, beg, end); } } else { - fprintf(stderr, "Region '%s' has no intervals!\n", reg); + fprintf(stderr, "Region key %u has no intervals!\n", key); } } } @@ -109,7 +110,7 @@ static int reg_compact(reghash_t *h) { return count; } -static int reg_insert(reghash_t *h, char *reg, unsigned int beg, unsigned int end) { +static int reg_insert(reghash_t *h, int tid, unsigned int beg, unsigned int end) { khint_t k; reglist_t *p; @@ -118,17 +119,15 @@ static int reg_insert(reghash_t *h, char *reg, unsigned int beg, unsigned int en return -1; // Put reg in the hash table if not already there - k = kh_get(reg, h, reg); //looks strange, but only the second reg is the actual region name. + k = kh_get(reg, h, tid); if (k == kh_end(h)) { // absent from the hash table int ret; - char *s = strdup(reg); - if (NULL == s) return -1; - k = kh_put(reg, h, s, &ret); + k = kh_put(reg, h, tid, &ret); if (-1 == ret) { - free(s); return -1; } memset(&kh_val(h, k), 0, sizeof(reglist_t)); + kh_val(h, k).tid = tid; } p = &kh_val(h, k); @@ -156,7 +155,6 @@ static void reg_destroy(reghash_t *h) { for (k = 0; k < kh_end(h); ++k) { if (kh_exist(h, k)) { free(kh_val(h, k).a); - free((char*)kh_key(h, k)); } } kh_destroy(reg, h); @@ -175,11 +173,10 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr hts_reglist_t *h_reglist = NULL; khint_t k; - int i, l_count = 0; + int i, l_count = 0, tid; uint32_t j; - char reg[1024]; const char *q; - int beg, end; + int64_t beg, end; /* First, transform the char array into a hash table */ h = kh_init(reg); @@ -189,25 +186,26 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr } for (i=0; i sizeof(reg) - 1) { - hts_log_error("Region name '%s' is too long (bigger than %d)", argv[i], (int) sizeof(reg) - 1); - continue; - } - memcpy(reg, argv[i], q - argv[i]); - reg[q - argv[i]] = 0; + if (!strcmp(argv[i], ".")) { + q = argv[i] + 1; + tid = HTS_IDX_START; beg = 0; end = INT64_MAX; + } else if (!strcmp(argv[i], "*")) { + q = argv[i] + 1; + tid = HTS_IDX_NOCOOR; beg = 0; end = INT64_MAX; } else { - // not parsable as a region, but possibly a sequence named "foo:a" - if (strlen(argv[i]) > sizeof(reg) - 1) { - hts_log_error("Region name '%s' is too long (bigger than %d)", argv[i], (int) sizeof(reg) - 1); - continue; - } - strcpy(reg, argv[i]); - beg = 0; end = INT_MAX; + q = hts_parse_region(argv[i], &tid, &beg, &end, getid, hdr, + HTS_PARSE_THOUSANDS_SEP); } + if (!q) { + // not parsable as a region + hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", argv[i]); + continue; + } + + if (beg > INT_MAX) beg = INT_MAX; // Remove when fully 64-bit compliant + if (end > INT_MAX) end = INT_MAX; // Remove when fully 64-bit compliant - if (reg_insert(h, reg, beg, end) != 0) { + if (reg_insert(h, tid, beg, end) != 0) { hts_log_error("Error when inserting region='%s' in the bed hash table at address=%p", argv[i], (void *) h); goto fail; } @@ -215,31 +213,21 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr *r_count = reg_compact(h); if (!*r_count) - return NULL; + goto fail; /* Transform the hash table into a list */ h_reglist = (hts_reglist_t *)calloc(*r_count, sizeof(hts_reglist_t)); if (!h_reglist) - return NULL; + goto fail; for (k = kh_begin(h); k < kh_end(h) && l_count < *r_count; k++) { if (!kh_exist(h,k) || !(p = &kh_val(h,k))) continue; - char *reg_name = (char *)kh_key(h,k); - if (!strcmp(reg_name, ".")) { - h_reglist[l_count].tid = HTS_IDX_START; - } else if (!strcmp(reg_name, "*")) { - h_reglist[l_count].tid = HTS_IDX_NOCOOR; - } else { - h_reglist[l_count].tid = getid(hdr, reg_name); - if (h_reglist[l_count].tid < 0) - hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reg_name); - } - - h_reglist[l_count].intervals = (hts_pair32_t *)calloc(p->n, sizeof(hts_pair32_t)); + h_reglist[l_count].tid = p->tid; + h_reglist[l_count].intervals = calloc(p->n, sizeof(h_reglist[l_count].intervals[0])); if(!(h_reglist[l_count].intervals)) { - hts_log_error("Could not allocate memory for intervals for region='%s'", kh_key(h,k)); + hts_log_error("Could not allocate memory for intervals"); goto fail; } h_reglist[l_count].count = p->n; @@ -247,7 +235,7 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr for (j = 0; j < p->n; j++) { h_reglist[l_count].intervals[j].beg = (uint32_t)(p->a[j]>>32); - h_reglist[l_count].intervals[j].end = (uint32_t)(p->a[j]); + h_reglist[l_count].intervals[j].end = (uint32_t)(p->a[j] & 0xffffffffU); if (h_reglist[l_count].intervals[j].end > h_reglist[l_count].max_end) h_reglist[l_count].max_end = h_reglist[l_count].intervals[j].end;