diff --git a/Makefile b/Makefile index 19c0579ad..b3e72e848 100644 --- a/Makefile +++ b/Makefile @@ -82,7 +82,8 @@ BUILT_TEST_PROGRAMS = \ test/test-vcf-sweep \ test/test-bcf-sr \ test/fuzz/hts_open_fuzzer.o \ - test/test-bcf-translate + test/test-bcf-translate \ + test/test-parse-reg BUILT_THRASH_PROGRAMS = \ test/thrash_threads1 \ @@ -377,6 +378,7 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) test/fieldarith test/fieldarith.sam test/hfile test/test_bgzf test/bgziptest.txt + test/test-parse-reg -t test/colons.bam cd test/tabix && ./test-tabix.sh tabix.tst cd test/mpileup && ./test-pileup.sh mpileup.tst REF_PATH=: test/sam test/ce.fa test/faidx.fa test/fastqs.fq @@ -413,6 +415,9 @@ test/test_realn: test/test_realn.o libhts.a test/test-regidx: test/test-regidx.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test-regidx.o libhts.a $(LIBS) -lpthread +test/test-parse-reg: test/test-parse-reg.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test-parse-reg.o libhts.a $(LIBS) -lpthread + test/test_view: test/test_view.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LIBS) -lpthread @@ -439,6 +444,7 @@ test/pileup.o: test/pileup.c config.h $(htslib_sam_h) $(htslib_kstring_h) test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hfile_internal_h) test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h) +test/test-parse-reg.o: test/test-parse-reg.c $(htslib_hts_h) $(htslib_sam_h) test/test-realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) test/test-regidx.o: test/test-regidx.c config.h $(htslib_regidx_h) $(hts_internal_h) test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) diff --git a/faidx.c b/faidx.c index 547d1c3b1..839e05939 100644 --- a/faidx.c +++ b/faidx.c @@ -43,6 +43,7 @@ DEALINGS IN THE SOFTWARE. */ #include "hts_internal.h" typedef struct { + int id; // faidx_t->name[id] is for this struct. uint32_t line_len, line_blen; uint64_t len; uint64_t seq_offset; @@ -62,6 +63,13 @@ struct __faidx_t { #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif +static int fai_name2id(void *v, const char *ref) +{ + faidx_t *fai = (faidx_t *)v; + khint_t k = kh_get(s, fai->hash, ref); + return k == kh_end(fai->hash) ? -1 : kh_val(fai->hash, k).id; +} + static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, uint32_t line_len, uint32_t line_blen, uint64_t seq_offset, uint64_t qual_offset) { if (!name) { @@ -89,6 +97,7 @@ static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, } idx->name = tmp; } + v->id = idx->n; idx->name[idx->n++] = name_key; v->len = len; v->line_len = line_len; @@ -684,14 +693,22 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) { static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, - uint64_t offset, long beg, long end, int *len) { + uint64_t offset, int64_t beg, int64_t end, int *len) { char *s; size_t l; int c = 0; - int ret = bgzf_useek(fai->bgzf, - offset - + beg / val->line_blen * val->line_len - + beg % val->line_blen, SEEK_SET); + int ret; + + if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) { + hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end); + *len = -1; + return NULL; + } + + ret = bgzf_useek(fai->bgzf, + offset + + beg / val->line_blen * val->line_len + + beg % val->line_blen, SEEK_SET); if (ret < 0) { *len = -1; @@ -721,85 +738,32 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, return s; } - -static int fai_get_val(const faidx_t *fai, const char *str, int *len, faidx1_t *val, long *fbeg, long *fend) { - char *s, *ep; - size_t i, l, k, name_end; +static int fai_get_val(const faidx_t *fai, const char *str, + int *len, faidx1_t *val, int64_t *fbeg, int64_t *fend) { khiter_t iter; khash_t(s) *h; - long beg, end; + int id; + int64_t beg, end; - beg = end = -1; - h = fai->hash; - name_end = l = strlen(str); - s = (char*)malloc(l+1); - if (!s) { - *len = -1; + if (!fai_parse_region(fai, str, &id, &beg, &end, 0)) { + hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str); + *len = -2; return 1; } - // remove space - for (i = k = 0; i < l; ++i) - if (!isspace_c(str[i])) s[k++] = str[i]; - s[k] = 0; - name_end = l = k; - // determine the sequence name - for (i = l; i > 0; --i) if (s[i - 1] == ':') break; // look for colon from the end - if (i > 0) name_end = i - 1; - if (name_end < l) { // check if this is really the end - int n_hyphen = 0; - for (i = name_end + 1; i < l; ++i) { - if (s[i] == '-') ++n_hyphen; - else if (!isdigit_c(s[i]) && s[i] != ',') break; - } - if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name - s[name_end] = 0; - iter = kh_get(s, h, s); - if (iter == kh_end(h)) { // cannot find the sequence name - iter = kh_get(s, h, str); // try str as the name - if (iter != kh_end(h)) { - s[name_end] = ':'; - name_end = l; - } - } - } else iter = kh_get(s, h, str); - if(iter == kh_end(h)) { - hts_log_warning("Reference %s not found in file, returning empty sequence", str); - free(s); + h = fai->hash; + iter = kh_get(s, h, faidx_iseq(fai, id)); + if (!iter) { + // should have already been caught above + abort(); *len = -2; return 1; } *val = kh_value(h, iter); - // parse the interval - if (name_end < l) { - int save_errno = errno; - errno = 0; - for (i = k = name_end + 1; i < l; ++i) - if (s[i] != ',') s[k++] = s[i]; - s[k] = 0; - if (s[name_end + 1] == '-') { - beg = 0; - i = name_end + 2; - } else { - beg = strtol(s + name_end + 1, &ep, 10); - for (i = ep - s; i < k;) if (s[i++] == '-') break; - } - end = i < k? strtol(s + i, &ep, 10) : val->len; - if (beg > 0) --beg; - // Check for out of range numbers. Only going to be a problem on - // 32-bit platforms with >2Gb sequence length. - if (errno == ERANGE && (uint64_t) val->len > LONG_MAX) { - hts_log_error("Positions in range %s are too large for this platform", s); - free(s); - *len = -3; - return 1; - } - errno = save_errno; - } else beg = 0, end = val->len; + if (beg >= val->len) beg = val->len; if (end >= val->len) end = val->len; if (beg > end) beg = end; - free(s); *fbeg = beg; *fend = end; @@ -811,7 +775,7 @@ static int fai_get_val(const faidx_t *fai, const char *str, int *len, faidx1_t * char *fai_fetch(const faidx_t *fai, const char *str, int *len) { faidx1_t val; - long beg, end; + int64_t beg, end; if (fai_get_val(fai, str, len, &val, &beg, &end)) { return NULL; @@ -824,7 +788,7 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len) char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { faidx1_t val; - long beg, end; + int64_t beg, end; if (fai_get_val(fai, str, len, &val, &beg, &end)) { return NULL; @@ -924,3 +888,8 @@ int faidx_has_seq(const faidx_t *fai, const char *seq) return 1; } +const char *fai_parse_region(const faidx_t *fai, const char *s, + int *tid, int64_t *beg, int64_t *end, int flags) +{ + return hts_parse_region(s, tid, beg, end, (hts_name2id_f)fai_name2id, (void *)fai, flags); +} diff --git a/hts.c b/hts.c index bcacb4451..05aaa26be 100644 --- a/hts.c +++ b/hts.c @@ -2607,6 +2607,12 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) if (esign == '-') e = -e; } + switch (*s) { + case 'k': case 'K': e += 3; s++; break; + case 'm': case 'M': e += 6; s++; break; + case 'g': case 'G': e += 9; s++; break; + } + e -= decimals; while (e > 0) n *= 10, e--; while (e < 0) lost += n % 10, n /= 10, e++; @@ -2618,12 +2624,222 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) if (strend) { *strend = (char *)s; } else if (*s) { - hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s); + if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ',')) + hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s); } return (sign == '+')? n : -n; } +static void *hts_memrchr(const void *s, int c, size_t n) { + size_t i; + unsigned char *u = (unsigned char *)s; + for (i = n; i > 0; i--) { + if (u[i-1] == c) + return u+i-1; + } + + return NULL; +} + +/* + * A variant of hts_parse_reg which is reference-id aware. It uses + * the iterator name2id callbacks to validate the region tokenisation works. + * + * This is necessary due to GRCh38 HLA additions which have reference names + * like "HLA-DRB1*12:17". + * + * All parameters are mandatory. + * + * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" + * are reference names, we may quote using curly braces. + * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. + * + * Flags are used to control how parsing works, and can be one of the below. + * + * HTS_PARSE_LIST: + * If present, the region is assmed to be a comma separated list and + * position parsing will not contain commas (this implicitly + * clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal). + * On success the return pointer will be the start of the next region, ie + * the character after the comma. (If *ret != '\0' then the caller can + * assume another region is present in the list.) + * + * If not set then positions may contain commas. In this case the return + * value should point to the end of the string, or NULL on failure. + * + * HTS_PARSE_ONE_COORD: + * If present, X:100 is treated as the single base pair region X:100-100. + * In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-. + * (This is the standard bcftools region convention.) + * + * When not set X:100 is considered to be X:100- where is + * the end of chromosome X (set to INT_MAX here). X:100- and X:-100 are + * invalid. + * (This is the standard samtools region convention.) + * + * Note the supplied string expects 1 based inclusive coordinates, but the + * returned coordinates start from 0 and are half open, so pos0 is valid + * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}" + * + * On success a pointer to the byte after the end of the entire region + * specifier is returned (plus any trailing comma), and tid, + * beg & end will be set. + * On failure NULL is returned. + */ +const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end, + hts_name2id_f getid, void *hdr, int flags) +{ + if (!s || !tid || !beg || !end || !getid) + return NULL; + + size_t s_len = strlen(s); + kstring_t ks = { 0, 0, NULL }; + + const char *colon = NULL, *comma = NULL; + int quoted = 0; + + if (flags & HTS_PARSE_LIST) + flags &= ~HTS_PARSE_THOUSANDS_SEP; + else + flags |= HTS_PARSE_THOUSANDS_SEP; + + const char *s_end = s + s_len; + + // Braced quoting of references is permitted to resolve ambiguities. + if (*s == '{') { + const char *close = memchr(s, '}', s_len); + if (!close) { + hts_log_error("Mismatching braces in \"%s\"", s); + return NULL; + } + s++; + s_len--; + if (close[1] == ':') + colon = close+1; + quoted = 1; // number of trailing characters to trim + + // Truncate to this item only, if appropriate. + if (flags & HTS_PARSE_LIST) { + comma = strchr(close, ','); + if (comma) { + s_len = comma-s; + s_end = comma+1; + } + } + } else { + // Truncate to this item only, if appropriate. + if (flags & HTS_PARSE_LIST) { + comma = strchr(s, ','); + if (comma) { + s_len = comma-s; + s_end = comma+1; + } + } + + colon = hts_memrchr(s, ':', s_len); + } + + // No colon is simplest case; just check and return. + if (colon == NULL) { + *beg = 0; *end = INT64_MAX; + kputsn(s, s_len-quoted, &ks); // convert to nul terminated string + if (!ks.s) { + *tid = -1; + return NULL; + } + + *tid = getid(hdr, ks.s); + free(ks.s); + + return *tid >= 0 ? s_end : NULL; + } + + // Has a colon, but check whole name first. + if (!quoted) { + *beg = 0; *end = INT64_MAX; + kputsn(s, s_len, &ks); // convert to nul terminated string + if (!ks.s) { + *tid = -1; + return NULL; + } + if ((*tid = getid(hdr, ks.s)) >= 0) { + // Entire name matches, but also check this isn't + // ambiguous. eg we have ref chr1 and ref chr1:100-200 + // both present. + ks.l = 0; + kputsn(s, colon-s, &ks); // convert to nul terminated string + if (!ks.s) { + *tid = -1; + return NULL; + } + if (getid(hdr, ks.s) >= 0) { + free(ks.s); + *tid = -1; + hts_log_error("Range is ambiguous. " + "Use {%s} or {%.*s}%s instead", + s, (int)(colon-s), s, colon); + return NULL; + } + free(ks.s); + + return s_end; + } + } + + // Quoted, or unquoted and whole string isn't a name. + // Check the pre-colon part is valid. + ks.l = 0; + kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string + if (!ks.s) { + *tid = -1; + return NULL; + } + *tid = getid(hdr, ks.s); + free(ks.s); + if (*tid < 0) + return NULL; + + // Finally parse the post-colon coordinates + char *hyphen; + *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1; + if (*beg < 0) { + if (isdigit(*hyphen) || *hyphen == '\0' || *hyphen == ',') { + // interpret chr:-100 as chr:1-100 + *end = *beg==-1 ? INT64_MAX : -(*beg+1); + *beg = 0; + return s_end; + } else if (*hyphen == '-') { + *beg = 0; + } else { + hts_log_error("Unexpected string \"%s\" after region", hyphen); + return NULL; + } + } + + if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) { + *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : INT64_MAX; + } else if (*hyphen == '-') { + *end = hts_parse_decimal(hyphen+1, &hyphen, flags); + if (*hyphen != '\0' && *hyphen != ',') { + hts_log_error("Unexpected string \"%s\" after region", hyphen); + return NULL; + } + } else { + hts_log_error("Unexpected string \"%s\" after region", hyphen); + return NULL; + } + + if (*end == 0) + *end = INT64_MAX; // interpret chr:100- as chr:100- + + if (*beg >= *end) return NULL; + + return s_end; +} + +// Next release we should mark this as deprecated? +// Use hts_parse_region above instead. const char *hts_parse_reg(const char *s, int *beg, int *end) { char *hyphen; @@ -2646,33 +2862,19 @@ const char *hts_parse_reg(const char *s, int *beg, int *end) hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) { - int tid, beg, end; - const char *q; + int tid; + int64_t beg, end; if (strcmp(reg, ".") == 0) return itr_query(idx, HTS_IDX_START, 0, 0, readrec); else if (strcmp(reg, "*") == 0) return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec); - q = hts_parse_reg(reg, &beg, &end); - if (q) { - char tmp_a[1024], *tmp = tmp_a; - if (q - reg + 1 > 1024) - if (!(tmp = malloc(q - reg + 1))) - return NULL; - strncpy(tmp, reg, q - reg); - tmp[q - reg] = 0; - tid = getid(hdr, tmp); - if (tmp != tmp_a) - free(tmp); - } - else { - // not parsable as a region, but possibly a sequence named "foo:a" - tid = getid(hdr, reg); - beg = 0; end = INT_MAX; - } + if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP)) + return NULL; + + if (end > INT_MAX) end = INT_MAX; // Remove when fully 64-bit compliant - if (tid < 0) return NULL; return itr_query(idx, tid, beg, end, readrec); } diff --git a/htslib/faidx.h b/htslib/faidx.h index e83143a9c..d18bff792 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -29,6 +29,7 @@ #ifndef HTSLIB_FAIDX_H #define HTSLIB_FAIDX_H +#include #include "hts_defs.h" #ifdef __cplusplus @@ -163,6 +164,10 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format); The returned sequence is allocated by `malloc()` family and should be destroyed by end users by calling `free()` on it. + +To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" +are reference names, quote using curly braces. +Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. */ char *fai_fetch(const faidx_t *fai, const char *reg, int *len); @@ -172,8 +177,10 @@ char *fai_fetch(const faidx_t *fai, const char *reg, int *len); @param len Length of the region; -2 if seq not present, -1 general error @return Pointer to the quality string; null on failure -The returned quality string is allocated by `malloc()` family and should be destroyed -by end users by calling `free()` on it. +The returned quality string is allocated by `malloc()` family and should be +destroyed by end users by calling `free()` on it. + +Region names can be quoted with curly braces, as for fai_fetch(). */ char *fai_fetchqual(const faidx_t *fai, const char *reg, int *len); @@ -225,6 +232,21 @@ const char *faidx_iseq(const faidx_t *fai, int i); /// Return sequence length, -1 if not present int faidx_seq_len(const faidx_t *fai, const char *seq); +/// Parses a region string. +/** @param fai Pointer to the faidx_t struct + @param s Region string + @param tid Returns which i-th sequence is described in the region. + @param beg Returns the start of the region (0 based) + @param end Returns the one past last of the region (0 based) + @param flags Parsing method, see HTS_PARSE_* in hts.h. + @return pointer to end of parsed s if successs, NULL if not. + + To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" + are reference names, quote using curly braces. + Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. +*/ +const char *fai_parse_region(const faidx_t *fai, const char *s, int *tid, int64_t *beg, int64_t *end, int flags); + #ifdef __cplusplus } #endif diff --git a/htslib/hts.h b/htslib/hts.h index c3c1c3e9f..fb400e5a4 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -769,6 +769,8 @@ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx); // Region parsing #define HTS_PARSE_THOUSANDS_SEP 1 ///< Ignore ',' separators within numbers +#define HTS_PARSE_ONE_COORD 2 ///< chr:pos means chr:pos-pos and not chr:pos-end +#define HTS_PARSE_LIST 4 ///< Expect a comma separated list of regions. (Disables HTS_PARSE_THOUSANDS_SEP) /// Parse a numeric string /** The number may be expressed in scientific notation, and optionally may @@ -791,8 +793,69 @@ long long hts_parse_decimal(const char *str, char **strend, int flags); @return Pointer to the colon or '\0' after the reference sequence name, or NULL if @a str could not be parsed. */ + +typedef int (*hts_name2id_f)(void*, const char*); +typedef const char *(*hts_id2name_f)(void*, int); + const char *hts_parse_reg(const char *str, int *beg, int *end); +/// Parse a "CHR:START-END"-style region string +/** @param str String to be parsed + @param tid Set on return (if not NULL) to be reference index (-1 if invalid) + @param beg Set on return to the 0-based start of the region + @param end Set on return to the 1-based end of the region + @param getid Function pointer. Called if not NULL to set tid. + @param hdr Caller data passed to getid. + @param flags Bitwise HTS_PARSE_* flags listed above. + @return Pointer to the byte after the end of the entire region + specifier (including any trailing comma) on success, + or NULL if @a str could not be parsed. + + A variant of hts_parse_reg which is reference-id aware. It uses + the iterator name2id callbacks to validate the region tokenisation works. + + This is necessary due to GRCh38 HLA additions which have reference names + like "HLA-DRB1*12:17". + + To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" + are reference names, quote using curly braces. + Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. + + Flags are used to control how parsing works, and can be one of the below. + + HTS_PARSE_THOUSANDS_SEP: + Ignore commas in numbers. For example with this flag 1,234,567 + is interpreted as 1234567. + + HTS_PARSE_LIST: + If present, the region is assmed to be a comma separated list and + position parsing will not contain commas (this implicitly + clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal). + On success the return pointer will be the start of the next region, ie + the character after the comma. (If *ret != '\0' then the caller can + assume another region is present in the list.) + + If not set then positions may contain commas. In this case the return + value should point to the end of the string, or NULL on failure. + + HTS_PARSE_ONE_COORD: + If present, X:100 is treated as the single base pair region X:100-100. + In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-. + (This is the standard bcftools region convention.) + + When not set X:100 is considered to be X:100- where is + the end of chromosome X (set to INT_MAX here). X:100- and X:-100 are + invalid. + (This is the standard samtools region convention.) + + Note the supplied string expects 1 based inclusive coordinates, but the + returned coordinates start from 0 and are half open, so pos0 is valid + for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}" +*/ +const char *hts_parse_region(const char *str, int *tid, int64_t *beg, int64_t *end, + hts_name2id_f getid, void *hdr, int flags); + + /////////////////////////////////////////////////////////// // Generic iterators // @@ -818,8 +881,6 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_re */ void hts_itr_destroy(hts_itr_t *iter); -typedef int (*hts_name2id_f)(void*, const char*); -typedef const char *(*hts_id2name_f)(void*, int); typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec); /// Create a single-region iterator from a text region specification diff --git a/htslib/kstring.h b/htslib/kstring.h index ad8e280c0..c76a50878 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -156,6 +156,9 @@ static inline char *ks_release(kstring_t *s) static inline int kputsn(const char *p, size_t l, kstring_t *s) { + if (l > SIZE_MAX-2) + return EOF; + size_t new_sz = s->l + l + 2; if (new_sz < s->l || ks_resize(s, new_sz) < 0) return EOF; diff --git a/htslib/sam.h b/htslib/sam.h index ea7baa909..002f894dc 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -281,6 +281,7 @@ typedef struct { int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) HTS_RESULT_USED; void bam_hdr_destroy(bam_hdr_t *h); int bam_name2id(bam_hdr_t *h, const char *ref); + const char *sam_parse_region(bam_hdr_t *h, const char *s, int *tid, int64_t *beg, int64_t *end, int flags); bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0); bam1_t *bam_init1(void); diff --git a/region.c b/region.c index 51593cd32..c2043334d 100644 --- a/region.c +++ b/region.c @@ -31,9 +31,10 @@ typedef struct reglist { uint32_t n, m; uint64_t *a; + int tid; } reglist_t; -KHASH_MAP_INIT_STR(reg, reglist_t) +KHASH_MAP_INIT_INT(reg, reglist_t) typedef kh_reg_t reghash_t; static int compare_uint64 (const void * a, const void * b) @@ -52,7 +53,7 @@ static void reg_print(reghash_t *h) { reglist_t *p; khint_t k; uint32_t i; - const char *reg; + khint32_t key; uint32_t beg, end; if (!h) { @@ -61,8 +62,8 @@ static void reg_print(reghash_t *h) { } for (k = kh_begin(h); k < kh_end(h); k++) { if (kh_exist(h,k)) { - reg = kh_key(h,k); - fprintf(stderr, "Region: '%s'\n", reg); + key = kh_key(h,k); + fprintf(stderr, "Region: key %u tid %d\n", key, p->tid); if ((p = &kh_val(h,k)) != NULL && p->n > 0) { for (i=0; in; i++) { beg = (uint32_t)(p->a[i]>>32); @@ -70,7 +71,7 @@ static void reg_print(reghash_t *h) { fprintf(stderr, "\tinterval[%d]: %d-%d\n", i, beg, end); } } else { - fprintf(stderr, "Region '%s' has no intervals!\n", reg); + fprintf(stderr, "Region key %u has no intervals!\n", key); } } } @@ -109,7 +110,7 @@ static int reg_compact(reghash_t *h) { return count; } -static int reg_insert(reghash_t *h, char *reg, unsigned int beg, unsigned int end) { +static int reg_insert(reghash_t *h, int tid, unsigned int beg, unsigned int end) { khint_t k; reglist_t *p; @@ -118,17 +119,15 @@ static int reg_insert(reghash_t *h, char *reg, unsigned int beg, unsigned int en return -1; // Put reg in the hash table if not already there - k = kh_get(reg, h, reg); //looks strange, but only the second reg is the actual region name. + k = kh_get(reg, h, tid); if (k == kh_end(h)) { // absent from the hash table int ret; - char *s = strdup(reg); - if (NULL == s) return -1; - k = kh_put(reg, h, s, &ret); + k = kh_put(reg, h, tid, &ret); if (-1 == ret) { - free(s); return -1; } memset(&kh_val(h, k), 0, sizeof(reglist_t)); + kh_val(h, k).tid = tid; } p = &kh_val(h, k); @@ -156,7 +155,6 @@ static void reg_destroy(reghash_t *h) { for (k = 0; k < kh_end(h); ++k) { if (kh_exist(h, k)) { free(kh_val(h, k).a); - free((char*)kh_key(h, k)); } } kh_destroy(reg, h); @@ -175,11 +173,10 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr hts_reglist_t *h_reglist = NULL; khint_t k; - int i, l_count = 0; + int i, l_count = 0, tid; uint32_t j; - char reg[1024]; const char *q; - int beg, end; + int64_t beg, end; /* First, transform the char array into a hash table */ h = kh_init(reg); @@ -189,25 +186,26 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr } for (i=0; i sizeof(reg) - 1) { - hts_log_error("Region name '%s' is too long (bigger than %d)", argv[i], (int) sizeof(reg) - 1); - continue; - } - memcpy(reg, argv[i], q - argv[i]); - reg[q - argv[i]] = 0; + if (!strcmp(argv[i], ".")) { + q = argv[i] + 1; + tid = HTS_IDX_START; beg = 0; end = INT64_MAX; + } else if (!strcmp(argv[i], "*")) { + q = argv[i] + 1; + tid = HTS_IDX_NOCOOR; beg = 0; end = INT64_MAX; } else { - // not parsable as a region, but possibly a sequence named "foo:a" - if (strlen(argv[i]) > sizeof(reg) - 1) { - hts_log_error("Region name '%s' is too long (bigger than %d)", argv[i], (int) sizeof(reg) - 1); - continue; - } - strcpy(reg, argv[i]); - beg = 0; end = INT_MAX; + q = hts_parse_region(argv[i], &tid, &beg, &end, getid, hdr, + HTS_PARSE_THOUSANDS_SEP); } + if (!q) { + // not parsable as a region + hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", argv[i]); + continue; + } + + if (beg > INT_MAX) beg = INT_MAX; // Remove when fully 64-bit compliant + if (end > INT_MAX) end = INT_MAX; // Remove when fully 64-bit compliant - if (reg_insert(h, reg, beg, end) != 0) { + if (reg_insert(h, tid, beg, end) != 0) { hts_log_error("Error when inserting region='%s' in the bed hash table at address=%p", argv[i], (void *) h); goto fail; } @@ -215,31 +213,21 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr *r_count = reg_compact(h); if (!*r_count) - return NULL; + goto fail; /* Transform the hash table into a list */ h_reglist = (hts_reglist_t *)calloc(*r_count, sizeof(hts_reglist_t)); if (!h_reglist) - return NULL; + goto fail; for (k = kh_begin(h); k < kh_end(h) && l_count < *r_count; k++) { if (!kh_exist(h,k) || !(p = &kh_val(h,k))) continue; - char *reg_name = (char *)kh_key(h,k); - if (!strcmp(reg_name, ".")) { - h_reglist[l_count].tid = HTS_IDX_START; - } else if (!strcmp(reg_name, "*")) { - h_reglist[l_count].tid = HTS_IDX_NOCOOR; - } else { - h_reglist[l_count].tid = getid(hdr, reg_name); - if (h_reglist[l_count].tid < 0) - hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reg_name); - } - - h_reglist[l_count].intervals = (hts_pair32_t *)calloc(p->n, sizeof(hts_pair32_t)); + h_reglist[l_count].tid = p->tid; + h_reglist[l_count].intervals = calloc(p->n, sizeof(h_reglist[l_count].intervals[0])); if(!(h_reglist[l_count].intervals)) { - hts_log_error("Could not allocate memory for intervals for region='%s'", kh_key(h,k)); + hts_log_error("Could not allocate memory for intervals"); goto fail; } h_reglist[l_count].count = p->n; @@ -247,7 +235,7 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr for (j = 0; j < p->n; j++) { h_reglist[l_count].intervals[j].beg = (uint32_t)(p->a[j]>>32); - h_reglist[l_count].intervals[j].end = (uint32_t)(p->a[j]); + h_reglist[l_count].intervals[j].end = (uint32_t)(p->a[j] & 0xffffffffU); if (h_reglist[l_count].intervals[j].end > h_reglist[l_count].max_end) h_reglist[l_count].max_end = h_reglist[l_count].intervals[j].end; diff --git a/sam.c b/sam.c index 5bc6facc1..a1e9172e8 100644 --- a/sam.c +++ b/sam.c @@ -298,6 +298,10 @@ int bam_name2id(bam_hdr_t *h, const char *ref) return k == kh_end(d)? -1 : kh_val(d, k); } +const char *sam_parse_region(bam_hdr_t *h, const char *s, int *tid, int64_t *beg, int64_t *end, int flags) { + return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags); +} + /************************* *** BAM alignment I/O *** *************************/ diff --git a/test/colons.bam b/test/colons.bam new file mode 100644 index 000000000..53b603130 Binary files /dev/null and b/test/colons.bam differ diff --git a/test/colons.bam.bai b/test/colons.bam.bai new file mode 100644 index 000000000..71dbd1a89 Binary files /dev/null and b/test/colons.bam.bai differ diff --git a/test/test-parse-reg.c b/test/test-parse-reg.c new file mode 100644 index 000000000..4c64cb2fe --- /dev/null +++ b/test/test-parse-reg.c @@ -0,0 +1,204 @@ +/* + Copyright (C) 2018 Genome Research Ltd. + + Author: James Bonfield + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/* + Test region description parser. + Usage: test-parse-reg [-c] file.bam region + test-parse-reg [-c] -m file.bam region,region... + test-parse-reg -t + + -c is chr:pos is a single base coordinate, ie chr:pos-pos, + otherwise it is chr:pos- + -m is multi-region list. + -t runs built-in tests + + ./test/test-parse-reg -c -m test/colons.bam "{chr1:100-200},{chr1}:100-200,{chr1:100-200}:100,{chr1,chr3},chr1:" +*/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +void reg_expected(bam_hdr_t *hdr, const char *reg, int flags, + char *reg_exp, int tid_exp, int64_t beg_exp, int64_t end_exp) { + const char *reg_out; + int tid_out = -1; + int64_t beg_out = -1, end_out = -1; + + reg_out = sam_parse_region(hdr, reg, &tid_out, &beg_out, &end_out, flags); + + if ((reg_out != NULL) != (reg_exp != NULL) || + (reg_out && reg_exp && strcmp(reg_out, reg_exp) != 0) || + (reg_exp && tid_out != tid_exp) || + (reg_exp && beg_out != beg_exp) || + (reg_exp && end_out != end_exp)) { + fprintf(stderr, "Parsing \"%s\" expected return \"%s\", %d:%"PRId64"-%"PRId64", " + "but got \"%s\", %d:%"PRId64"-%"PRId64"\n", + reg, + reg_exp?reg_exp:"(null)", tid_exp, beg_exp, end_exp, + reg_out?reg_out:"(null)", tid_out, beg_out, end_out); + exit(1); + } +} + +int reg_test(char *fn) { + samFile *fp; + bam_hdr_t *hdr; + + if (!(fp = sam_open(fn, "r"))) + return 1; + + if (!(hdr = sam_hdr_read(fp))) + return 1; + + // 0 chr1 + // 1 chr1:100 + // 2 chr1:100-200 + // 3 chr2:100-200 + // 4 chr3 + // 5 chr1,chr3 + + // Check range extensions. + reg_expected(hdr, "chr1", 0, "", 0, 0, INT64_MAX); + reg_expected(hdr, "chr1:50", 0, "", 0, 49, INT64_MAX); + reg_expected(hdr, "chr1:50", HTS_PARSE_ONE_COORD, "", 0, 49, 50); + reg_expected(hdr, "chr1:50-100", 0, "", 0, 49, 100); + reg_expected(hdr, "chr1:50-", 0, "", 0, 49, INT64_MAX); + reg_expected(hdr, "chr1:-50", 0, "", 0, 0, 50); + + // Check quoting + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:100-200", 0, NULL, 0, 0, 0); // ambiguous + reg_expected(hdr, "{chr1}:100-200", 0, "", 0, 99, 200); + reg_expected(hdr, "{chr1:100-200}", 0, "", 2, 0, INT64_MAX); + reg_expected(hdr, "{chr1:100-200}:100-200", 0, "", 2, 99, 200); + reg_expected(hdr, "{chr2:100-200}:100-200", 0, "", 3, 99, 200); + reg_expected(hdr, "chr2:100-200:100-200", 0, "", 3, 99, 200); + reg_expected(hdr, "chr2:100-200", 0, "", 3, 0, INT64_MAX); + + // Check numerics + reg_expected(hdr, "chr3", 0, "", 4, 0, INT64_MAX); + reg_expected(hdr, "chr3:", 0, "", 4, 0, INT64_MAX); + reg_expected(hdr, "chr3:1000-1500", 0, "", 4, 999, 1500); + reg_expected(hdr, "chr3:1,000-1,500", 0, "", 4, 999, 1500); + reg_expected(hdr, "chr3:1k-1.5K", 0, "", 4, 999, 1500); + reg_expected(hdr, "chr3:1e3-1.5e3", 0, "", 4, 999, 1500); + reg_expected(hdr, "chr3:1e3-15e2", 0, "", 4, 999, 1500); + + // Check list mode + reg_expected(hdr, "chr1,chr3", HTS_PARSE_LIST, "chr3", 0, 0, INT64_MAX); + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:100-200,chr3", HTS_PARSE_LIST, NULL, 0, 0, 0); // ambiguous + reg_expected(hdr, "{chr1,chr3}", HTS_PARSE_LIST, "", 5, 0, INT64_MAX); + reg_expected(hdr, "{chr1,chr3},chr1", HTS_PARSE_LIST, "chr1", 5, 0, INT64_MAX); + // incorrect usage; first reg is valid (but not what user expects). + reg_expected(hdr, "chr3:1,000-1,500", HTS_PARSE_LIST | HTS_PARSE_ONE_COORD, "000-1,500", 4, 0, 1); + + // More expected failures + reg_expected(hdr, "chr2", 0, NULL, 0, 0, 0); + reg_expected(hdr, "chr1,", 0, NULL, 0, 0, 0); + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "{chr1", 0, NULL, 0, 0, 0); + reg_expected(hdr, "chr1:10-10", 0, "", 0, 9, 10); // OK + reg_expected(hdr, "chr1:10-9", 0, NULL, 0, 0, 0); // Issue#353 + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:x", 0, NULL, 0, 0, 0); + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:1-y", 0, NULL, 0, 0, 0); + fprintf(stderr, "Expected error: "); + reg_expected(hdr, "chr1:1,chr3", 0, NULL, 0, 0, 0); + + bam_hdr_destroy(hdr); + sam_close(fp); + + exit(0); +} + +int main(int argc, char **argv) { + bam_hdr_t *hdr; + samFile *fp; + int flags = 0; + + while (argc > 1) { + if (strcmp(argv[1], "-m") == 0) { + flags |= HTS_PARSE_LIST; + argc--; argv++; + continue; + } + + if (strcmp(argv[1], "-c") == 0) { + flags |= HTS_PARSE_ONE_COORD; + argc--; argv++; + continue; + } + + // Automatic mode for test harness + if (strcmp(argv[1], "-t") == 0) + reg_test(argv[2]); + + break; + } + + // Interactive mode for debugging + if (argc != 3) { + fprintf(stderr, "Usage: test-parse-reg [-m] [-c] region[,region]...\n"); + exit(1); + } + + if (!(fp = sam_open(argv[1], "r"))) { + perror(argv[1]); + exit(1); + } + + if (!(hdr = sam_hdr_read(fp))) { + fprintf(stderr, "Couldn't read header\n"); + exit(1); + } + + const char *reg = argv[2]; + while (*reg) { + int tid; + int64_t beg, end; + reg = sam_parse_region(hdr, reg, &tid, &beg, &end, flags); + if (!reg) { + fprintf(stderr, "Failed to parse region\n"); + exit(1); + } + printf("%-20s %12"PRId64" %12"PRId64"\n", + tid == -1 ? "*" : hdr->target_name[tid], + beg, end); + } + + bam_hdr_destroy(hdr); + sam_close(fp); + + return 0; +}