Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Work around colons in reference names (GRCh38 HLA). #708

Merged
merged 9 commits into from
May 31, 2019
8 changes: 7 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ BUILT_TEST_PROGRAMS = \
test/test-vcf-sweep \
test/test-bcf-sr \
test/fuzz/hts_open_fuzzer.o \
test/test-bcf-translate
test/test-bcf-translate \
test/test-parse-reg

BUILT_THRASH_PROGRAMS = \
test/thrash_threads1 \
Expand Down Expand Up @@ -377,6 +378,7 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS)
test/fieldarith test/fieldarith.sam
test/hfile
test/test_bgzf test/bgziptest.txt
test/test-parse-reg -t test/colons.bam
cd test/tabix && ./test-tabix.sh tabix.tst
cd test/mpileup && ./test-pileup.sh mpileup.tst
REF_PATH=: test/sam test/ce.fa test/faidx.fa test/fastqs.fq
Expand Down Expand Up @@ -413,6 +415,9 @@ test/test_realn: test/test_realn.o libhts.a
test/test-regidx: test/test-regidx.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test-regidx.o libhts.a $(LIBS) -lpthread

test/test-parse-reg: test/test-parse-reg.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test-parse-reg.o libhts.a $(LIBS) -lpthread

test/test_view: test/test_view.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LIBS) -lpthread

Expand All @@ -439,6 +444,7 @@ test/pileup.o: test/pileup.c config.h $(htslib_sam_h) $(htslib_kstring_h)
test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h)
test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hfile_internal_h)
test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h)
test/test-parse-reg.o: test/test-parse-reg.c $(htslib_hts_h) $(htslib_sam_h)
test/test-realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h)
test/test-regidx.o: test/test-regidx.c config.h $(htslib_regidx_h) $(hts_internal_h)
test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h)
Expand Down
115 changes: 42 additions & 73 deletions faidx.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ DEALINGS IN THE SOFTWARE. */
#include "hts_internal.h"

typedef struct {
int id; // faidx_t->name[id] is for this struct.
uint32_t line_len, line_blen;
uint64_t len;
uint64_t seq_offset;
Expand All @@ -62,6 +63,13 @@ struct __faidx_t {
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

static int fai_name2id(void *v, const char *ref)
{
faidx_t *fai = (faidx_t *)v;
khint_t k = kh_get(s, fai->hash, ref);
return k == kh_end(fai->hash) ? -1 : kh_val(fai->hash, k).id;
}

static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, uint32_t line_len, uint32_t line_blen, uint64_t seq_offset, uint64_t qual_offset)
{
if (!name) {
Expand Down Expand Up @@ -89,6 +97,7 @@ static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len,
}
idx->name = tmp;
}
v->id = idx->n;
idx->name[idx->n++] = name_key;
v->len = len;
v->line_len = line_len;
Expand Down Expand Up @@ -684,14 +693,22 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) {


static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
uint64_t offset, long beg, long end, int *len) {
uint64_t offset, int64_t beg, int64_t end, int *len) {
char *s;
size_t l;
int c = 0;
int ret = bgzf_useek(fai->bgzf,
offset
+ beg / val->line_blen * val->line_len
+ beg % val->line_blen, SEEK_SET);
int ret;

if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) {
hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end);
*len = -1;
return NULL;
}

ret = bgzf_useek(fai->bgzf,
offset
+ beg / val->line_blen * val->line_len
+ beg % val->line_blen, SEEK_SET);

if (ret < 0) {
*len = -1;
Expand Down Expand Up @@ -721,85 +738,32 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
return s;
}


static int fai_get_val(const faidx_t *fai, const char *str, int *len, faidx1_t *val, long *fbeg, long *fend) {
char *s, *ep;
size_t i, l, k, name_end;
static int fai_get_val(const faidx_t *fai, const char *str,
int *len, faidx1_t *val, int64_t *fbeg, int64_t *fend) {
khiter_t iter;
khash_t(s) *h;
long beg, end;
int id;
int64_t beg, end;

beg = end = -1;
h = fai->hash;
name_end = l = strlen(str);
s = (char*)malloc(l+1);
if (!s) {
*len = -1;
if (!fai_parse_region(fai, str, &id, &beg, &end, 0)) {
hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str);
*len = -2;
return 1;
}

// remove space
for (i = k = 0; i < l; ++i)
if (!isspace_c(str[i])) s[k++] = str[i];
s[k] = 0;
name_end = l = k;
// determine the sequence name
for (i = l; i > 0; --i) if (s[i - 1] == ':') break; // look for colon from the end
if (i > 0) name_end = i - 1;
if (name_end < l) { // check if this is really the end
int n_hyphen = 0;
for (i = name_end + 1; i < l; ++i) {
if (s[i] == '-') ++n_hyphen;
else if (!isdigit_c(s[i]) && s[i] != ',') break;
}
if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
s[name_end] = 0;
iter = kh_get(s, h, s);
if (iter == kh_end(h)) { // cannot find the sequence name
iter = kh_get(s, h, str); // try str as the name
if (iter != kh_end(h)) {
s[name_end] = ':';
name_end = l;
}
}
} else iter = kh_get(s, h, str);
if(iter == kh_end(h)) {
hts_log_warning("Reference %s not found in file, returning empty sequence", str);
free(s);
h = fai->hash;
iter = kh_get(s, h, faidx_iseq(fai, id));
if (!iter) {
// should have already been caught above
abort();
*len = -2;
return 1;
}
*val = kh_value(h, iter);
// parse the interval
if (name_end < l) {
int save_errno = errno;
errno = 0;
for (i = k = name_end + 1; i < l; ++i)
if (s[i] != ',') s[k++] = s[i];
s[k] = 0;
if (s[name_end + 1] == '-') {
beg = 0;
i = name_end + 2;
} else {
beg = strtol(s + name_end + 1, &ep, 10);
for (i = ep - s; i < k;) if (s[i++] == '-') break;
}
end = i < k? strtol(s + i, &ep, 10) : val->len;
if (beg > 0) --beg;
// Check for out of range numbers. Only going to be a problem on
// 32-bit platforms with >2Gb sequence length.
if (errno == ERANGE && (uint64_t) val->len > LONG_MAX) {
hts_log_error("Positions in range %s are too large for this platform", s);
free(s);
*len = -3;
return 1;
}
errno = save_errno;
} else beg = 0, end = val->len;

if (beg >= val->len) beg = val->len;
if (end >= val->len) end = val->len;
if (beg > end) beg = end;
free(s);

*fbeg = beg;
*fend = end;
Expand All @@ -811,7 +775,7 @@ static int fai_get_val(const faidx_t *fai, const char *str, int *len, faidx1_t *
char *fai_fetch(const faidx_t *fai, const char *str, int *len)
{
faidx1_t val;
long beg, end;
int64_t beg, end;

if (fai_get_val(fai, str, len, &val, &beg, &end)) {
return NULL;
Expand All @@ -824,7 +788,7 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len)

char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) {
faidx1_t val;
long beg, end;
int64_t beg, end;

if (fai_get_val(fai, str, len, &val, &beg, &end)) {
return NULL;
Expand Down Expand Up @@ -924,3 +888,8 @@ int faidx_has_seq(const faidx_t *fai, const char *seq)
return 1;
}

const char *fai_parse_region(const faidx_t *fai, const char *s,
int *tid, int64_t *beg, int64_t *end, int flags)
{
return hts_parse_region(s, tid, beg, end, (hts_name2id_f)fai_name2id, (void *)fai, flags);
}
Loading