Skip to content

Commit

Permalink
Changes faidx to use the standard region parser.
Browse files Browse the repository at this point in the history
Previously it had its own custom parser which nearly, but not quite,
matched the old hts_parse_reg code in functionality.

Note this changes the fasta coordinate type from long to int, which is
a backwards step.  However it is expected this will subsequently
change to hts_pos_t in another PR.
  • Loading branch information
jkbonfield committed Jun 7, 2018
1 parent 9a3cacf commit 6e799ec
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 65 deletions.
89 changes: 24 additions & 65 deletions faidx.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ DEALINGS IN THE SOFTWARE. */
#include "hts_internal.h"

typedef struct {
int id; // faidx_t->name[id] is for this struct.
int32_t line_len, line_blen;
int64_t len;
uint64_t offset;
Expand All @@ -60,6 +61,13 @@ struct __faidx_t {
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

static int fai_name2id(void *v, const char *ref)
{
faidx_t *fai = (faidx_t *)v;
khint_t k = kh_get(s, fai->hash, ref);
return k == kh_end(fai->hash) ? -1 : kh_val(fai->hash, k).id;
}

static inline int fai_insert_index(faidx_t *idx, const char *name, int64_t len, int line_len, int line_blen, uint64_t offset)
{
if (!name) {
Expand Down Expand Up @@ -87,6 +95,7 @@ static inline int fai_insert_index(faidx_t *idx, const char *name, int64_t len,
}
idx->name = tmp;
}
v->id = idx->n;
idx->name[idx->n++] = name_key;
v->len = len;
v->line_len = line_len;
Expand Down Expand Up @@ -456,84 +465,30 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,

char *fai_fetch(const faidx_t *fai, const char *str, int *len)
{
char *s, *ep;
size_t i, l, k, name_end;
khiter_t iter;
faidx1_t val;
khash_t(s) *h;
long beg, end;

beg = end = -1;
h = fai->hash;
name_end = l = strlen(str);
s = (char*)malloc(l+1);
if (!s) {
*len = -1;
return NULL;
int id, beg, end; // FIXME: was long, and will be again too after #709 lands.
if (!fai_parse_region(fai, str, &id, &beg, &end, 0)) {
hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str);
*len = -2;
return 0;
}

// remove space
for (i = k = 0; i < l; ++i)
if (!isspace_c(str[i])) s[k++] = str[i];
s[k] = 0;
name_end = l = k;
// determine the sequence name
for (i = l; i > 0; --i) if (s[i - 1] == ':') break; // look for colon from the end
if (i > 0) name_end = i - 1;
if (name_end < l) { // check if this is really the end
int n_hyphen = 0;
for (i = name_end + 1; i < l; ++i) {
if (s[i] == '-') ++n_hyphen;
else if (!isdigit_c(s[i]) && s[i] != ',') break;
}
if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
s[name_end] = 0;
iter = kh_get(s, h, s);
if (iter == kh_end(h)) { // cannot find the sequence name
iter = kh_get(s, h, str); // try str as the name
if (iter != kh_end(h)) {
s[name_end] = ':';
name_end = l;
}
}
} else iter = kh_get(s, h, str);
if(iter == kh_end(h)) {
hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str);
free(s);
h = fai->hash;
iter = kh_get(s, h, faidx_iseq(fai, id));
if (!iter) {
// should have already been caught above
abort();
*len = -2;
return 0;
}
val = kh_value(h, iter);
// parse the interval
if (name_end < l) {
int save_errno = errno;
errno = 0;
for (i = k = name_end + 1; i < l; ++i)
if (s[i] != ',') s[k++] = s[i];
s[k] = 0;
if (s[name_end + 1] == '-') {
beg = 0;
i = name_end + 2;
} else {
beg = strtol(s + name_end + 1, &ep, 10);
for (i = ep - s; i < k;) if (s[i++] == '-') break;
}
end = i < k? strtol(s + i, &ep, 10) : val.len;
if (beg > 0) --beg;
// Check for out of range numbers. Only going to be a problem on
// 32-bit platforms with >2Gb sequence length.
if (errno == ERANGE && (uint64_t) val.len > LONG_MAX) {
hts_log_error("Positions in range %s are too large for this platform", s);
free(s);
*len = -2;
return NULL;
}
errno = save_errno;
} else beg = 0, end = val.len;

if (beg >= val.len) beg = val.len;
if (end >= val.len) end = val.len;
if (beg > end) beg = end;
free(s);

// now retrieve the sequence
return fai_retrieve(fai, &val, beg, end, len);
Expand Down Expand Up @@ -592,3 +547,7 @@ int faidx_has_seq(const faidx_t *fai, const char *seq)
return 1;
}

const char *fai_parse_region(const faidx_t *fai, const char *s, int *tid, int *beg, int *end, int flags)
{
return hts_parse_region(s, tid, beg, end, (hts_name2id_f)fai_name2id, (void *)fai, flags);
}
11 changes: 11 additions & 0 deletions htslib/faidx.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,17 @@ const char *faidx_iseq(const faidx_t *fai, int i);
/// Return sequence length, -1 if not present
int faidx_seq_len(const faidx_t *fai, const char *seq);

/// Parses a region string.
/** @param fai Pointer to the faidx_t struct
@param s Region string
@param tid Returns which i-th sequence is described in the region.
@param beg Returns the start of the region (0 based)
@param end Returns the one past last of the region (0 based)
@param flags Parsing method, see HTS_PARSE_* in hts.h.
@return pointer to end of parsed s if successs, NULL if not.
*/
const char *fai_parse_region(const faidx_t *fai, const char *s, int *tid, int *beg, int *end, int flags);

#ifdef __cplusplus
}
#endif
Expand Down

0 comments on commit 6e799ec

Please sign in to comment.