Skip to content

Commit

Permalink
Allow initial whitespace in FASTA ">" headers
Browse files Browse the repository at this point in the history
Fixes samtools/samtools#449.
Also ensure an empty name works; fixes #258, hat tip @mtmorgan.

Add test/faidx.fa test cases, with unnamed sequence, extra whitespace,
and tests for previously-fixed blank line-related bugs fixed in
1980e58.

Fix memory leaks introduced by 642783e.

[NEWS]
* fai_build() and samtools faidx now accept initial whitespace in ">"
  headers (e.g., "> chr1 description" is taken to refer to "chr1")
  • Loading branch information
jmarshall committed Aug 25, 2015
1 parent 82203a2 commit 306664a
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 42 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ hts.o hts.pico: hts.c config.h version.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram
vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) $(htslib_khash_str2int_h)
sam.o sam.pico: sam.c config.h $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h)
tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_khash_h)
faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h)
faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h)
synced_bcf_reader.o synced_bcf_reader.pico: synced_bcf_reader.c config.h $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(htslib_bgzf_h)
vcf_sweep.o vcf_sweep.pico: vcf_sweep.c config.h $(htslib_vcf_sweep_h) $(htslib_bgzf_h)
vcfutils.o vcfutils.pico: vcfutils.c config.h $(htslib_vcfutils_h)
Expand Down Expand Up @@ -289,7 +289,7 @@ tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htsl
check test: $(BUILT_TEST_PROGRAMS)
test/fieldarith test/fieldarith.sam
test/hfile
test/sam test/ce.fa
test/sam test/ce.fa test/faidx.fa
test/test-regidx
cd test && REF_PATH=: ./test_view.pl
cd test && ./test.pl
Expand Down
9 changes: 4 additions & 5 deletions faidx.5
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
'\" t
.TH faidx 5 "August 2013" "htslib" "Bioinformatics formats"
.TH faidx 5 "August 2015" "htslib" "Bioinformatics formats"
.SH NAME
faidx \- an index enabling random access to FASTA files
.\"
.\" Copyright (C) 2013 Genome Research Ltd.
.\" Copyright (C) 2013, 2015 Genome Research Ltd.
.\"
.\" Author: John Marshall <[email protected]>
.\"
Expand Down Expand Up @@ -98,9 +98,8 @@ or other line termination, the newline characters present must be consistent,
at least within each reference sequence.
.P
The \fBsamtools\fP implementation uses the first word of the "\fB>\fP" header
line text (i.e., up to the first whitespace character) as the \fBNAME\fP column.
At present, there may be no whitespace between the
">" character and the \fIname\fP.
line text (i.e., up to the first whitespace character, having skipped any
initial whitespace after the ">") as the \fBNAME\fP column.
.SH EXAMPLE
For example, given this FASTA file
.LP
Expand Down
56 changes: 27 additions & 29 deletions faidx.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/faidx.h"
#include "htslib/hfile.h"
#include "htslib/khash.h"
#include "htslib/kstring.h"

typedef struct {
int32_t line_len, line_blen;
Expand Down Expand Up @@ -80,9 +81,8 @@ static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int

faidx_t *fai_build_core(BGZF *bgzf)
{
char *name;
kstring_t name = { 0, 0, NULL };
int c;
int l_name, m_name;
int line_len, line_blen, state;
int l1, l2;
faidx_t *idx;
Expand All @@ -91,7 +91,6 @@ faidx_t *fai_build_core(BGZF *bgzf)

idx = (faidx_t*)calloc(1, sizeof(faidx_t));
idx->hash = kh_init(s);
name = 0; l_name = m_name = 0;
len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
while ( (c=bgzf_getc(bgzf))>=0 ) {
if (c == '\n') { // an empty line
Expand All @@ -103,30 +102,25 @@ faidx_t *fai_build_core(BGZF *bgzf)
}
if (c == '>') { // fasta header
if (len >= 0)
fai_insert_index(idx, name, len, line_len, line_blen, offset);
l_name = 0;
while ( (c=bgzf_getc(bgzf))>=0 && !isspace(c)) {
if (m_name < l_name + 2) {
m_name = l_name + 2;
kroundup32(m_name);
name = (char*)realloc(name, m_name);
}
name[l_name++] = c;
}
name[l_name] = '\0';
fai_insert_index(idx, name.s, len, line_len, line_blen, offset);

name.l = 0;
while ((c = bgzf_getc(bgzf)) >= 0)
if (! isspace(c)) kputc_(c, &name);
else if (name.l > 0 || c == '\n') break;
kputsn("", 0, &name);

if ( c<0 ) {
fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
free(name); fai_destroy(idx);
return 0;
goto fail;
}
if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
state = 1; len = 0;
offset = bgzf_utell(bgzf);
} else {
if (state == 3) {
fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
free(name); fai_destroy(idx);
return 0;
fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name.s);
goto fail;
}
if (state == 2) state = 3;
l1 = l2 = 0;
Expand All @@ -135,9 +129,8 @@ faidx_t *fai_build_core(BGZF *bgzf)
if (isgraph(c)) ++l2;
} while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
if (state == 3 && l2) {
fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
free(name); fai_destroy(idx);
return 0;
fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name.s);
goto fail;
}
++l1; len += l2;
if (state == 1) line_len = l1, line_blen = l2, state = 0;
Expand All @@ -146,15 +139,19 @@ faidx_t *fai_build_core(BGZF *bgzf)
}
}
}
if ( name )
fai_insert_index(idx, name, len, line_len, line_blen, offset);

if (len >= 0)
fai_insert_index(idx, name.s, len, line_len, line_blen, offset);
else
{
free(idx);
return NULL;
}
free(name);
goto fail;

free(name.s);
return idx;

fail:
free(name.s);
fai_destroy(idx);
return NULL;
}

void fai_save(const faidx_t *fai, FILE *fp)
Expand Down Expand Up @@ -229,6 +226,7 @@ int fai_build(const char *fn)
if ( !fai )
{
if ( bgzf->is_compressed && bgzf->is_gzip ) fprintf(stderr,"Cannot index files compressed with gzip, please use bgzip\n");
bgzf_close(bgzf);
free(str);
return -1;
}
Expand Down
17 changes: 17 additions & 0 deletions test/faidx.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
>
ATGC
>trailingblank1
AAATTTGGGCCC
TTTGGGCCCAAA
GGGCCCAAA

>trailingblank2 with last dna line the same length
AAATTTGGGCCCAAATTTGGGCCC
TTTGGGCCCAAATTTGGGCCCAAA
GGGCCCAAATTTGGGCCCAAATTT

> foo
TGCATG
CA
> bar description
TTTTAAAA
33 changes: 27 additions & 6 deletions test/sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -162,26 +162,47 @@ static void iterators1(void)

static void faidx1(const char *filename)
{
int n;
faidx_t *fai = fai_load(filename);
if (fai == NULL) fail("can't load faidx file");
int n, n_exp = 0;
char tmpfilename[FILENAME_MAX], line[500];
FILE *fin, *fout;
faidx_t *fai;

fin = fopen(filename, "r");
if (fin == NULL) fail("can't open %s\n", filename);
sprintf(tmpfilename, "%s.tmp", filename);
fout = fopen(tmpfilename, "w");
if (fout == NULL) fail("can't create temporary %s\n", tmpfilename);
while (fgets(line, sizeof line, fin)) {
if (line[0] == '>') n_exp++;
fputs(line, fout);
}
fclose(fin);
fclose(fout);

if (fai_build(tmpfilename) < 0) fail("can't index %s", tmpfilename);
fai = fai_load(tmpfilename);
if (fai == NULL) fail("can't load faidx file %s", tmpfilename);

n = faidx_fetch_nseq(fai);
if (n != 7) fail("faidx_fetch_nseq returned %d, expected 7", n);
if (n != n_exp)
fail("%s: faidx_fetch_nseq returned %d, expected %d", filename, n, n_exp);

n = faidx_nseq(fai);
if (n != 7) fail("faidx_nseq returned %d, expected 7", n);
if (n != n_exp)
fail("%s: faidx_nseq returned %d, expected %d", filename, n, n_exp);

fai_destroy(fai);
}

int main(int argc, char **argv)
{
int i;

status = EXIT_SUCCESS;

aux_fields1();
iterators1();
if (argc >= 2) faidx1(argv[1]);
for (i = 1; i < argc; i++) faidx1(argv[i]);

return status;
}

0 comments on commit 306664a

Please sign in to comment.