-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_ctrees.h
607 lines (517 loc) · 26.6 KB
/
parse_ctrees.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
/* File: parse_ctrees.h */
/*
This file is a part of the ``parse_ctrees`` package
Copyright (C) 2018-- Manodeep Sinha ([email protected])
License: MIT LICENSE. See LICENSE file under the top-level
directory at https://github.com/manodeep/parse_ctrees/
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <limits.h> /* for INT_MAX */
#include <fcntl.h>
#include <string.h>
#include <strings.h> /* for strcasecmp (POSIX) */
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>
#include <stddef.h> /* for offsetof macro*/
#include "sglib.h"
/* Compile-time configuration.
   Each of the following macros can be overridden by defining it *before*
   including this file; otherwise the defaults below are used. */

/* Maximum number of CTREES columns that can be requested
   (note: it is okay for the ctrees `tree_?_?_?.dat` files themselves to contain more columns) */
#ifndef PARSE_CTREES_MAX_NCOLS
#define PARSE_CTREES_MAX_NCOLS 128
#endif

/* Maximum number of characters (including the terminating NUL) in a CTREES column
   name in the `tree_?_?_?.dat` file */
#ifndef PARSE_CTREES_MAX_COLNAME_LEN
#define PARSE_CTREES_MAX_COLNAME_LEN 64
#endif

/* Maximum number of bytes to read in one call. Must be larger than the number
   of characters in one line in the `tree_?_?_?.dat` file.
   Since the reads are unbuffered, increasing this value
   may lead to better performance */
#ifndef PARSE_CTREES_MAXBUFSIZE
#define PARSE_CTREES_MAXBUFSIZE 1240
#endif

#if PARSE_CTREES_MAX_COLNAME_LEN < 64
#error Some of the Consistent-Trees column names are long. Please increase PARSE_CTREES_MAX_COLNAME_LEN to be at least 64
#endif
#if PARSE_CTREES_MAXBUFSIZE < 480
#error The read buffer must be able to hold one full line of a Consistent-Trees file. Please increase PARSE_CTREES_MAXBUFSIZE to be at least 480
#endif
/* Function-like macros */
/* PARSE_CTREES_XASSERT(EXP, EXIT_STATUS, fmt, ...)
   Debug-time check that makes the *enclosing function return* when it fails.

   - When NDEBUG is defined the macro expands to an empty statement: EXP is
     neither evaluated nor checked, so it must not be relied upon for checks
     that have to survive release builds, and EXP must not have side effects
     the code depends on.
   - Otherwise, when EXP is false, it prints the file, function, line and the
     text of EXP to stderr, followed by the printf-style message given in the
     variadic arguments (at least a format string is required), and then
     executes `return EXIT_STATUS;` in the function where the macro is used.
     Nothing is cleaned up before that return, so any resource held at the
     call site is leaked on failure. */
#ifdef NDEBUG
#define PARSE_CTREES_XASSERT(EXP, EXIT_STATUS, ...) \
do { \
} while (0)
#else
#define PARSE_CTREES_XASSERT(EXP, EXIT_STATUS, ...) \
do { \
if (!(EXP)) { \
fprintf(stderr, "Error in file: %s\tfunc: %s\tline: %d with expression `" #EXP "'\n", \
__FILE__, __FUNCTION__, __LINE__); \
fprintf(stderr, __VA_ARGS__); \
return EXIT_STATUS; \
} \
} while (0)
#endif
/* Valid numeric types for the *destination* of any CTrees column.
   A column that holds an 'int' in the CTrees file may be read into a 'double'
   destination; the type given here always refers to the destination, i.e., when
   a CTrees 'int' is being read into a 'double', the type should be specified
   as 'F64'.
   All the actual reading from the `tree_?_?_?.dat` files is done
   as strings. The appropriate string-to-number conversion is
   chosen based on the destination numeric type. */
enum parse_numeric_types
{
I32 = 0, /* int32_t */
I64 = 1, /* int64_t */
U32 = 2, /* uint32_t */
U64 = 3, /* uint64_t */
F32 = 4, /* float */
F64 = 5, /* double */
num_numeric_types /* number of types above; valid values lie in [I32, num_numeric_types) */
};
/* Because we do not know a priori how many halos will be in a tree,
   the destination arrays have to be re-allocated as and when necessary. Therefore, we
   need to keep track of all the "independent" arrays, every one of which is
   re-allocated whenever more space is required.
   The idea is that columns get parsed into multiple arrays, all of which are
   kept in sync via the array index -- viz., the N'th parsed line gets assigned to
   arr0[N-1], arr1[N-1], arr2[N-1] ... and so on (where the array elements themselves
   can be `structs`).
   base_ptr_info is simply the unique set of all the destination memory
   locations for the columns requested from CTREES.
   See the examples in the associated `main.c` for usage. The user is expected
   to populate this struct. */
struct base_ptr_info {
int64_t num_base_ptrs;/* number of entries in use in `base_ptrs` and `base_element_size` */
void **base_ptrs[PARSE_CTREES_MAX_NCOLS];/* address of each user pointer; because the pointers may need to be re-allocated, I need 'void **' */
size_t base_element_size[PARSE_CTREES_MAX_NCOLS];/* sizeof(**(base_ptrs[i])) --> in bytes; also used as the stride between consecutive rows */
union {
/* nallocated is the number of elements allocated for each
element in base_ptrs (i.e., alloc'ed/re-alloc'ed via the void **base_ptrs).
Both names refer to the same storage. */
int64_t nhalos_allocated;/* number of elements allocated in each `base_ptr` */
int64_t nallocated;
};
/* the number of elements that have been read into *base_ptrs
(all three names refer to the same storage) */
union {
int64_t N;/* number of rows read */
int64_t nhalos; /* for convenience */
int64_t nhalos_read;/* for convenience */
};
};
/* This struct contains info about:
   how many columns to parse from each line (ncols),
   which column numbers need to be parsed (column_number),
   what the destination numeric types are (field_types),
   which one of the base_ptrs each one of the columns needs to be assigned to (base_ptr_idx),
   and how to access the relevant memory address within the N'th element of that base pointer (dest_offset_to_element).
   All members are fixed-size arrays stored inside the struct itself, which avoids pesky issues of malloc/free.
   The code populates this struct in the function `parse_header_ctrees`, based on the columns requested by the
   user (the `column_names` argument of that function).
*/
struct ctrees_column_to_ptr {
int64_t ncols;/* number of columns that need to be parsed on each line (all these columns *DO* exist) */
int32_t column_number[PARSE_CTREES_MAX_NCOLS];/* column number in CTREES data */
enum parse_numeric_types field_types[PARSE_CTREES_MAX_NCOLS];/* destination data-type, i.e, how to parse the string into a valid numeric value */
int64_t base_ptr_idx[PARSE_CTREES_MAX_NCOLS];/* index into the base_ptr array within base_ptr_info struct */
/* dest_offset_to_element:
For array-of-structures (AOS) type base-ptrs, this is the offsetof(struct-type, field-name)
For structure-of-arrays (SOA) type base-ptrs, this should be 0
This offset must be >= 0 and < size of each element of the base ptr.
For offset values that are not 0, always use the `offsetof` macro (from <stddef.h>)
to derive the byte offset of each field */
size_t dest_offset_to_element[PARSE_CTREES_MAX_NCOLS];/* in bytes */
};
/* Matches the array of wanted CTREES columns (``wanted_columns``) against
   the column names that were found in the CTREES output (``names``).

   ``wanted_columns`` : array of ``nwanted`` NUL-terminated column names
   ``nwanted``        : number of elements in ``wanted_columns`` (must be >= 0)
   ``names``          : array of ``totncols`` NUL-terminated column names from the file
   ``totncols``       : number of elements in ``names``

   Returns a newly allocated integer array of ``nwanted`` elements, where each element
   contains the (0-based) column number in the CTREES output if a match was found, and -1
   otherwise. When a name occurs more than once in ``names`` the first occurrence wins.
   The caller owns the returned array and must release it with ``free``.
   Returns NULL (after printing a message to stderr) if ``nwanted`` is negative or if the
   memory could not be allocated.

   The string matching of column names is case-insensitive. Progress messages are
   printed to stderr.
*/
static inline int * match_column_name(const char (*wanted_columns)[PARSE_CTREES_MAX_COLNAME_LEN], const int nwanted, const char (*names)[PARSE_CTREES_MAX_COLNAME_LEN], const int totncols)
{
    if(nwanted < 0) {
        fprintf(stderr, "Error: The number of requested columns must be non-negative. Got %d instead\n", nwanted);
        return NULL;
    }

    /* Always request at least one element: calloc(0, ...) is allowed to return NULL,
       and a NULL return from this function must unambiguously mean 'error'. */
    const size_t nalloc = (nwanted > 0) ? (size_t) nwanted : 1;
    int *columns = calloc(nalloc, sizeof(*columns));
    /* This check must not be compiled out in release builds (the loop below writes into `columns`) */
    if(columns == NULL) {
        fprintf(stderr, "Error: Could not allocate memory for reading in the columns for each of the %d fields\n",
                nwanted);
        return NULL;
    }

    /* -1 marks 'not found' */
    for(int i=0;i<nwanted;i++) {
        columns[i] = -1;
    }

    int nfound = 0;
    for(int i=0;i<nwanted;i++) {
        const char *wanted_colname = wanted_columns[i];
        for(int j=0;j<totncols;j++) {
            const char *file_colname = names[j];
            if(strcasecmp(wanted_colname, file_colname) == 0) {
                fprintf(stderr, "Found `%s` in column # %d as with name `%s`\n", wanted_colname, j, file_colname);
                columns[i] = j;
                nfound++;
                break;
            }
        }

        if(columns[i] == -1) {
            fprintf(stderr,"Did not find requested column `%s'\n", wanted_colname);
        }
    }
    fprintf(stderr,"Found %d columns out of the requested %d\n", nfound, nwanted);

    return columns;
}
/* Reallocates each one of the base pointers to hold ``new_N`` elements.

   ``base_info`` : the set of user pointers (and their element sizes) to resize
   ``new_N``     : the new number of elements for every base pointer; must be > 0

   Returns EXIT_SUCCESS after all pointers have been resized, in which case
   ``base_info->nallocated`` is set to ``new_N``.
   Returns EXIT_FAILURE (after printing a message to stderr) if ``new_N`` is not positive,
   if an element size is 0, if the requested number of bytes does not fit in a size_t,
   or if ``realloc`` fails. On failure ``base_info->nallocated`` is left unchanged and all
   user pointers remain valid (pointers processed before the failure have already been
   resized to ``new_N`` elements).
*/
static inline int reallocate_base_ptrs(struct base_ptr_info *base_info, const int64_t new_N)
{
    fprintf(stderr,"reallocating from %"PRId64" elements to a %"PRId64" elements. current N = %"PRId64"\n",
            base_info->nallocated, new_N, base_info->N);

    /* realloc(ptr, 0) may free the buffer *and* return NULL, which would leave the
       user's pointer dangling while being reported as an allocation failure */
    if(new_N <= 0) {
        fprintf(stderr,"Error: The requested number of elements must be positive. Got %"PRId64" instead\n", new_N);
        return EXIT_FAILURE;
    }

    for(int64_t i=0;i<base_info->num_base_ptrs;i++) {
        void **this_ptr = base_info->base_ptrs[i];
        const size_t size = base_info->base_element_size[i];

        /* guard against a zero-byte request and against overflow in `size * new_N` */
        if(size == 0 || (uint64_t) new_N > SIZE_MAX / size) {
            fprintf(stderr,"Error: Can not allocate %"PRId64" elements, each of size = %zu bytes, for base pointer # %"PRId64"\n",
                    new_N, size, i);
            return EXIT_FAILURE;
        }

        void *tmp = realloc(*this_ptr, size * (size_t) new_N);
        if(tmp == NULL) {
            fprintf(stderr,"Error: Failed to re-allocated memory to go from %"PRId64" to %"PRId64" elements, each of size = %zu bytes\n",
                    base_info->nallocated, new_N, size);
            perror(NULL);
            return EXIT_FAILURE;
        }

        /* we have successfully re-allocated => assign the new pointer address */
        *this_ptr = tmp;
    }
    base_info->nallocated = new_N;

    return EXIT_SUCCESS;
}
/* Reads one complete line (up to and including the newline, if present) from ``fp``
   into a newly allocated, NUL-terminated buffer. The buffer grows as required, so the
   line length is not limited by PARSE_CTREES_MAXBUFSIZE.
   Returns NULL if nothing could be read or if memory could not be allocated.
   The caller must ``free`` the returned buffer. */
static inline char * parse_ctrees_read_header_line(FILE *fp)
{
    size_t capacity = PARSE_CTREES_MAXBUFSIZE;
    size_t len = 0;
    char *line = malloc(capacity);
    if(line == NULL) {
        return NULL;
    }
    line[0] = '\0';

    for(;;) {
        if(fgets(line + len, (int) (capacity - len), fp) == NULL) {
            break;/* end-of-file or read error */
        }
        len += strlen(line + len);
        if(len > 0 && line[len - 1] == '\n') {
            break;/* got the complete line */
        }
        if(capacity - len > 1) {
            continue;/* buffer is not full -> the next fgets call will report end-of-file */
        }

        /* buffer is full but the line has not ended -> double the buffer.
           `capacity` is kept <= INT_MAX because fgets takes an `int` size */
        if(capacity > (size_t) INT_MAX / 2) {
            free(line);
            return NULL;
        }
        char *tmp = realloc(line, 2 * capacity);
        if(tmp == NULL) {
            free(line);
            return NULL;
        }
        line = tmp;
        capacity *= 2;
    }

    if(len == 0) {
        free(line);
        return NULL;
    }
    return line;
}

/* Parses the header (first line) of the Consistent-Trees file ``filename`` and works out
   where each requested column is located in the file.

   ``column_names``           : [in/out] the ``nfields`` requested column names
   ``field_types``            : [in/out] destination numeric type for each requested column
   ``base_ptr_idx``           : [in/out] index of the destination base pointer for each requested column
   ``dest_offset_to_element`` : [in/out] byte offset within the destination element for each requested column
   ``nfields``                : number of requested columns (must be in [0, PARSE_CTREES_MAX_NCOLS])
   ``filename``               : the Consistent-Trees file whose first line is the header
   ``column_info``            : [out] populated with the requested columns that exist in the file

   The header must start with '#'. Column names are separated by spaces or commas, and
   anything from the first '(' within a name is discarded (e.g., `scale(0)` -> `scale`).
   Names are matched case-insensitively.

   The four input arrays are sorted *in place* (and in sync) in ascending order of the
   matched column number; requested columns that were not found sort first and are not
   copied into ``column_info``. Therefore ``column_info->ncols`` can be smaller than ``nfields``.

   Returns EXIT_SUCCESS on success and EXIT_FAILURE (after printing a message to stderr)
   on any error. All memory and the file handle are released on every path.
*/
static inline int parse_header_ctrees(char (*column_names)[PARSE_CTREES_MAX_COLNAME_LEN], enum parse_numeric_types *field_types,
                                      int64_t *base_ptr_idx, size_t *dest_offset_to_element,
                                      const int64_t nfields, const char *filename, struct ctrees_column_to_ptr *column_info)
{
    /* Because the arrays within column_info have a fixed size,
       need to check that nfields can fit */
    if(nfields > PARSE_CTREES_MAX_NCOLS) {
        fprintf(stderr,"Error: You have requested %"PRId64" columns but there is only space to store %"PRId64"\n",nfields, (int64_t) PARSE_CTREES_MAX_NCOLS);
        fprintf(stderr,"Please define the macro variable `PARSE_CTREES_MAX_NCOLS' to be larger than %"PRId64" (before including the file `%s')\n",
                nfields, __FILE__);
        return EXIT_FAILURE;
    }
    if(nfields < 0) {
        fprintf(stderr,"Error: The number of requested columns must be non-negative. Got %"PRId64" instead\n", nfields);
        return EXIT_FAILURE;
    }

    /* every resource that is released under the `cleanup` label is declared (and initialised) here */
    int status = EXIT_FAILURE;
    char *header = NULL;
    char (*names)[PARSE_CTREES_MAX_COLNAME_LEN] = NULL;
    int *matched_columns = NULL;
    const char delimiters[] = " ,\n#";/* space, comma, new-line, and #*/
    int totncols = 0;
    int col = 0;

    FILE *fp = fopen(filename, "r");
    if(fp == NULL) {
        fprintf(stderr,"Error: Could not open file `%s'\n",filename);
        perror(NULL);
        return EXIT_FAILURE;
    }

    header = parse_ctrees_read_header_line(fp);
    if(header == NULL) {
        fprintf(stderr,"Error: Could not read the first line (the header) in the file `%s'\n", filename);
        perror(NULL);
        fclose(fp);
        return EXIT_FAILURE;
    }
    /* only need the first line -> close the file */
    fclose(fp);

    /* first check that the first character is a '#' */
    if(header[0] != '#') {
        fprintf(stderr,"Error: Consistent-Trees output always contain '#' as the comment character\n"
                "However, the first character in the buffer is '%c'\nEntire line is `%s'", header[0], header);
        goto cleanup;
    }

    /* Count the column names. CTREES currently uses white-space but this parsing will also
       use the comma to break (i.e., if, in the future, the CTREES format changes
       to using comma's, the code will still work).
       The counting and the extraction below use exactly the same tokenisation,
       so the two can never disagree. */
    for(const char *p = header; *p != '\0'; ) {
        p += strspn(p, delimiters);/* skip any run of delimiters */
        const size_t toklen = strcspn(p, delimiters);
        if(toklen == 0) break;/* reached the end of the header */
        totncols++;
        p += toklen;
    }
    if(totncols == 0) {
        fprintf(stderr,"Error: Did not find any column names in the header of the file `%s'\n", filename);
        goto cleanup;
    }

    names = calloc((size_t) totncols, sizeof(*names));
    if(names == NULL) {
        fprintf(stderr,"Error: Could not allocate memory to store each column name (total size requested = %zu bytes)\n",
                (size_t) totncols * sizeof(*names));
        goto cleanup;
    }

    /* now extract the column names */
    for(const char *p = header; *p != '\0' && col < totncols; ) {
        p += strspn(p, delimiters);
        const size_t toklen = strcspn(p, delimiters);
        if(toklen == 0) break;

        /* the column name ends at the first '(' -- e.g., `scale(0)` is stored as `scale` */
        size_t namelen = 0;
        while(namelen < toklen && p[namelen] != '(') {
            namelen++;
        }

        /* must leave space for the terminating NUL */
        if(namelen >= PARSE_CTREES_MAX_COLNAME_LEN) {
            fprintf(stderr,"Error: The column name `%.*s' contains %zu characters but at most %d characters can be stored\n",
                    (int) namelen, p, namelen, (int) PARSE_CTREES_MAX_COLNAME_LEN - 1);
            fprintf(stderr,"Please increase the macro variable `PARSE_CTREES_MAX_COLNAME_LEN'\n");
            goto cleanup;
        }
        memcpy(names[col], p, namelen);
        names[col][namelen] = '\0';
        col++;
        p += toklen;
    }

    matched_columns = match_column_name((const char (*)[PARSE_CTREES_MAX_COLNAME_LEN])column_names, (int) nfields,
                                        (const char (*)[PARSE_CTREES_MAX_COLNAME_LEN]) names, col);
    if(matched_columns == NULL) {
        goto cleanup;
    }

    /* Sort the matched_columns in ascending order, and apply the same
       permutation to all of the other per-column arrays */
#define SGLIB_CHAR_ARRAY_ELEMENTS_EXCHANGER(maxlen, a, i, j) {char _sgl_aee_tmp_[maxlen]; memmove(_sgl_aee_tmp_, (a)[(i)], maxlen);memmove((a)[(i)], (a)[(j)], maxlen); memmove((a)[(j)], _sgl_aee_tmp_, maxlen);}
#define MULTIPLE_ARRAY_EXCHANGER(type,a,i,j) { \
        SGLIB_ARRAY_ELEMENTS_EXCHANGER(enum parse_numeric_types, field_types,i,j); \
        SGLIB_ARRAY_ELEMENTS_EXCHANGER(int, matched_columns, i, j); \
        SGLIB_ARRAY_ELEMENTS_EXCHANGER(int64_t, base_ptr_idx, i, j); \
        SGLIB_ARRAY_ELEMENTS_EXCHANGER(size_t, dest_offset_to_element, i, j); \
        SGLIB_CHAR_ARRAY_ELEMENTS_EXCHANGER(PARSE_CTREES_MAX_COLNAME_LEN, column_names, i, j); \
    }
    SGLIB_ARRAY_QUICK_SORT(int, matched_columns, nfields, SGLIB_NUMERIC_COMPARATOR , MULTIPLE_ARRAY_EXCHANGER);
#undef SGLIB_CHAR_ARRAY_ELEMENTS_EXCHANGER
#undef MULTIPLE_ARRAY_EXCHANGER

    /* now assign only the columns that were found
       into the column_info struct */
    column_info->ncols = 0;
    for(int64_t i=0;i<nfields;i++) {
        if(matched_columns[i] == -1) continue;

        const int64_t icol = column_info->ncols;
        column_info->column_number[icol] = matched_columns[i];
        column_info->field_types[icol] = field_types[i];
        column_info->dest_offset_to_element[icol] = dest_offset_to_element[i];
        column_info->base_ptr_idx[icol] = base_ptr_idx[i];
        column_info->ncols++;
    }
    status = EXIT_SUCCESS;

cleanup:
    free(matched_columns);
    free(names);
    free(header);
    return status;
}
static inline int parse_line_ctrees(const char *linebuf, const struct ctrees_column_to_ptr *column_info, struct base_ptr_info *base_ptr_info)
{
if(base_ptr_info->nallocated == base_ptr_info->N) {
const double large_N_memory_increase_fac = 1.2;
const int64_t small_N_memory_increase_fac = 2;
/* double (:=`small_N_memory_increase_fac`) the memory requested for small numbers, otherwise increase by `large_N_memory_increase_fac` */
const int64_t thresh_N_for_large_memory = 1000000;/* small_N_memory_increase_fac * N, for N less than this threshold*/
const int64_t new_N = (base_ptr_info->N < thresh_N_for_large_memory) ? (base_ptr_info->N*small_N_memory_increase_fac): (base_ptr_info->N*large_N_memory_increase_fac);
int status = reallocate_base_ptrs(base_ptr_info, new_N);
if(status != EXIT_SUCCESS) return status;
XASSERT(base_ptr_info->nallocated > base_ptr_info->N,
EXIT_FAILURE,
"Error: Something went wrong while memory reallocation "
"nallocated = %"PRId64" should have been larger than N = %"PRId64"\n",
base_ptr_info->nallocated, base_ptr_info->N);
}
int icol = -1;
char *tofree, *string;
/* fprintf(stderr,"in %s>linebuf = `%s`\n\n", __FUNCTION__, linebuf); */
tofree = string = strdup(linebuf);
char *token = NULL;
for(int i=0;i<column_info->ncols;i++) {
const int wanted_col = column_info->column_number[i];
const int64_t base_ptr_idx = column_info->base_ptr_idx[i];
PARSE_CTREES_XASSERT(base_ptr_idx < base_ptr_info->num_base_ptrs,
EXIT_FAILURE,
"Error: Valid values for base pointer index must be in range [0, %"PRId64"). Got %"PRId64" instead\n",
base_ptr_info->num_base_ptrs, base_ptr_idx);
char *dest = *((char **) (base_ptr_info->base_ptrs[base_ptr_idx]));
const size_t base_ptr_stride = base_ptr_info->base_element_size[base_ptr_idx];
const size_t dest_offset = column_info->dest_offset_to_element[i];
PARSE_CTREES_XASSERT(base_ptr_stride >= 4,
EXIT_FAILURE,
"Error: Stride=%zu is expected in bytes with a minimum of 4 bytes since that's "
"the smallest data-type supported (corresponding to float or int32_t).\n"
"Perhaps you forgot to multiply by the sizeof(element)?\n",
base_ptr_stride);
PARSE_CTREES_XASSERT(dest_offset < base_ptr_stride,
EXIT_FAILURE,
"Error: The offset from the starting address of an element can be at most the total stride in bytes\n"
"In this case offset=%zu must in the half-open range [0, %zu). Perhaps you mis-typed the offset?\n",
dest_offset, base_ptr_stride);
/* get to the starting offset for this N'th element */
dest += base_ptr_info->N * base_ptr_stride;
/* now get to this particular field */
dest += dest_offset;
/* this is the type for the destination (hence called 'field_types' rather than 'column_types') */
const enum parse_numeric_types wanted_type = column_info->field_types[i];
/* there might be duplicate column numbers in matched_columns
then the following while loop should immediately exit (without
executing any lines within)
and we will re-use the previous parsed value of token */
while(icol < wanted_col || token == NULL) {
token = strsep(&string, " ,");
if(token != NULL && ((token[0] == '\0') || (token[0] == ' '))) {
continue;
}
icol++;
}
PARSE_CTREES_XASSERT(token != NULL && token[0] != '\0' && icol == wanted_col,
EXIT_FAILURE,
"Error: token=`%s` should have valid non-zero numeric value at this stage.\n"
"And the parsed col = %d should be equal to the requested column = %d\n",
token, icol, wanted_col);
switch(wanted_type) {
case F32:{
float tmp = strtof(token, NULL);
/* fprintf(stderr,"[float] := %f (col #%d)\n", tmp, wanted_col); */
*((float *) dest) = tmp;
break;
}
case F64:{
double tmp = strtod(token, NULL);
/* fprintf(stderr,"[double] := %lf (col #%d)\n", tmp, wanted_col); */
*((double *) dest) = tmp;
break;
}
case I32:{
int32_t tmp = (int32_t) strtol(token, NULL, 10);
/* fprintf(stderr,"[int32_t] := %"PRId32" (col #%d)\n", tmp, wanted_col); */
*((int32_t *) dest) = tmp;
break;
}
case U32:{
uint32_t tmp = (uint32_t) strtoul(token, NULL, 10);
/* fprintf(stderr,"[uint32_t] := %"PRIu32" (col #%d)\n", tmp, wanted_col); */
*((uint32_t *) dest) = tmp;
break;
}
case I64:{
int64_t tmp = (int64_t) strtoll(token, NULL, 10);
/* fprintf(stderr,"[int64_t] := %"PRId64" (col #%d)\n", tmp, wanted_col); */
*((int64_t *) dest) = tmp;
break;
}
case U64:{
uint64_t tmp = (uint64_t) strtoull(token, NULL, 10);
/* fprintf(stderr,"[uint64_t] := %"PRIu64" (col #%d)\n", tmp, wanted_col); */
*((uint64_t *) dest) = tmp;
break;
}
default:
fprintf(stderr,"Error: Unknown value for parse type = %d\n", wanted_type);
fprintf(stderr,"Known values are in the range : [%d, %d)\n", I32, num_numeric_types);
return EXIT_FAILURE;
}
}
free(tofree);
base_ptr_info->N++;
/* fprintf(stderr,"parsed one line: base->N = %"PRId64" nallocated = %"PRId64" linebuf = `%s'\n", base_ptr_info->N, base_ptr_info->nallocated, linebuf); */
return EXIT_SUCCESS;
}
/* Reads all the halos of a single tree, starting at byte ``offset`` of the file
   descriptor ``fd``, and parses every line with `parse_line_ctrees`.

   ``fd``            : file descriptor of the Consistent-Trees file, opened for reading.
                       The file position of ``fd`` is not changed (the reads use `pread`)
   ``offset``        : byte offset of the first *data* line of the tree (i.e., the line
                       after the one that begins with '#')
   ``column_info``   : which columns to parse and where to store them
   ``base_ptr_info`` : the destination arrays (grown as required)

   Reading stops at the end of the file, or at the first line that begins with '#'
   (the start of the next tree). A trailing fragment at the end of the file that is
   not terminated by a newline is ignored.

   Returns EXIT_SUCCESS on success. Returns EXIT_FAILURE (after printing a message to
   stderr) if the read fails or if a single line does not fit in the read buffer;
   errors from `parse_line_ctrees` are returned unchanged.
*/
static inline int read_single_tree_ctrees(int fd, off_t offset, const struct ctrees_column_to_ptr *column_info, struct base_ptr_info *base_ptr_info)
{
    /* Because the arrays within column_info have a fixed size,
       need to check that ncols can fit */
    if(column_info->ncols > PARSE_CTREES_MAX_NCOLS) {
        fprintf(stderr,"Error: You have requested %"PRId64" columns but there is only space to store %"PRId64"\n",
                column_info->ncols, (int64_t) PARSE_CTREES_MAX_NCOLS);
        fprintf(stderr,"Please define the macro variable `PARSE_CTREES_MAX_NCOLS' to be larger than %"PRId64" (before including the file `%s')\n",
                column_info->ncols, __FILE__);
        return EXIT_FAILURE;
    }

    char read_buffer[PARSE_CTREES_MAXBUFSIZE];
    const size_t to_read_bytes = PARSE_CTREES_MAXBUFSIZE - 1;

    /* two things can happen while reading -> EOF or I reach the next tree */
    for(;;) {
        /* Fill the buffer. `pread` is allowed to return fewer bytes than requested
           even before the end of the file, so keep reading until the buffer is
           full or `pread` reports end-of-file (returns 0) */
        size_t nbytes_read = 0;
        while(nbytes_read < to_read_bytes) {
            const ssize_t nbytes = pread(fd, read_buffer + nbytes_read, to_read_bytes - nbytes_read, offset + (off_t) nbytes_read);
            if(nbytes < 0) {
                fprintf(stderr,"Error: trying to read %zu bytes from file failed. Encountered negative bytes read \n", to_read_bytes);
                perror(NULL);
                return EXIT_FAILURE;
            }
            if(nbytes == 0) break;/* end of file */
            nbytes_read += (size_t) nbytes;
        }
        if(nbytes_read == 0) {
            break;/* nothing left in the file */
        }
        /* a partially filled buffer means that this is the last chunk of the file */
        const int reached_eof = (nbytes_read < to_read_bytes);

        /* some bytes were read -> now let's parse one line at a time */
        const off_t start_offset = offset;
        char *line = read_buffer;
        char *const end = read_buffer + nbytes_read;
        while(line < end) {
            if(*line == '#') {
                /* we have encountered the beginning of a new tree (the line begins with '#tree ') */
                return EXIT_SUCCESS;
            }

            char *newline = memchr(line, '\n', (size_t) (end - line));
            if(newline == NULL) {
                /* incomplete line -> will be read again (from its start) in the next chunk */
                break;
            }
            *newline = '\0';

            const int status = parse_line_ctrees(line, column_info, base_ptr_info);
            if(status != EXIT_SUCCESS) {
                return status;
            }

            /* only completely parsed lines advance the file offset */
            offset += (off_t) (newline - line + 1);
            line = newline + 1;
        }

        if(reached_eof) {
            break;
        }

        /* A full buffer without a single complete line: reading again from the same
           offset would return the exact same bytes (and loop forever) */
        if(offset == start_offset) {
            fprintf(stderr,"Error: Encountered a line that is longer than %zu bytes (at file offset = %"PRId64")\n",
                    to_read_bytes, (int64_t) offset);
            fprintf(stderr,"Please increase the macro variable `PARSE_CTREES_MAXBUFSIZE' in the file `%s'\n", __FILE__);
            return EXIT_FAILURE;
        }
    }

    return EXIT_SUCCESS;
}
/* these two macros are for internal use only
and can therefore be undefined */
#undef PARSE_CTREES_MAXBUFSIZE
#undef PARSE_CTREES_XASSERT
#if 0
/* this will be required externally to pass in the
array of strings, containing the column names */
#undef PARSE_CTREES_MAX_COLNAME_LEN
#endif