/*
:call-seq:
Dvector.fast_fancy_read(stream, options) => Array_of_Dvectors
Reads data from an IO stream (or anything that supports a gets method)
and separate it into columns of data
according to the _options_, a hash holding the following elements
(compulsory, but you can use FANCY_READ_DEFAULTS):
* 'sep': a regular expression that separate the entries
* 'comments': any line matching this will be skipped
* 'skip_first': skips that many lines before reading anything
* 'index_col': if true, the first column returned contains the
number of the line read
* 'remove_space': whether to remove spaces at the beginning of a line. *This
option is currently not implemented !*
* 'default': what to put when nothing was found but a number must be used
As a side note, the read time is highly non-linear, which suggests that
the read is memory-allocation/copying-limited, at least for big files.
An internal memory allocation with aggressive policy should solve that,
that is, not using directly Dvectors (and it would be way faster to store
anyway).
*/
static VALUE dvector_fast_fancy_read(VALUE self, VALUE stream, VALUE options)
{
/* First, we read up options: */
double def = rb_num2dbl(rb_hash_aref(options,
rb_str_new2("default")));
int remove_space = RTEST(rb_hash_aref(options,
rb_str_new2("remove_space")));
int index_col = RTEST(rb_hash_aref(options,
rb_str_new2("index_col")));
long skip_first = FIX2LONG(rb_hash_aref(options,
rb_str_new2("skip_first")));
VALUE sep = rb_hash_aref(options, rb_str_new2("sep"));
VALUE comments = rb_hash_aref(options, rb_str_new2("comments"));
/* Then, some various variables: */
VALUE line;
ID chomp_id = rb_intern("chomp!");
ID gets_id = rb_intern("gets");
long line_number = 0;
/*
Now come the fun part - rudimentary vectors management
*/
int nb_vectors = 0; /* The number of vectors currently created */
int current_size = 10; /* The number of slots available */
double ** vectors = ALLOC_N(double *, current_size);
long index = 0; /* The current index in the vectors */
int allocated_size = 5004; /* The size available in the vectors */
int i;
/* The return value */
VALUE ary;
/* We use a real gets so we can also rely on StringIO, for instance */
while(RTEST(line = rb_funcall(stream, gets_id, 0))) {
VALUE pre, post, match;
const char * line_ptr;
int col = 0;
line_number++;
/* Whether we should skip the line... */
if(skip_first >= line_number)
continue;
/* We check for a blank line using isspace: */
line_ptr = StringValueCStr(line);
while(line_ptr && *line_ptr) {
if(! isspace(*line_ptr))
break;
line_ptr++;
}
if(! *line_ptr)
continue; /* We found a blank line */
if(remove_space) /* We replace the contents of the line */
line = rb_str_new2(line_ptr);
/* ... or a comment line */
if(RTEST(comments) && RTEST(rb_reg_match(comments, line)))
continue;
/* Then, we remove the newline: */
post = line;
rb_funcall(post, chomp_id, 0);
/* We iterate over the different portions between
matches
*/
while(RTEST(post)) {
const char * a;
char * b;
if(RTEST(rb_reg_match(sep, post))) {
match = rb_gv_get("$~");
pre = rb_reg_match_pre(match);
post = rb_reg_match_post(match);
}
else {
pre = post;
post = Qnil;
}
a = StringValueCStr(pre);
double c = strtod(a, &b);
if(b == a)
c = def;
if(col >= nb_vectors) {
nb_vectors++;
/* We need to create a new vector */
if(col >= current_size) { /* Increase the available size */
current_size += 5;
REALLOC_N(vectors, double * , current_size);
}
double * vals = vectors[col] = ALLOC_N(double, allocated_size);
/* Filling it with the default value */
for(i = 0; i < index; i++) {
vals[i] = def;
}
}
vectors[col][index] = c;
col++;
}
/* Now, we finish the line */
for(; col < nb_vectors; col++)
vectors[col][index] = def;
index++;
/* Now, we reallocate memory if necessary */
if(index >= allocated_size) {
allocated_size *= 2; /* We double the size */
for(col = 0; col < nb_vectors; col++)
REALLOC_N(vectors[col], double, allocated_size);
}
}
/* Now, we make up the array */
ary = rb_ary_new();
for(i = 0; i < nb_vectors; i++) {
/* We create a vector */
rb_ary_store(ary, i, make_dvector_from_data(cDvector, index, vectors[i]));
/* And free the memory */
free(vectors[i]);
}
free(vectors);
return ary;
}