Use a TermDocEnum to iterate through the documents that contain a particular term. You can also iterate through the positions at which the term occurs in each document.
tde = index_reader.term_docs_for(:content, "fox") tde.each do |doc_id, freq| puts "fox appeared #{freq} times in document #{doc_id}:" positions = [] tde.each_position {|pos| positions << pos} puts " #{positions.join(', ')}" end # or you can do it like this; tde.seek(:title, "red") while tde.next? puts "red appeared #{tde.freq} times in document #{tde.doc}:" positions = [] while pos = tde.next_position positions << pos end puts " #{positions.join(', ')}" end
Returns the current document number pointed to by the
term_doc_enum
.
/*
 * Returns the document number the enumerator currently points at, as
 * reported by its doc_num callback, converted to a Ruby Fixnum.
 */
static VALUE
frb_tde_doc(VALUE self)
{
    TermDocEnum *enumerator = (TermDocEnum *)DATA_PTR(self);
    int doc_id = enumerator->doc_num(enumerator);

    return INT2FIX(doc_id);
}
Iterate through the documents and document frequencies in the
term_doc_enum
.
NOTE: this method can only be called once after each seek. If you need to
call #each
again then you should call #seek
again
too.
/*
 * Advances the enumerator until it is exhausted, yielding a two-element
 * array [doc_num, freq] for every document visited.
 *
 * A fresh array is built for each yield. Re-using a single array and
 * writing into it through RARRAY_PTR is unsafe once the array has been
 * handed to Ruby code: the block may keep a reference to it (and then see
 * it overwritten on the next iteration), or resize/freeze it, after which
 * raw pointer stores bypass every check the array API performs.
 *
 * Returns the number of documents iterated over, as a Fixnum.
 */
static VALUE
frb_tde_each(VALUE self)
{
    int doc_cnt = 0;
    TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);

    while (tde->next(tde)) {
        VALUE rdoc_num = INT2FIX(tde->doc_num(tde));
        VALUE rfreq = INT2FIX(tde->freq(tde));

        doc_cnt++;
        rb_yield(rb_ary_new3(2, rdoc_num, rfreq));
    }
    return INT2FIX(doc_cnt);
}
Iterate through each of the positions occupied by the current term in the current document. This can only be called once per document. It can be used within the #each method. For example, to print the term's documents and positions:
tde.each do |doc_id, freq| puts "term appeared #{freq} times in document #{doc_id}:" positions = [] tde.each_position {|pos| positions << pos} puts " #{positions.join(', ')}" end
/*
 * Yields every remaining position of the current document to the block.
 *
 * Positions are pulled from the enumerator's next_position callback until
 * it returns a negative value. Raises NotImplementedError when the
 * enumerator has no next_position callback. Returns self.
 */
static VALUE
frb_tde_each_position(VALUE self)
{
    TermDocEnum *enumerator = (TermDocEnum *)DATA_PTR(self);
    int position;

    if (NULL == enumerator->next_position) {
        rb_raise(rb_eNotImpError, "to scan through positions you must create "
                 "the TermDocEnum with Index#term_positions method rather "
                 "than the Index#term_docs method");
    }

    for (position = enumerator->next_position(enumerator);
         position >= 0;
         position = enumerator->next_position(enumerator)) {
        rb_yield(INT2FIX(position));
    }
    return self;
}
Returns the frequency of the term in the current document pointed to by the
term_doc_enum
.
/*
 * Returns the value of the enumerator's freq callback for its current
 * document, converted to a Ruby Fixnum.
 */
static VALUE
frb_tde_freq(VALUE self)
{
    TermDocEnum *enumerator = (TermDocEnum *)DATA_PTR(self);
    int frequency = enumerator->freq(enumerator);

    return INT2FIX(frequency);
}
Move forward to the next document in the enumeration. Returns
true
if there is another document or false
otherwise.
/*
 * Advances the enumerator via its next callback.
 * Returns Qtrue when the callback reports success, Qfalse otherwise.
 */
static VALUE
frb_tde_next(VALUE self)
{
    TermDocEnum *enumerator = (TermDocEnum *)DATA_PTR(self);

    if (enumerator->next(enumerator)) {
        return Qtrue;
    }
    return Qfalse;
}
Move forward to the next position of the current term in the current document. Returns
the position
if there is another one or nil
otherwise.
/*
 * Fetches the next position from the enumerator's next_position callback.
 *
 * Returns the position as a Fixnum, or Qnil when the callback returns a
 * negative value. Raises NotImplementedError when the enumerator has no
 * next_position callback.
 */
static VALUE
frb_tde_next_position(VALUE self)
{
    TermDocEnum *enumerator = (TermDocEnum *)DATA_PTR(self);
    int position;

    if (NULL == enumerator->next_position) {
        rb_raise(rb_eNotImpError, "to scan through positions you must create "
                 "the TermDocEnum with Index#term_positions method rather "
                 "than the Index#term_docs method");
    }

    position = enumerator->next_position(enumerator);
    if (position < 0) {
        return Qnil;
    }
    return INT2FIX(position);
}
Seek the term term
in the index for field
. After
you call this method you can call next or each to skip through the
documents and positions of this particular term.
/*
 * Positions the enumerator on +rterm+ within the field +rfield+.
 *
 * The field is resolved to a field number through the hash stored in the
 * id_fld_num_map instance variable of self; an ArgumentError is raised
 * when the field has no entry there.
 *
 * The term is handed to the enumerator's seek callback as a C string, so
 * it is fetched with StringValueCStr: that guarantees a NUL-terminated
 * buffer and raises ArgumentError for a string with an embedded NUL byte,
 * instead of silently seeking a truncated term.
 *
 * Returns self.
 */
static VALUE
frb_tde_seek(VALUE self, VALUE rfield, VALUE rterm)
{
    TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
    VALUE rfnum_map = rb_ivar_get(self, id_fld_num_map);
    VALUE rfnum = rb_hash_aref(rfnum_map, rfield);
    /* Convert the term before validating the field so the order in which
     * argument errors surface is unchanged. */
    char *term = StringValueCStr(rterm);

    if (rfnum == Qnil) {
        rb_raise(rb_eArgError, "field %s doesn't exist in the index",
                 (char *)frb_field(rfield));
    }

    tde->seek(tde, FIX2INT(rfnum), term);
    return self;
}
Seek the current term in term_enum
. You could just use the
standard seek method like this;
term_doc_enum.seek(field, term_enum.term)
However the seek_term_enum
method saves an index lookup so
should offer a large performance improvement.
/*
 * Positions this enumerator using the TermEnum wrapped by +rterm_enum+,
 * by passing it to the enumerator's seek_te callback. Returns self.
 */
static VALUE
frb_tde_seek_te(VALUE self, VALUE rterm_enum)
{
    TermDocEnum *enumerator = (TermDocEnum *)DATA_PTR(self);
    TermEnum *term_enum = (TermEnum *)frb_rb_data_ptr(rterm_enum);

    enumerator->seek_te(enumerator, term_enum);

    return self;
}
Skip to the required document number target
and return true if
there is a document >= target
.
/*
 * Asks the enumerator's skip_to callback to advance to document number
 * +rtarget+. Returns Qtrue when the callback reports success, Qfalse
 * otherwise.
 *
 * The target comes straight from the Ruby caller, so it is converted with
 * NUM2INT: unlike FIX2INT, it raises TypeError/RangeError for values that
 * are not integers representable as a C int (nil, a String, a Bignum)
 * rather than reinterpreting the raw VALUE bits as a document number.
 */
static VALUE
frb_tde_skip_to(VALUE self, VALUE rtarget)
{
    TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
    int target = NUM2INT(rtarget);

    return tde->skip_to(tde, target) ? Qtrue : Qfalse;
}
Returns a json representation of the term doc enum. It will also add the term positions if they are available. You can speed this up by having the method return arrays instead of objects, simply by passing an argument to the #to_json method. For example;
term_doc_enum.to_json() #=> # [ # {"document":1,"frequency":12}, # {"document":11,"frequency":1}, # {"document":29,"frequency":120}, # {"document":30,"frequency":3} # ] term_doc_enum.to_json(:fast) #=> # [ # [1,12], # [11,1], # [29,120], # [30,3] # ]
/*
 * Serialises the remaining entries of the enumerator into a JSON array and
 * returns it as a Ruby String.
 *
 * With no arguments every entry is an object
 * ({"document":D,"frequency":F[,"positions":[...]]}); with any argument
 * every entry is an array ([D,F[,[...]]]). Positions are included only
 * when the enumerator has a next_position callback.
 *
 * The text is built in a heap buffer that is grown on demand:
 *  - the capacity is doubled until the estimate for the next entry fits
 *    (a single doubling is not enough for a very frequent term);
 *  - the write cursor is re-based after every reallocation, because the
 *    buffer may move;
 *  - the buffer comes from ALLOC_N/REALLOC_N and is therefore released
 *    with xfree, not free.
 */
static VALUE
frb_tde_to_json(int argc, VALUE *argv, VALUE self)
{
    TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
    VALUE rjson;
    char *json, *jp;
    size_t capa = 65536;
    const char *format;
    char close = (argc > 0) ? ']' : '}';
    bool do_positions = tde->next_position != NULL;

    jp = json = ALLOC_N(char, capa);
    *(jp++) = '[';

    if (do_positions) {
        if (argc == 0) {
            format = "{\"document\":%d,\"frequency\":%d,\"positions\":[";
        } else {
            format = "[%d,%d,[";
        }
    } else {
        if (argc == 0) {
            format = "{\"document\":%d,\"frequency\":%d},";
        } else {
            format = "[%d,%d],";
        }
    }

    while (tde->next(tde)) {
        int doc_num = tde->doc_num(tde);
        int freq = tde->freq(tde);
        size_t used = (size_t)(jp - json);
        /* 100 chars is room for the entry header, its closing brackets and
         * the final "]\0"; 20 chars per occurrence covers one "%d,"
         * position each. */
        size_t needed = used + 100 + (size_t)(freq > 0 ? freq : 0) * 20;

        if (needed > capa) {
            while (needed > capa) {
                capa <<= 1;
            }
            REALLOC_N(json, char, capa);
            jp = json + used; /* the buffer may have moved */
        }

        jp += sprintf(jp, format, doc_num, freq);

        if (do_positions) {
            int pos;
            while (0 <= (pos = tde->next_position(tde))) {
                jp += sprintf(jp, "%d,", pos);
            }
            /* drop the trailing comma of the position list, if any */
            if (*(jp - 1) == ',') jp--;
            *(jp++) = ']';
            *(jp++) = close;
            *(jp++) = ',';
        }
    }

    /* drop the trailing comma after the last entry, if any */
    if (*(jp - 1) == ',') jp--;
    *(jp++) = ']';
    *jp = '\0';

    rjson = rb_str_new(json, (long)(jp - json));
    xfree(json);
    return rjson;
}