FuzzyQuery uses the Levenshtein distance formula for measuring the similarity between two terms. For example, "weak" and "week" differ by one letter and are four characters long, so the similarity is 75% or 0.75. You can use this query to match terms that are very close to the search term.
FuzzyQuery can be quite useful for finding documents that wouldn't normally be found because of typos.
FuzzyQuery.new(:field, "google", :min_similarity => 0.6, :prefix_length => 2) # matches => "gogle", "goggle", "googol", "googel"
Get the default value for :min_similarity
/*
 * Reader for the default :min_similarity.
 *
 * Returns the value currently held in the cFuzzyQuery class variable
 * identified by id_default_min_similarity. `self` is not consulted.
 */
static VALUE
frb_fq_get_dms(VALUE self)
{
    VALUE default_min_sim = rb_cvar_get(cFuzzyQuery, id_default_min_similarity);

    return default_min_sim;
}
Set the default value for :min_similarity
/*
 * Set the default value for :min_similarity.
 *
 * val is converted with NUM2DBL and must satisfy 0.0 <= val < 1.0; an
 * ArgumentError is raised otherwise. On success the value is stored in
 * qp_default_fuzzy_min_sim (narrowed to float) and in the cFuzzyQuery class
 * variable identified by id_default_min_similarity.
 *
 * Returns val unchanged.
 */
static VALUE
frb_fq_set_dms(VALUE self, VALUE val)
{
    double min_sim = NUM2DBL(val);

    if (min_sim >= 1.0) {
        rb_raise(rb_eArgError,
                 "%f >= 1.0. :min_similarity must be < 1.0", min_sim);
    }
    else if (min_sim < 0.0) {
        /* 0.0 itself passes this check, so the reported bound is inclusive. */
        rb_raise(rb_eArgError,
                 "%f < 0.0. :min_similarity must be >= 0.0", min_sim);
    }

    qp_default_fuzzy_min_sim = (float)min_sim;

    /* rb_cvar_set takes an extra argument when FRT_RUBY_VERSION_1_9 is not
     * defined. */
#ifdef FRT_RUBY_VERSION_1_9
    rb_cvar_set(cFuzzyQuery, id_default_min_similarity, val);
#else
    rb_cvar_set(cFuzzyQuery, id_default_min_similarity, val, Qfalse);
#endif
    return val;
}
Get the default value for :prefix_length
/*
 * Reader for the default :prefix_length.
 *
 * Returns the value currently held in the cFuzzyQuery class variable
 * identified by id_default_prefix_length. `self` is not consulted.
 */
static VALUE
frb_fq_get_dpl(VALUE self)
{
    VALUE default_pre_len = rb_cvar_get(cFuzzyQuery, id_default_prefix_length);

    return default_pre_len;
}
Set the default value for :prefix_length
/*
 * Set the default value for :prefix_length.
 *
 * val is converted with FIX2INT and must not be negative; an ArgumentError
 * is raised otherwise. On success the value is stored in
 * qp_default_fuzzy_pre_len and in the cFuzzyQuery class variable identified
 * by id_default_prefix_length.
 *
 * Returns val unchanged.
 *
 * NOTE(review): no type check on val is visible before FIX2INT -- confirm
 * callers always pass a Fixnum.
 */
static VALUE
frb_fq_set_dpl(VALUE self, VALUE val)
{
    const int prefix_len = FIX2INT(val);

    if (prefix_len < 0) {
        rb_raise(rb_eArgError,
                 "%d < 0. :prefix_length must be >= 0", prefix_len);
    }

    qp_default_fuzzy_pre_len = prefix_len;

    /* rb_cvar_set takes an extra argument when FRT_RUBY_VERSION_1_9 is not
     * defined. */
#ifdef FRT_RUBY_VERSION_1_9
    rb_cvar_set(cFuzzyQuery, id_default_prefix_length, val);
#else
    rb_cvar_set(cFuzzyQuery, id_default_prefix_length, val, Qfalse);
#endif
    return val;
}
Create a new FuzzyQuery that will match terms
with a similarity of at least :min_similarity
to term. Similarity is scored using the Levenshtein edit distance
formula. See en.wikipedia.org/wiki/Levenshtein_distance
If a :prefix_length
> 0 is specified, a common prefix of
that length is also required.
You can also set :max_terms
to prevent memory overflow
problems. By default it is set to 512.
FuzzyQuery.new(:content, "levenshtein", :min_similarity => 0.8, :prefix_length => 5, :max_terms => 1024)
field to search
term to search for including its close matches
Default: 0.5. minimum levenshtein distance score for a match
Default: 0. minimum prefix_match before levenshtein distance is measured.
This parameter is used to improve performance. With a
:prefix_length
of 0, all terms in the index must be checked
which can be quite a performance hit. By setting the prefix length to a
larger number you minimize the number of terms that need to be checked.
Even 1 will cut down the work by a factor of about 26 depending on your
character set and the first letter.
Limits the number of terms that can be added to the query when it is
expanded as a MultiTermQuery. This is not
usually a problem with FuzzyQueries unless you set
:min_similarity
to a very low value.
/*
 * Initializer for FuzzyQuery: new(field, term, options = {}).
 *
 * Arguments (parsed with rb_scan_args "21": two required, one optional):
 *   field   - passed through frb_field() to the underlying query
 *   term    - converted with StringValuePtr()
 *   options - optional Hash; recognised keys are :prefix_length,
 *             :min_similarity and :max_terms. A key whose value is nil is
 *             ignored and the class-level default is used instead.
 *
 * Defaults are read from the cFuzzyQuery class variables
 * (min_similarity, prefix_length) and the cMultiTermQuery class variable
 * (max_terms).
 *
 * Raises ArgumentError unless 0.0 <= min_similarity < 1.0,
 * prefix_length >= 0 and max_terms >= 0.
 *
 * Returns self.
 */
static VALUE
frb_fq_init(int argc, VALUE *argv, VALUE self)
{
    Query *q;
    VALUE rfield, rterm, roptions;
    float min_sim =
        (float)NUM2DBL(rb_cvar_get(cFuzzyQuery, id_default_min_similarity));
    int pre_len =
        FIX2INT(rb_cvar_get(cFuzzyQuery, id_default_prefix_length));
    int max_terms =
        FIX2INT(rb_cvar_get(cMultiTermQuery, id_default_max_terms));

    /* Only look at the options hash when a third argument was supplied. */
    if (rb_scan_args(argc, argv, "21", &rfield, &rterm, &roptions) >= 3) {
        VALUE v;

        Check_Type(roptions, T_HASH);
        if (Qnil != (v = rb_hash_aref(roptions, sym_prefix_length))) {
            pre_len = FIX2INT(v);
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_min_similarity))) {
            min_sim = (float)NUM2DBL(v);
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_max_terms))) {
            max_terms = FIX2INT(v);
        }
    }

    if (min_sim >= 1.0) {
        rb_raise(rb_eArgError,
                 "%f >= 1.0. :min_similarity must be < 1.0", min_sim);
    }
    else if (min_sim < 0.0) {
        /* 0.0 itself passes this check, so the reported bound is inclusive. */
        rb_raise(rb_eArgError,
                 "%f < 0.0. :min_similarity must be >= 0.0", min_sim);
    }
    if (pre_len < 0) {
        rb_raise(rb_eArgError,
                 "%d < 0. :prefix_length must be >= 0", pre_len);
    }
    if (max_terms < 0) {
        rb_raise(rb_eArgError,
                 "%d < 0. :max_terms must be >= 0", max_terms);
    }

    q = fuzq_new_conf(frb_field(rfield), StringValuePtr(rterm),
                      min_sim, pre_len, max_terms);

    /* Attach the native query to self, with frb_q_free passed as the free
     * callback and no mark callback. */
    Frt_Wrap_Struct(self, NULL, &frb_q_free, q);
    /* NOTE(review): presumably records the q -> self association; confirm
     * against object_add's definition. */
    object_add(q, self);
    return self;
}
Get the :min_similarity
for the query.
/*
 * Get the :min_similarity for the query.
 *
 * Reads the min_sim member of the query wrapped by self (viewed as a
 * FuzzyQuery) and returns it as a new Ruby Float.
 */
static VALUE
frb_fq_min_sim(VALUE self)
{
    /* Declared ahead of GET_Q() so all declarations stay grouped regardless
     * of what the macro expands to. */
    double similarity;
    GET_Q(); /* expected to bring `q` into scope -- it is used below */

    similarity = (double)((FuzzyQuery *)q)->min_sim;
    return rb_float_new(similarity);
}
Get the :prefix_length
for the query.
/*
 * Get the :prefix_length for the query.
 *
 * Reads the pre_len member of the query wrapped by self (viewed as a
 * FuzzyQuery) and returns it via INT2FIX.
 */
static VALUE
frb_fq_pre_len(VALUE self)
{
    /* Declared ahead of GET_Q() so all declarations stay grouped regardless
     * of what the macro expands to. */
    VALUE prefix_len;
    GET_Q(); /* expected to bring `q` into scope -- it is used below */

    prefix_len = INT2FIX(((FuzzyQuery *)q)->pre_len);
    return prefix_len;
}