Package translate :: Package search :: Package indexing :: Module XapianIndexer
[hide private]
[frames | no frames]

Source Code for Module translate.search.indexing.XapianIndexer

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2008-2009 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21  # 
 22   
 23  """ 
 24  Interface to the Xapian indexing engine for the Translate Toolkit 
 25   
 26  Xapian v1.0 or higher is supported. 
 27   
 28  If you are interested in writing an interface for Xapian 0.x, then 
 29  you should check out the following:: 
 30      svn export -r 7235 https://translate.svn.sourceforge.net/svnroot/translate/src/branches/translate-search-indexer-generic-merging/translate/search/indexer/ 
 31  It is not completely working, but it should give you a good start. 
 32  """ 
 33   
 34  __revision__ = "$Id: XapianIndexer.py 13124 2009-11-16 10:51:22Z friedelwolff $" 
 35   
 36  # xapian module versions before 1.0.13 hang apache under mod_python 
 37  import sys 
 38  import re 
 39  # detect if running under apache 
 40  if 'apache' in sys.modules or '_apache' in sys.modules: 
 41      import subprocess 
 42      # even checking xapian version leads to deadlock under apache, must figure version from command line 
 43      try: 
 44          command = subprocess.Popen(['xapian-check', '--version'], stdout=subprocess.PIPE) 
 45          stdout, stderr = command.communicate() 
 46          if re.match('.*([0-9]+\.[0-9]+\.[0-9]+).*', stdout).groups()[0] < '1.0.13': 
 47              raise ImportError("Running under apache, can't load xapain") 
 48      except: 
 49          raise ImportError("Running under apache, can't load xapian") 
 50   
 51  import CommonIndexer 
 52  import xapian 
 53  import os 
 54   
 55   
def is_available():
    """report whether the imported xapian module can be used

    @return: True if the major version reported by xapian is greater than
        zero
    @rtype: bool
    """
    major_version = xapian.major_version()
    return major_version > 0
# Xapian restricts the length of term strings, see
# http://osdir.com/ml/search.xapian.general/2006-11/msg00210.html
# A maximum length of around 240 is described there - we stay well below it.
_MAX_TERM_LENGTH = 128
class XapianDatabase(CommonIndexer.CommonDatabase):
    """interface to the xapian (http://xapian.org) indexer

    The handle stored in C{self.database} is switched between a read-only
    C{xapian.Database} and a C{xapian.WritableDatabase} on demand (see
    L{_prepare_database}).
    """

    QUERY_TYPE = xapian.Query
    INDEX_DIRECTORY_NAME = "xapian"

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open a xapian database

        @raise ValueError: the given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
        @raise OSError: the database failed to initialize

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it
            empty to use the system default analyzer
            (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary;
            default: True
        @type create_allowed: bool
        """
        # call the __init__ function of our parent
        super(XapianDatabase, self).__init__(basedir, analyzer=analyzer,
                                             create_allowed=create_allowed)
        if os.path.exists(self.location):
            # try to open an existing database
            try:
                self.database = xapian.WritableDatabase(self.location,
                                                        xapian.DB_OPEN)
            except xapian.DatabaseOpeningError as err_msg:
                raise ValueError("Indexer: failed to open xapian database "
                                 "(%s) - maybe it is not a xapian database: %s"
                                 % (self.location, err_msg))
        else:
            # create a new database
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            # create the parent directory if it does not exist
            parent_path = os.path.dirname(self.location)
            try:
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except (IOError, OSError) as err_msg:
                # os.makedirs signals failures with OSError (IOError is
                # kept for safety); re-raise with a descriptive message
                raise OSError("Indexer: failed to create the parent "
                              "directory (%s) of the indexing database: %s"
                              % (parent_path, err_msg))
            try:
                self.database = xapian.WritableDatabase(
                    self.location, xapian.DB_CREATE_OR_OPEN)
            except xapian.DatabaseOpeningError as err_msg:
                raise OSError("Indexer: failed to open or create a xapian "
                              "database (%s): %s"
                              % (self.location, err_msg))

    def flush(self, optimize=False):
        """force to write the current changes to disk immediately

        Afterwards the database is reopened read-only.

        @param optimize: ignored for xapian
        @type optimize: bool
        """
        # write changes to disk (only if database is read-write)
        if isinstance(self.database, xapian.WritableDatabase):
            self.database.flush()
        # free the database to remove locks - this is a xapian-specific issue
        self.database = None
        # reopen it as read-only
        self._prepare_database()

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: xapian.Query
        @return: the resulting query object
        @rtype: xapian.Query
        """
        # create a copy of the original query
        return xapian.Query(query)

    def _create_query_for_string(self, text, require_all=True,
                                 analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for
            the field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        # partial matching takes precedence over exact matching
        partial = (analyzer & self.ANALYZER_PARTIAL) > 0
        if not partial and analyzer == self.ANALYZER_EXACT:
            # exact matching - the text is used as a single term
            # NOTE(review): terms are shortened by _truncate_term_length
            # when indexed without tokenizing, but not here - very long
            # exact queries may therefore never match; confirm intent
            return xapian.Query(text)
        qp = xapian.QueryParser()
        qp.set_database(self.database)
        if require_all:
            qp.set_default_op(xapian.Query.OP_AND)
        else:
            qp.set_default_op(xapian.Query.OP_OR)
        if partial:
            match_flags = xapian.QueryParser.FLAG_PARTIAL
        else:
            # everything else (not partial and not exact)
            match_flags = 0
        return qp.parse_query(text, match_flags)

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this function creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for
            the field is used.
        @type analyzer: int
        @return: the resulting query object
        @rtype: xapian.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        # the upper-cased field name is used as the term prefix
        prefix = field.upper()
        if analyzer == self.ANALYZER_EXACT:
            # exact matching -> keep special characters
            return xapian.Query("%s%s" % (prefix, value))
        # other queries need a parser object
        qp = xapian.QueryParser()
        qp.set_database(self.database)
        if (analyzer & self.ANALYZER_PARTIAL) > 0:
            # partial matching
            match_flags = xapian.QueryParser.FLAG_PARTIAL
        else:
            # everything else (not partial and not exact)
            match_flags = 0
        return qp.parse_query(value, match_flags, prefix)

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: xapian.Query
        """
        if require_all:
            query_op = xapian.Query.OP_AND
        else:
            query_op = xapian.Query.OP_OR
        return xapian.Query(query_op, queries)

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: xapian.Document
        """
        return xapian.Document()

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            term_gen = xapian.TermGenerator()
            term_gen.set_document(document)
            term_gen.index_text(term)
        else:
            # untokenized terms are stored as given, shortened if necessary
            document.add_term(_truncate_term_length(term))

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: xapian.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        # the upper-cased field name is used as the term prefix
        prefix = field.upper()
        if tokenize:
            term_gen = xapian.TermGenerator()
            term_gen.set_document(document)
            term_gen.index_text(term, 1, prefix)
        else:
            document.add_term(_truncate_term_length("%s%s" % (prefix, term)))

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: xapian.Document
        """
        # open the database for writing
        self._prepare_database(writable=True)
        self.database.add_document(document)

    def begin_transaction(self):
        """begin a transaction

        Xapian supports transactions to group multiple database
        modifications. This avoids intermediate flushing and therefore
        increases performance.
        """
        self._prepare_database(writable=True)
        self.database.begin_transaction()

    def cancel_transaction(self):
        """cancel an ongoing transaction

        no changes since the last execution of 'begin_transaction' are
        written
        """
        self._prepare_database(writable=True)
        self.database.cancel_transaction()

    def commit_transaction(self):
        """submit the changes of an ongoing transaction

        all changes since the last execution of 'begin_transaction' are
        written
        """
        self._prepare_database(writable=True)
        self.database.commit_transaction()

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled xapian query
        @type query: xapian.Query
        @return: an object that allows access to the results
        @rtype: XapianIndexer.XapianEnquire
        """
        enquire = xapian.Enquire(self.database)
        enquire.set_query(query)
        return XapianEnquire(enquire)

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        @return: True if the document was deleted, False if xapian did not
            find a document with this ID
        @rtype: bool
        """
        # open the database for writing
        self._prepare_database(writable=True)
        try:
            self.database.delete_document(docid)
            return True
        except xapian.DocNotFoundError:
            return False

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches
        of a query

        @param query: the query to be issued
        @type query: xapian.Query
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        try:
            string_types = basestring
        except NameError:
            # "basestring" does not exist anymore in Python 3
            string_types = str
        result = []
        if isinstance(fieldnames, string_types):
            # a single field name was given
            fieldnames = [fieldnames]
        self._walk_matches(query, _extract_fieldvalues, (result, fieldnames))
        return result

    def _prepare_database(self, writable=False):
        """reopen the database as read-only or as writable if necessary

        this fixes a xapian specific issue regarding open locks for
        writable databases

        @param writable: True for opening a writable database
        @type writable: bool
        """
        if writable:
            if not isinstance(self.database, xapian.WritableDatabase):
                self.database = xapian.WritableDatabase(self.location,
                                                        xapian.DB_OPEN)
        elif not isinstance(self.database, xapian.Database):
            # any existing xapian database handle is kept for reading;
            # "flush" drops the handle first to force a read-only reopen
            self.database = xapian.Database(self.location)
class XapianEnquire(CommonIndexer.CommonEnquire):
    """interface to the xapian object for storing sets of matches
    """

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from
            zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a list of dictionaries with the keys::
                ["rank", "percent", "document", "docid"]
        """
        # presumably self.enquire is the xapian.Enquire object handed to the
        # constructor of the parent class - verify against CommonEnquire
        matches = self.enquire.get_mset(start, number)
        # use the property API of the match items: the sequence API
        # (match[xapian.MSET_RANK], ...) is deprecated since xapian 1.0
        result = [{"rank": match.rank,
                   "docid": match.docid,
                   "percent": match.percent,
                   "document": match.document}
                  for match in matches]
        return (matches.size(), matches.get_matches_estimated(), result)
def _truncate_term_length(term, taken=0):
    """shorten a term string so that it fits into the length limit for
    xapian terms

    @param term: the value of the term, that should be truncated
    @type term: str
    @param taken: number of characters that are already used up (e.g. by
        the name of the term); it reduces the maximum count of possible
        characters
    @type taken: int
    @return: the (possibly) truncated string
    @rtype: str
    """
    available = _MAX_TERM_LENGTH - taken
    if len(term) <= available:
        # short enough - nothing to do
        return term
    # NOTE: over-long terms are cut one character below the limit
    return term[:available - 1]
def _extract_fieldvalues(match, args):
    """add a dict of field values to a list

    usually this function should be used together with '_walk_matches'
    for traversing a list of matches

    @param match: a single match; only its "document" entry is used, which
        must offer a "termlist()" method yielding items with a "term"
        attribute
    @type match: dict
    @param args: a tuple of (result, fieldnames):
        result - the resulting dict will be added to this list;
        fieldnames - the names of the fields to be added to the dict
        (None selects the terms without an uppercase prefix)
    @type args: tuple of (list of dict, list of str)
    """
    result, fieldnames = args
    # prepare one matcher per requested field: (name, prefix length, regex)
    matchers = []
    for fname in fieldnames:
        if fname is None:
            # plain terms: anything not starting with an uppercase prefix
            prefix = ""
        else:
            prefix = str(fname).upper()
        # the prefix is escaped, since field names are not regular expressions
        pattern = re.compile(re.escape(prefix) + "[^A-Z]")
        matchers.append((fname, len(prefix), pattern))
    # prepare empty dict
    item_fields = {}
    # fill the dict
    for term in match["document"].termlist():
        text = term.term
        if isinstance(text, bytes) and not isinstance(text, str):
            # the Python 3 bindings of xapian return terms as bytes
            text = text.decode("utf-8")
        for fname, prefix_length, pattern in matchers:
            if pattern.match(text):
                # we found a matching field/term - strip the field prefix
                item_fields.setdefault(fname, []).append(text[prefix_length:])
    result.append(item_fields)