Package translate :: Package search :: Package indexing :: Module XapianIndexer
[hide private]
[frames | no frames]

Source Code for Module translate.search.indexing.XapianIndexer

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2008-2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21  # 
 22   
 23  """ 
 24  Interface to the Xapian indexing engine for the Translate Toolkit 
 25   
 26  Xapian v1.0 or higher is supported. 
 27   
 28  If you are interested in writing an interface for Xapian 0.x, then 
 29  you should check out the following:: 
 30      svn export -r 7235 https://translate.svn.sourceforge.net/svnroot/translate/src/branches/translate-search-indexer-generic-merging/translate/search/indexer/ 
 31  It is not completely working, but it should give you a good start. 
 32  """ 
 33   
 34  __revision__ = "$Id: XapianIndexer.py 15714 2010-09-02 20:55:12Z alaaosh $" 
 35   
 36  # xapian module versions before 1.0.13 hang apache under mod_python 
 37  import sys 
 38  import re 
 39   
 40  # detect if running under apache 
 41  if 'apache' in sys.modules or '_apache' in sys.modules or 'mod_wsgi' in sys.modules: 
 42   
43 - def _str2version(version):
44 return [int(i) for i in version.split('.')]
45 46 import subprocess 47 # even checking xapian version leads to deadlock under apache, must figure version from command line 48 try: 49 command = subprocess.Popen(['xapian-check', '--version'], stdout=subprocess.PIPE) 50 stdout, stderr = command.communicate() 51 if _str2version(re.match('.*([0-9]+\.[0-9]+\.[0-9]+).*', stdout).groups()[0]) < [1, 0, 13]: 52 raise ImportError("Running under apache, can't load xapain") 53 except: 54 #FIXME: report is xapian-check command is missing? 55 raise ImportError("Running under apache, can't load xapian") 56 57 import CommonIndexer 58 import xapian 59 import os 60 import time 61 import logging 62 63
def is_available():
    """report whether the loaded xapian module can be used by this indexer

    @return: True if the major version of the xapian module is above zero
    @rtype: bool
    """
    major_version = xapian.major_version()
    return major_version > 0
66 67 68 # in xapian there is a length restriction for term strings 69 # see http://osdir.com/ml/search.xapian.general/2006-11/msg00210.html 70 # a maximum length of around 240 is described there - but we need less anyway 71 _MAX_TERM_LENGTH = 128 72 73
class XapianDatabase(CommonIndexer.CommonDatabase):
    """interface to the xapian (http://xapian.org) indexer

    A read-only handle (self.reader) is used for queries. The writable handle
    (self.writer) is only opened on demand and is closed again by "flush",
    "cancel_transaction" and "commit_transaction".
    """

    QUERY_TYPE = xapian.Query
    INDEX_DIRECTORY_NAME = "xapian"

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open a xapian database

        @raise ValueError: the given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
        @raise OSError: the database failed to initialize

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it empty
            to use the system default analyzer (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary; default: True
        @type create_allowed: bool
        """
        # call the __init__ function of our parent
        # (self.location is expected to be set there)
        super(XapianDatabase, self).__init__(basedir, analyzer=analyzer,
                create_allowed=create_allowed)
        self.reader = None
        self.writer = None
        if os.path.exists(self.location):
            # try to open an existing database
            try:
                self.reader = xapian.Database(self.location)
            except xapian.DatabaseOpeningError as err_msg:
                raise ValueError("Indexer: failed to open xapian database "
                        + "(%s) - maybe it is not a xapian database: %s"
                        % (self.location, str(err_msg)))
        else:
            # create a new database
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            # create the parent directory if it does not exist
            parent_path = os.path.dirname(self.location)
            try:
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except (IOError, OSError) as err_msg:
                # os.makedirs reports failures as OSError - catching only
                # IOError would never trigger this handler
                raise OSError("Indexer: failed to create the parent "
                        + "directory (%s) of the indexing database: %s"
                        % (parent_path, str(err_msg)))
            try:
                self.writer = xapian.WritableDatabase(self.location,
                        xapian.DB_CREATE_OR_OPEN)
                # "flush" closes the writer again and opens the reader
                self.flush()
            except xapian.DatabaseOpeningError as err_msg:
                raise OSError("Indexer: failed to open or create a xapian "
                        + "database (%s): %s" % (self.location, str(err_msg)))

    def __del__(self):
        """drop the reader and close the writer (if it is open)"""
        self.reader = None
        self._writer_close()

    def flush(self, optimize=False):
        """force to write the current changes to disk immediately

        @param optimize: ignored for xapian
        @type optimize: bool
        """
        # write changes to disk (only if database is read-write)
        if self._writer_is_open():
            self._writer_close()
        self._index_refresh()

    def make_query(self, *args, **kwargs):
        """create a query via the parent class

        If xapian reports that the database was modified in the meantime,
        the reader is refreshed and the query is created once more.

        All arguments are passed through to the parent's "make_query".
        """
        try:
            return super(XapianDatabase, self).make_query(*args, **kwargs)
        except xapian.DatabaseModifiedError:
            self._index_refresh()
            return super(XapianDatabase, self).make_query(*args, **kwargs)

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: xapian.Query
        @return: the resulting query object
        @rtype: xapian.Query
        """
        # create a copy of the original query
        return xapian.Query(query)

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: Define query options (partial matching, exact matching,
            tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query
        """
        qp = xapian.QueryParser()
        qp.set_database(self.reader)
        if require_all:
            qp.set_default_op(xapian.Query.OP_AND)
        else:
            qp.set_default_op(xapian.Query.OP_OR)
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer & self.ANALYZER_PARTIAL > 0:
            # partial matching (checked before the exact match)
            match_flags = xapian.QueryParser.FLAG_PARTIAL
            return qp.parse_query(text, match_flags)
        elif analyzer == self.ANALYZER_EXACT:
            # exact matching - the text is used as a term without parsing
            return xapian.Query(text)
        else:
            # everything else (not partial and not exact)
            match_flags = 0
            return qp.parse_query(text, match_flags)

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this functions creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: Define query options (partial matching, exact matching,
            tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: the resulting query object
        @rtype: xapian.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            # exact matching -> keep special characters
            return xapian.Query("%s%s" % (field.upper(), value))
        # other queries need a parser object
        qp = xapian.QueryParser()
        qp.set_database(self.reader)
        if analyzer & self.ANALYZER_PARTIAL > 0:
            # partial matching
            match_flags = xapian.QueryParser.FLAG_PARTIAL
        else:
            # everything else (not partial and not exact)
            match_flags = 0
        # the upper case field name is used as the term prefix
        return qp.parse_query(value, match_flags, field.upper())

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: xapian.Query
        """
        if require_all:
            query_op = xapian.Query.OP_AND
        else:
            query_op = xapian.Query.OP_OR
        return xapian.Query(query_op, queries)

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: xapian.Document
        """
        return xapian.Document()

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            term_gen = xapian.TermGenerator()
            term_gen.set_document(document)
            term_gen.index_text(term)
        else:
            # untokenized terms are stored directly - limited in length
            document.add_term(_truncate_term_length(term))

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: xapian.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            term_gen = xapian.TermGenerator()
            term_gen.set_document(document)
            term_gen.index_text(term, 1, field.upper())
        else:
            # the length limit applies to the prefixed term as a whole
            document.add_term(_truncate_term_length("%s%s" %
                    (field.upper(), term)))

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: xapian.Document
        """
        # open the database for writing
        self._writer_open()
        self.writer.add_document(document)

    def begin_transaction(self):
        """begin a transaction

        Xapian supports transactions to group multiple database modifications.
        This avoids intermediate flushing and therefore increases performance.
        """
        self._writer_open()
        self.writer.begin_transaction()

    def cancel_transaction(self):
        """cancel an ongoing transaction

        no changes since the last execution of 'begin_transaction' are written
        """
        self.writer.cancel_transaction()
        self._writer_close()

    def commit_transaction(self):
        """submit the changes of an ongoing transaction

        all changes since the last execution of 'begin_transaction' are written
        """
        self.writer.commit_transaction()
        self._writer_close()

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled xapian query
        @type query: xapian.Query
        @return: an object that allows access to the results
        @rtype: XapianEnquire
        """
        enquire = xapian.Enquire(self.reader)
        enquire.set_query(query)
        return XapianEnquire(enquire)

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        @return: True if the document was deleted, False if xapian did not
            find a document with this ID
        @rtype: bool
        """
        # open the database for writing
        self._writer_open()
        try:
            self.writer.delete_document(docid)
            return True
        except xapian.DocNotFoundError:
            return False

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches of
        a query

        @param query: the query to be issued
        @type query: xapian.Query
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        result = []
        if isinstance(fieldnames, basestring):
            fieldnames = [fieldnames]
        try:
            self._walk_matches(query, _extract_fieldvalues, (result, fieldnames))
        except xapian.DatabaseModifiedError:
            self._index_refresh()
            # start with a fresh list: the interrupted first attempt may
            # already have collected some matches, which would otherwise
            # show up twice
            result = []
            self._walk_matches(query, _extract_fieldvalues, (result, fieldnames))
        return result

    def _delete_stale_lock(self):
        """remove the lock file of the database if it looks abandoned

        Nothing happens while this object holds an open writer. Otherwise the
        file 'flintlock' inside the database directory is removed if it was
        last modified more than 15 minutes ago.
        """
        if not self._writer_is_open():
            lockfile = os.path.join(self.location, 'flintlock')
            if os.path.exists(lockfile) and (time.time() - os.path.getmtime(lockfile)) / 60 > 15:
                logging.warning("stale lock found in %s, removing.", self.location)
                os.remove(lockfile)

    def _writer_open(self):
        """open write access for the indexing database and acquire an exclusive lock

        @raise ValueError: the database could not be opened
        """
        if not self._writer_is_open():
            self._delete_stale_lock()
            try:
                self.writer = xapian.WritableDatabase(self.location, xapian.DB_OPEN)
            except xapian.DatabaseOpeningError as err_msg:
                raise ValueError("Indexer: failed to open xapian database "
                        + "(%s) - maybe it is not a xapian database: %s"
                        % (self.location, str(err_msg)))

    def _writer_close(self):
        """close indexing write access and remove database lock"""
        if self._writer_is_open():
            self.writer.flush()
            self.writer = None

    def _writer_is_open(self):
        """check if the indexing write access is currently open"""
        # the attribute may be missing if __init__ failed early (see __del__)
        return getattr(self, "writer", None) is not None

    def _index_refresh(self):
        """re-read the indexer database

        @raise ValueError: the database could not be opened
        """
        try:
            if self.reader is None:
                self.reader = xapian.Database(self.location)
            else:
                self.reader.reopen()
        except xapian.DatabaseOpeningError as err_msg:
            raise ValueError("Indexer: failed to open xapian database "
                    + "(%s) - maybe it is not a xapian database: %s"
                    % (self.location, str(err_msg)))
429 430
class XapianEnquire(CommonIndexer.CommonEnquire):
    """interface to the xapian object for storing sets of matches"""

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a list of dictionaries with the keys::
                ["rank", "percent", "document", "docid"]
        """
        # self.enquire is not set in this class (presumably by the parent)
        mset = self.enquire.get_mset(start, number)
        found = [
            {
                "rank": item[xapian.MSET_RANK],
                "docid": item[xapian.MSET_DID],
                "percent": item[xapian.MSET_PERCENT],
                "document": item[xapian.MSET_DOCUMENT],
            }
            for item in mset
        ]
        return (mset.size(), mset.get_matches_estimated(), found)
457 458
def _truncate_term_length(term, taken=0):
    """shorten a term string that exceeds the length limit for xapian terms

    A term that fits into the limit (_MAX_TERM_LENGTH - taken) is returned
    unchanged. A longer term is cut down to one character less than this
    limit.

    @param term: the value of the term, that should be truncated
    @type term: str
    @param taken: since a term consists of the name of the term and its
        actual value, this additional parameter can be used to reduce the
        maximum count of possible characters
    @type taken: int
    @return: the truncated string
    @rtype: str
    """
    limit = _MAX_TERM_LENGTH - taken
    if len(term) <= limit:
        return term
    return term[:limit - 1]
476 477
478 -def _extract_fieldvalues(match, (result, fieldnames)):
479 """add a dict of field values to a list 480 481 usually this function should be used together with '_walk_matches' 482 for traversing a list of matches 483 @param match: a single match object 484 @type match: xapian.MSet 485 @param result: the resulting dict will be added to this list 486 @type result: list of dict 487 @param fieldnames: the names of the fields to be added to the dict 488 @type fieldnames: list of str 489 """ 490 # prepare empty dict 491 item_fields = {} 492 # fill the dict 493 for term in match["document"].termlist(): 494 for fname in fieldnames: 495 if ((fname is None) and re.match("[^A-Z]", term.term)): 496 value = term.term 497 elif re.match("%s[^A-Z]" % str(fname).upper(), term.term): 498 value = term.term[len(fname):] 499 else: 500 continue 501 # we found a matching field/term 502 if fname in item_fields: 503 item_fields[fname].append(value) 504 else: 505 item_fields[fname] = [value] 506 result.append(item_fields)
507