1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """
24 Interface to the Xapian indexing engine for the Translate Toolkit
25
26 Xapian v1.0 or higher is supported.
27
28 If you are interested in writing an interface for Xapian 0.x, then
29 you should check out the following::
30 svn export -r 7235 https://translate.svn.sourceforge.net/svnroot/translate/src/branches/translate-search-indexer-generic-merging/translate/search/indexer/
31 It is not completely working, but it should give you a good start.
32 """
33
34 __revision__ = "$Id: XapianIndexer.py 15330 2010-08-05 11:06:53Z alaaosh $"
35
36
37 import sys
38 import re
39
40
# Guard for apache / mod_wsgi environments: the xapian module is only allowed
# to be imported further down when the 'xapian-check' command line tool
# reports at least version 1.0.13. Any failure while determining the version
# (tool missing, unparsable output, ...) is treated as "too old".
if 'apache' in sys.modules or '_apache' in sys.modules or 'mod_wsgi' in sys.modules:

    def _str2version(version):
        """convert a dotted version string to a comparable list of integers

        @param version: a version string, e.g. "1.0.13"
        @type version: str
        @return: the numeric components, e.g. [1, 0, 13]
        @rtype: list of int
        """
        return [int(i) for i in version.split('.')]

    import subprocess

    try:
        command = subprocess.Popen(['xapian-check', '--version'],
                                   stdout=subprocess.PIPE)
        stdout, stderr = command.communicate()
        # re.search with an unanchored raw pattern: a greedy leading '.*'
        # would swallow all but the last digit of a multi-digit major version
        if _str2version(re.search(r'([0-9]+\.[0-9]+\.[0-9]+)',
                                  stdout).groups()[0]) < [1, 0, 13]:
            raise ImportError("Running under apache, can't load xapian")
    except Exception:
        # "Exception" instead of a bare except: KeyboardInterrupt and
        # SystemExit must not be converted into an ImportError
        raise ImportError("Running under apache, can't load xapian")
55
56 import CommonIndexer
57 import xapian
58 import os
59 import time
60 import logging
61
63 return xapian.major_version() > 0
64
65
66
67
68
69 _MAX_TERM_LENGTH = 128
70
71
73 """interface to the xapian (http://xapian.org) indexer
74 """
75
76 QUERY_TYPE = xapian.Query
77 INDEX_DIRECTORY_NAME = "xapian"
78
def __init__(self, basedir, analyzer=None, create_allowed=True):
    """initialize or open a xapian database

    An existing database location is opened read-only. A missing one is
    created (including its parent directory) if 'create_allowed' is set.

    @raise ValueError: the given location exists, but the database type
        is incompatible (e.g. created by a different indexing engine)
    @raise OSError: the database failed to initialize

    @param basedir: the parent directory of the database
    @type basedir: str
    @param analyzer: bitwise combination of possible analyzer flags
        to be used as the default analyzer for this database. Leave it empty
        to use the system default analyzer (self.ANALYZER_DEFAULT).
        see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
    @type analyzer: int
    @param create_allowed: create the database, if necessary; default: True
    @type create_allowed: bool
    """
    # the parent class is expected to set self.location (it is not assigned
    # in this method) - TODO confirm against CommonIndexer
    super(XapianDatabase, self).__init__(basedir, analyzer=analyzer,
                                         create_allowed=create_allowed)
    self.reader = None
    self.writer = None
    if os.path.exists(self.location):
        # try to open an existing database
        try:
            self.reader = xapian.Database(self.location)
        except xapian.DatabaseOpeningError as err_msg:
            raise ValueError("Indexer: failed to open xapian database "
                             "(%s) - maybe it is not a xapian database: %s"
                             % (self.location, str(err_msg)))
    else:
        # create a new database
        if not create_allowed:
            raise OSError("Indexer: skipping database creation")
        try:
            # create the parent directory if it does not exist
            parent_path = os.path.dirname(self.location)
            if not os.path.isdir(parent_path):
                os.makedirs(parent_path)
        except (IOError, OSError) as err_msg:
            # os.makedirs reports failures as OSError; catching only IOError
            # left this handler unreachable on Python 2
            raise OSError("Indexer: failed to create the parent "
                          "directory (%s) of the indexing database: %s"
                          % (parent_path, str(err_msg)))
        try:
            self.writer = xapian.WritableDatabase(self.location,
                                                  xapian.DB_CREATE_OR_OPEN)
            # flush() closes the writer again and opens the reader
            self.flush()
        except xapian.DatabaseOpeningError as err_msg:
            raise OSError("Indexer: failed to open or create a xapian "
                          "database (%s): %s" % (self.location, str(err_msg)))
130
134
def flush(self, optimize=False):
    """write all pending changes to disk and re-read the database

    An open writer is closed first; afterwards the reader is refreshed
    in any case.

    @param optimize: ignored for xapian
    @type optimize: bool
    """
    writer_active = self._writer_is_open()
    if writer_active:
        # closing the writer is what pushes the changes to disk
        self._writer_close()
    # make the reader see the current state of the database
    self._index_refresh()
145
152
154 """generate a query based on an existing query object
155
156 basically this function should just create a copy of the original
157
158 @param query: the original query object
159 @type query: xapian.Query
160 @return: the resulting query object
161 @rtype: xapian.Query
162 """
163
164 return xapian.Query(query)
165
168 """generate a query for a plain term of a string query
169
170 basically this function parses the string and returns the resulting
171 query
172
173 @param text: the query string
174 @type text: str
175 @param require_all: boolean operator
176 (True -> AND (default) / False -> OR)
177 @type require_all: bool
178 @param analyzer: Define query options (partial matching, exact matching,
179 tokenizing, ...) as bitwise combinations of
180 CommonIndexer.ANALYZER_???.
181 This can override previously defined field analyzer settings.
182 If analyzer is None (default), then the configured analyzer for the
183 field is used.
184 @type analyzer: int
185 @return: resulting query object
186 @rtype: xapian.Query
187 """
188 qp = xapian.QueryParser()
189 qp.set_database(self.reader)
190 if require_all:
191 qp.set_default_op(xapian.Query.OP_AND)
192 else:
193 qp.set_default_op(xapian.Query.OP_OR)
194 if analyzer is None:
195 analyzer = self.analyzer
196 if analyzer & self.ANALYZER_PARTIAL > 0:
197 match_flags = xapian.QueryParser.FLAG_PARTIAL
198 return qp.parse_query(text, match_flags)
199 elif analyzer == self.ANALYZER_EXACT:
200
201 return xapian.Query(text)
202 else:
203
204 match_flags = 0
205 return qp.parse_query(text, match_flags)
206
208 """generate a field query
209
210 this function creates a field->value query
211
212 @param field: the fieldname to be used
213 @type field: str
214 @param value: the wanted value of the field
215 @type value: str
216 @param analyzer: Define query options (partial matching, exact matching,
217 tokenizing, ...) as bitwise combinations of
218 CommonIndexer.ANALYZER_???.
219 This can override previously defined field analyzer settings.
220 If analyzer is None (default), then the configured analyzer for the
221 field is used.
222 @type analyzer: int
223 @return: the resulting query object
224 @rtype: xapian.Query
225 """
226 if analyzer is None:
227 analyzer = self.analyzer
228 if analyzer == self.ANALYZER_EXACT:
229
230 return xapian.Query("%s%s" % (field.upper(), value))
231
232 qp = xapian.QueryParser()
233 qp.set_database(self.reader)
234 if (analyzer & self.ANALYZER_PARTIAL > 0):
235
236 match_flags = xapian.QueryParser.FLAG_PARTIAL
237 return qp.parse_query(value, match_flags, field.upper())
238 else:
239
240 match_flags = 0
241 return qp.parse_query(value, match_flags, field.upper())
242
244 """generate a combined query
245
246 @param queries: list of the original queries
247 @type queries: list of xapian.Query
248 @param require_all: boolean operator
249 (True -> AND (default) / False -> OR)
250 @type require_all: bool
251 @return: the resulting combined query object
252 @rtype: xapian.Query
253 """
254 if require_all:
255 query_op = xapian.Query.OP_AND
256 else:
257 query_op = xapian.Query.OP_OR
258 return xapian.Query(query_op, queries)
259
261 """create an empty document to be filled and added to the index later
262
263 @return: the new document object
264 @rtype: xapian.Document
265 """
266 return xapian.Document()
267
269 """add a term to a document
270
271 @param document: the document to be changed
272 @type document: xapian.Document
273 @param term: a single term to be added
274 @type term: str
275 @param tokenize: should the term be tokenized automatically
276 @type tokenize: bool
277 """
278 if tokenize:
279 term_gen = xapian.TermGenerator()
280 term_gen.set_document(document)
281 term_gen.index_text(term)
282 else:
283 document.add_term(_truncate_term_length(term))
284
286 """add a field term to a document
287
288 @param document: the document to be changed
289 @type document: xapian.Document
290 @param field: name of the field
291 @type field: str
292 @param term: term to be associated to the field
293 @type term: str
294 @param tokenize: should the term be tokenized automatically
295 @type tokenize: bool
296 """
297 if tokenize:
298 term_gen = xapian.TermGenerator()
299 term_gen.set_document(document)
300 term_gen.index_text(term, 1, field.upper())
301 else:
302 document.add_term(_truncate_term_length("%s%s" % \
303 (field.upper(), term)))
304
306 """add a prepared document to the index database
307
308 @param document: the document to be added
309 @type document: xapian.Document
310 """
311
312 self._writer_open()
313 self.writer.add_document(document)
314
316 """begin a transaction
317
318 Xapian supports transactions to group multiple database modifications.
319 This avoids intermediate flushing and therefore increases performance.
320 """
321 self._writer_open()
322 self.writer.begin_transaction()
323
325 """cancel an ongoing transaction
326
327 no changes since the last execution of 'begin_transaction' are written
328 """
329 self.writer.cancel_transaction()
330 self._writer_close()
331
333 """submit the changes of an ongoing transaction
334
335 all changes since the last execution of 'begin_transaction' are written
336 """
337 self.writer.commit_transaction()
338 self._writer_close()
339
341 """return an object containing the results of a query
342
343 @param query: a pre-compiled xapian query
344 @type query: xapian.Query
345 @return: an object that allows access to the results
346 @rtype: XapianIndexer.CommonEnquire
347 """
348 enquire = xapian.Enquire(self.reader)
349 enquire.set_query(query)
350 return XapianEnquire(enquire)
351
353 """delete a specified document
354
355 @param docid: the document ID to be deleted
356 @type docid: int
357 """
358
359 self._writer_open()
360 try:
361 self.writer.delete_document(docid)
362 return True
363 except xapian.DocNotFoundError:
364 return False
365
def search(self, query, fieldnames):
    """return a list of the contents of specified fields for all matches of
    a query

    If the database is modified while the matches are read, the reader is
    refreshed and the query is executed a second time.

    @param query: the query to be issued
    @type query: xapian.Query
    @param fieldnames: the name(s) of a field of the document content
    @type fieldnames: string | list of strings
    @return: a list of dicts containing the specified field(s)
    @rtype: list of dicts
    """
    result = []
    # accept a single field name as well as a list of names
    if isinstance(fieldnames, basestring):
        fieldnames = [fieldnames]
    try:
        self._walk_matches(query, _extract_fieldvalues, (result, fieldnames))
    except xapian.DatabaseModifiedError:
        # matches handled before the error may already have been appended;
        # drop them, otherwise the retry would return them twice
        del result[:]
        self._index_refresh()
        self._walk_matches(query, _extract_fieldvalues, (result, fieldnames))
    return result
386
388 if not self._writer_is_open():
389 lockfile = os.path.join(self.location, 'flintlock')
390 if os.path.exists(lockfile) and (time.time() - os.path.getmtime(lockfile)) / 60 > 15:
391 logging.warning("stale lock found in %s, removing.", self.location)
392 os.remove(lockfile)
393
395 """open write access for the indexing database and acquire an exclusive lock"""
396 if not self._writer_is_open():
397 self._delete_stale_lock()
398 try:
399 self.writer = xapian.WritableDatabase(self.location, xapian.DB_OPEN)
400 except xapian.DatabaseOpeningError, err_msg:
401
402 raise ValueError("Indexer: failed to open xapian database " \
403 + "(%s) - maybe it is not a xapian database: %s" \
404 % (self.location, str(err_msg)))
405
407 """close indexing write access and remove database lock"""
408 if self._writer_is_open():
409 self.writer.flush()
410 self.writer = None
411
413 """check if the indexing write access is currently open"""
414 return not self.writer is None
415
417 """re-read the indexer database"""
418 try:
419 if self.reader is None:
420 self.reader = xapian.Database(self.location)
421 else:
422 self.reader.reopen()
423 except xapian.DatabaseOpeningError, err_msg:
424 raise ValueError("Indexer: failed to open xapian database " \
425 + "(%s) - maybe it is not a xapian database: %s" \
426 % (self.location, str(err_msg)))
427
428
430 """interface to the xapian object for storing sets of matches
431 """
432
434 """return a specified number of qualified matches of a previous query
435
436 @param start: index of the first match to return (starting from zero)
437 @type start: int
438 @param number: the number of matching entries to return
439 @type number: int
440 @return: a set of matching entries and some statistics
441 @rtype: tuple of (returned number, available number, matches)
442 "matches" is a dictionary of::
443 ["rank", "percent", "document", "docid"]
444 """
445 matches = self.enquire.get_mset(start, number)
446 result = []
447 for match in matches:
448 elem = {}
449 elem["rank"] = match[xapian.MSET_RANK]
450 elem["docid"] = match[xapian.MSET_DID]
451 elem["percent"] = match[xapian.MSET_PERCENT]
452 elem["document"] = match[xapian.MSET_DOCUMENT]
453 result.append(elem)
454 return (matches.size(), matches.get_matches_estimated(), result)
455
456
458 """truncate the length of a term string length to the maximum allowed
459 for xapian terms
460
461 @param term: the value of the term, that should be truncated
462 @type term: str
463 @param taken: since a term consists of the name of the term and its
464 actual value, this additional parameter can be used to reduce the
465 maximum count of possible characters
466 @type taken: int
467 @return: the truncated string
468 @rtype: str
469 """
470 if len(term) > _MAX_TERM_LENGTH - taken:
471 return term[0:_MAX_TERM_LENGTH - taken - 1]
472 else:
473 return term
474
476 """add a dict of field values to a list
477
478 usually this function should be used together with '_walk_matches'
479 for traversing a list of matches
480 @param match: a single match object
481 @type match: xapian.MSet
482 @param result: the resulting dict will be added to this list
483 @type result: list of dict
484 @param fieldnames: the names of the fields to be added to the dict
485 @type fieldnames: list of str
486 """
487
488 item_fields = {}
489
490 for term in match["document"].termlist():
491 for fname in fieldnames:
492 if ((fname is None) and re.match("[^A-Z]", term.term)):
493 value = term.term
494 elif re.match("%s[^A-Z]" % str(fname).upper(), term.term):
495 value = term.term[len(fname):]
496 else:
497 continue
498
499 if item_fields.has_key(fname):
500 item_fields[fname].append(value)
501 else:
502 item_fields[fname] = [value]
503 result.append(item_fields)
504