1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """
24 interface for the PyLucene (v2.x) indexing engine
25
26 take a look at PyLuceneIndexer1.py for the PyLucene v1.x interface
27 """
28
29 __revision__ = "$Id: PyLuceneIndexer.py 15537 2010-08-15 20:23:28Z alaaosh $"
30
31 import CommonIndexer
32 import re
33 import os
34 import time
35 import logging
36
37
38
39
40 try:
41 import PyLucene
42 _COMPILER = 'gcj'
43 except ImportError:
44
45 import lucene
46 PyLucene = lucene
47 PyLucene.initVM(PyLucene.CLASSPATH)
48 _COMPILER = 'jcc'
49
50
51 UNNAMED_FIELD_NAME = "FieldWithoutAName"
52 MAX_FIELD_SIZE = 1048576
53
54
57
58
60 """manage and use a pylucene indexing database"""
61
62 QUERY_TYPE = PyLucene.Query
63 INDEX_DIRECTORY_NAME = "lucene"
64
65 - def __init__(self, basedir, analyzer=None, create_allowed=True):
66 """initialize or open an indexing database
67
68 Any derived class must override __init__.
69
70 @raise ValueError: the given location exists, but the database type
71 is incompatible (e.g. created by a different indexing engine)
72 @raise OSError: the database failed to initialize
73
74 @param basedir: the parent directory of the database
75 @type basedir: str
76 @param analyzer: bitwise combination of possible analyzer flags
77 to be used as the default analyzer for this database. Leave it empty
78 to use the system default analyzer (self.ANALYZER_DEFAULT).
79 see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
80 @type analyzer: int
81 @param create_allowed: create the database, if necessary; default: True
82 @type create_allowed: bool
83 """
84 jvm = PyLucene.getVMEnv()
85 jvm.attachCurrentThread()
86 super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer,
87 create_allowed=create_allowed)
88 self.pyl_analyzer = PyLucene.StandardAnalyzer()
89 self.writer = None
90 self.reader = None
91 self.index_version = None
92 try:
93
94 tempreader = PyLucene.IndexReader.open(self.location)
95 tempreader.close()
96 except PyLucene.JavaError, err_msg:
97
98
99
100
101
102 if not create_allowed:
103 raise OSError("Indexer: skipping database creation")
104 try:
105
106 parent_path = os.path.dirname(self.location)
107 if not os.path.isdir(parent_path):
108
109 os.makedirs(parent_path)
110 except IOError, err_msg:
111 raise OSError("Indexer: failed to create the parent " \
112 + "directory (%s) of the indexing database: %s" \
113 % (parent_path, err_msg))
114 try:
115 tempwriter = PyLucene.IndexWriter(self.location,
116 self.pyl_analyzer, True)
117 tempwriter.close()
118 except PyLucene.JavaError, err_msg:
119 raise OSError("Indexer: failed to open or create a Lucene" \
120 + " database (%s): %s" % (self.location, err_msg))
121
122
123 numtries = 0
124
125
126 try:
127 while numtries < 10:
128 try:
129 self.reader = PyLucene.IndexReader.open(self.location)
130 self.indexVersion = self.reader.getCurrentVersion(
131 self.location)
132 self.searcher = PyLucene.IndexSearcher(self.reader)
133 break
134 except PyLucene.JavaError, e:
135
136 lock_error_msg = e
137 time.sleep(0.01)
138 numtries += 1
139 else:
140
141 raise OSError("Indexer: failed to lock index database" \
142 + " (%s)" % lock_error_msg)
143 finally:
144 pass
145
146
147 self._index_refresh()
148
def __del__(self):
    """remove lock and close writer after losing the last reference"""
    # closing the writer also releases the exclusive write lock
    self._writer_close()
    if self.reader is not None:
        self.reader.close()
        self.reader = None
    if self.searcher is not None:
        self.searcher.close()
        self.searcher = None
158
def flush(self, optimize=False):
    """flush the content of the database - to force changes to be written
    to disk

    some databases also support index optimization

    @param optimize: should the index be optimized if possible?
    @type optimize: bool
    """
    # remember whether the caller already held write access open, so
    # we can restore that state afterwards
    was_already_open = self._writer_is_open()
    self._writer_open()
    try:
        if optimize:
            self.writer.optimize()
    finally:
        self.writer.flush()
        if not was_already_open:
            # we opened the writer only for this flush - release the
            # exclusive lock again
            self._writer_close()
177
182
def _create_query_for_query(self, query):
    """generate a query based on an existing query object

    basically this function should just create a copy of the original

    @param query: the original query object
    @type query: PyLucene.Query
    @return: resulting query object
    @rtype: PyLucene.Query
    """
    # NOTE(review): the query is returned as-is instead of being copied -
    # apparently no copy operation is necessary/available for PyLucene
    return query
196
def _create_query_for_string(self, text, require_all=True,
        analyzer=None):
    """generate a query for a plain term of a string query

    basically this function parses the string and returns the resulting
    query

    @param text: the query string
    @type text: str
    @param require_all: boolean operator
        (True -> AND (default) / False -> OR)
    @type require_all: bool
    @param analyzer: the analyzer to be used
        possible analyzers are:
            - L{CommonDatabase.ANALYZER_TOKENIZE}
                the field value is splitted to be matched word-wise
            - L{CommonDatabase.ANALYZER_PARTIAL}
                the field value must start with the query string
            - L{CommonDatabase.ANALYZER_EXACT}
                keep special characters and the like
    @type analyzer: int
    @return: resulting query object
    @rtype: PyLucene.Query
    """
    if analyzer is None:
        # fall back to the database-wide default analyzer
        analyzer = self.analyzer
    if analyzer == self.ANALYZER_EXACT:
        # exact matching: do not tokenize the query string
        analyzer_obj = PyLucene.KeywordAnalyzer()
    else:
        # escape Lucene's special characters before parsing
        text = _escape_term_value(text)
        analyzer_obj = PyLucene.StandardAnalyzer()
    qp = PyLucene.QueryParser(UNNAMED_FIELD_NAME, analyzer_obj)
    if (analyzer & self.ANALYZER_PARTIAL > 0):
        # "partial" matching: the field value must start with the text
        text += "*"
    if require_all:
        qp.setDefaultOperator(qp.Operator.AND)
    else:
        qp.setDefaultOperator(qp.Operator.OR)
    return qp.parse(text)
237
def _create_query_for_field(self, field, value, analyzer=None):
    """generate a field query

    this functions creates a field->value query

    @param field: the fieldname to be used
    @type field: str
    @param value: the wanted value of the field
    @type value: str
    @param analyzer: the analyzer to be used
        possible analyzers are:
            - L{CommonDatabase.ANALYZER_TOKENIZE}
                the field value is splitted to be matched word-wise
            - L{CommonDatabase.ANALYZER_PARTIAL}
                the field value must start with the query string
            - L{CommonDatabase.ANALYZER_EXACT}
                keep special characters and the like
    @type analyzer: int
    @return: resulting query object
    @rtype: PyLucene.Query
    """
    if analyzer is None:
        # fall back to the database-wide default analyzer
        analyzer = self.analyzer
    if analyzer == self.ANALYZER_EXACT:
        # exact matching: do not tokenize the value
        analyzer_obj = PyLucene.KeywordAnalyzer()
    else:
        # escape Lucene's special characters before parsing
        value = _escape_term_value(value)
        analyzer_obj = PyLucene.StandardAnalyzer()
    qp = PyLucene.QueryParser(field, analyzer_obj)
    if (analyzer & self.ANALYZER_PARTIAL > 0):
        # "partial" matching: the field value must start with the query
        value += "*"
    return qp.parse(value)
271
def _create_query_combined(self, queries, require_all=True):
    """generate a combined query

    @param queries: list of the original queries
    @type queries: list of PyLucene.Query
    @param require_all: boolean operator
        (True -> AND (default) / False -> OR)
    @type require_all: bool
    @return: the resulting combined query object
    @rtype: PyLucene.Query
    """
    combined_query = PyLucene.BooleanQuery()
    for query in queries:
        # MUST for AND-style, SHOULD for OR-style combination; the
        # second _occur argument ("prohibited") is always False here
        combined_query.add(
                PyLucene.BooleanClause(query, _occur(require_all, False)))
    return combined_query
288
def _create_empty_document(self):
    """create an empty document to be filled and added to the index later

    @return: the new document object
    @rtype: PyLucene.Document
    """
    return PyLucene.Document()
296
def _add_plain_term(self, document, term, tokenize=True):
    """add a term to a document

    @param document: the document to be changed
    @type document: PyLucene.Document
    @param term: a single term to be added
    @type term: str
    @param tokenize: should the term be tokenized automatically
    @type tokenize: bool
    """
    if tokenize:
        token_flag = PyLucene.Field.Index.TOKENIZED
    else:
        token_flag = PyLucene.Field.Index.UN_TOKENIZED
    # terms without an explicit field name are stored in a common
    # pseudo field so that they can be queried later
    document.add(PyLucene.Field(str(UNNAMED_FIELD_NAME), term,
            PyLucene.Field.Store.YES, token_flag))
313
def _add_field_term(self, document, field, term, tokenize=True):
    """add a field term to a document

    @param document: the document to be changed
    @type document: PyLucene.Document
    @param field: name of the field
    @type field: str
    @param term: term to be associated to the field
    @type term: str
    @param tokenize: should the term be tokenized automatically
    @type tokenize: bool
    """
    if tokenize:
        token_flag = PyLucene.Field.Index.TOKENIZED
    else:
        token_flag = PyLucene.Field.Index.UN_TOKENIZED
    # str() guards against unicode/non-string field names reaching Java
    document.add(PyLucene.Field(str(field), term,
            PyLucene.Field.Store.YES, token_flag))
332
def _add_document_to_index(self, document):
    """add a prepared document to the index database

    @param document: the document to be added
    @type document: PyLucene.Document
    """
    # adding needs write access; the writer stays open afterwards (it is
    # closed again by flush / commit_transaction / cancel_transaction)
    self._writer_open()
    self.writer.addDocument(document)
341
def begin_transaction(self):
    """PyLucene does not support transactions

    Thus this function just opens the database for write access.
    Call "cancel_transaction" or "commit_transaction" to close write
    access in order to remove the exclusive lock from the database
    directory.
    """
    # attach the current thread to the JVM before any PyLucene call
    jvm = PyLucene.getVMEnv()
    jvm.attachCurrentThread()
    self._writer_open()
353
def cancel_transaction(self):
    """PyLucene does not support transactions

    Thus this function just closes the database write access and removes
    the exclusive lock.

    See 'start_transaction' for details.
    """
    if self._writer_is_open():
        # discard any buffered (not yet flushed) documents
        self.writer.abort()
    self._writer_close()
365
def commit_transaction(self):
    """PyLucene does not support transactions

    Thus this function just closes the database write access and removes
    the exclusive lock.

    See 'start_transaction' for details.
    """
    self._writer_close()
    # reopen reader/searcher so the new documents become searchable
    self._index_refresh()
376
def get_query_result(self, query):
    """return an object containing the results of a query

    @param query: a pre-compiled query
    @type query: a query object of the real implementation
    @return: an object that allows access to the results
    @rtype: subclass of CommonEnquire
    """
    return PyLuceneHits(self.searcher.search(query))
386
391
def delete_document_by_id(self, docid):
    """delete a specified document

    @param docid: the document ID to be deleted
    @type docid: int
    """
    # deletion goes through the reader; an open writer would hold the
    # exclusive lock, so close it first
    if self._writer_is_open():
        self._writer_close()
    try:
        self.reader.deleteDocument(docid)
    except PyLucene.JavaError:
        # the reader may be stale - refresh it and try once more
        self._index_refresh()
        self.reader.deleteDocument(docid)
405
def search(self, query, fieldnames):
    """return a list of the contents of specified fields for all matches of
    a query

    @param query: the query to be issued
    @type query: a query object of the real implementation
    @param fieldnames: the name(s) of a field of the document content
    @type fieldnames: string | list of strings
    @return: a list of dicts containing the specified field(s)
    @rtype: list of dicts
    """
    # accept a single field name as well as a list of names
    if isinstance(fieldnames, basestring):
        fieldnames = [fieldnames]
    hits = self.searcher.search(query)
    if _COMPILER == 'jcc':
        # the jcc Hits object is not directly iterable - turn it into a
        # list of (position, document) pairs first
        hits = [(hit, hits.doc(hit)) for hit in range(hits.length())]
    result = []
    for _position, doc in hits:
        row = {}
        for name in fieldnames:
            # unnamed terms are stored in the common pseudo field
            if name is None:
                stored_name = UNNAMED_FIELD_NAME
            else:
                stored_name = name
            row[name] = doc.getValues(stored_name)
        result.append(row)
    return result
435
437 if self.reader.isLocked(self.location):
438
439
440 try:
441
442 stat = os.stat(os.path.join(self.location, 'write.lock'))
443 age = (time.time() - stat.st_mtime) / 60
444 if age > 15:
445 logging.warning("stale lock found in %s, removing.", self.location)
446 self.reader.unlock(self.reader.directory())
447 except:
448 pass
449
def _writer_open(self):
    """open write access for the indexing database and acquire an
    exclusive lock

    Does nothing if the writer is already open.
    """
    if not self._writer_is_open():
        self._delete_stale_lock()
        # the third argument ("False") opens the existing index instead
        # of creating a new one
        self.writer = PyLucene.IndexWriter(self.location, self.pyl_analyzer,
                False)
        # "setMaxFieldLength" raises the cap on the number of indexed
        # tokens per field; guard with hasattr since not every PyLucene
        # build provides it
        if hasattr(self.writer, "setMaxFieldLength"):
            self.writer.setMaxFieldLength(MAX_FIELD_SIZE)
463
464
def _writer_close(self):
    """close indexing write access and remove the database lock

    Safe to call when the writer is already closed.
    """
    if self._writer_is_open():
        self.writer.close()
        # "None" marks the writer as closed for _writer_is_open
        self.writer = None
470
472 """check if the indexing write access is currently open"""
473 return not self.writer is None
474
def _index_refresh(self):
    """re-read the indexer database

    Reopens the reader and the searcher if they do not exist yet or if
    the on-disk index version changed since they were opened.
    """
    try:
        if self.reader is None or self.searcher is None:
            # first use - create reader and searcher from scratch
            self.reader = PyLucene.IndexReader.open(self.location)
            self.searcher = PyLucene.IndexSearcher(self.reader)
        elif self.index_version != self.reader.getCurrentVersion( \
                self.location):
            # the index changed on disk - reopen both objects
            self.searcher.close()
            self.reader.close()
            self.reader = PyLucene.IndexReader.open(self.location)
            self.searcher = PyLucene.IndexSearcher(self.reader)
            self.index_version = self.reader.getCurrentVersion(self.location)
    except PyLucene.JavaError, e:
        # NOTE(review): errors are silently ignored here - the previous
        # (possibly stale) reader/searcher simply stay in use
        pass
492
493
495 """an enquire object contains the information about the result of a request
496 """
497
def get_matches(self, start, number):
    """return a specified number of qualified matches of a previous query

    @param start: index of the first match to return (starting from zero)
    @type start: int
    @param number: the number of matching entries to return
    @type number: int
    @return: a set of matching entries and some statistics
    @rtype: tuple of (returned number, available number, matches)
        "matches" is a dictionary of::
            ["rank", "percent", "document", "docid"]
    """
    # robustness fix: a negative start would address the hit list from
    # the end (or fail in the backend) - clamp it to the first match
    if start < 0:
        start = 0
    # do not read beyond the last available hit
    stop = start + number
    if stop > self.enquire.length():
        stop = self.enquire.length()
    # empty range -> empty result
    if stop <= start:
        return (0, self.enquire.length(), [])
    result = []
    for index in range(start, stop):
        item = {}
        item["rank"] = index
        item["docid"] = self.enquire.id(index)
        item["percent"] = self.enquire.score(index)
        item["document"] = self.enquire.doc(index)
        result.append(item)
    return (stop - start, self.enquire.length(), result)
527
def _occur(required, prohibited):
    """map a (required, prohibited) flag pair to the matching
    PyLucene BooleanClause.Occur constant

    @param required: the clause must match
    @type required: bool
    @param prohibited: the clause must not match
    @type prohibited: bool
    @return: the Occur constant, or None for the contradictory
        combination (required and prohibited at the same time)
    """
    # idiom fix: test the truth values directly instead of comparing
    # with "== True" / "== False"
    if required and not prohibited:
        return PyLucene.BooleanClause.Occur.MUST
    if not required and not prohibited:
        return PyLucene.BooleanClause.Occur.SHOULD
    if not required and prohibited:
        return PyLucene.BooleanClause.Occur.MUST_NOT
    # "required and prohibited" is contradictory - the original code
    # silently returned None here; keep that behaviour for callers
    return None
539
def _get_pylucene_version():
    """get the installed pylucene version

    @return: 1 -> PyLucene v1.x / 2 -> PyLucene v2.x / 0 -> unknown
    @rtype: int
    """
    # map the version string prefix to the major release number
    for prefix, major in (("1.", 1), ("2.", 2)):
        if PyLucene.VERSION.startswith(prefix):
            return major
    # anything else (e.g. a future release) is reported as unknown
    return 0
553
554
557