|
| 1 | +#$Id: indexer_xapian.py,v 1.1 2005-04-28 00:21:42 richard Exp $ |
| 2 | +''' This implements the full-text indexer using the Xapian indexer. |
| 3 | +''' |
| 4 | +import re, os |
| 5 | + |
| 6 | +import xapian |
| 7 | + |
| 8 | +from indexer_common import Indexer, is_stopword |
| 9 | + |
| 10 | +# TODO: we need to delete documents when a property is *reindexed* |
| 11 | + |
class Indexer(Indexer):
    ''' Full-text indexer that stores its index in a Xapian database.

    The database lives in the "text-index" directory underneath the
    tracker's DATABASE directory. A fresh database handle is opened for
    every operation; nothing is cached on the instance.
    '''
    def __init__(self, db):
        ''' "db" must expose the tracker configuration as "db.config";
        only its DATABASE setting is read here.
        '''
        self.db_path = db.config.DATABASE
        # set by force_reindex(), reported by should_reindex()
        self.reindex = 0
        # nothing in this class currently sets this to True - see the
        # disabled begin_transaction() call in add_text()
        self.transaction_active = False

    def _get_database(self):
        ''' Open (creating it first if necessary) the writable Xapian
        database and return it.
        '''
        index = os.path.join(self.db_path, 'text-index')
        return xapian.WritableDatabase(index, xapian.DB_CREATE_OR_OPEN)

    def save_index(self):
        '''Save the changes to the index.

        Does nothing unless a transaction has been flagged as active.
        '''
        if not self.transaction_active:
            return
        # XXX: Xapian databases don't actually implement transactions yet
        database = self._get_database()
        database.commit_transaction()
        self.transaction_active = False

    def close(self):
        '''close the indexing database

        There is nothing to do: no database handle is kept open
        between calls.
        '''
        pass

    def force_reindex(self):
        '''Force a reindexing of the database. This only sets the flag
        reported by should_reindex(); the existing index contents are
        not touched here.
        '''
        self.reindex = 1

    def should_reindex(self):
        '''returns True if the indexes need to be rebuilt'''
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        ''' Index "text" under "identifier".

        "identifier" is (classname, itemid, property)

        Only 'text/plain' content is indexed; any other mime type is
        silently ignored. Every call adds a new document - an earlier
        document for the same identifier is not removed.
        '''
        if mime_type != 'text/plain':
            return

        # open the database and start a transaction if needed
        database = self._get_database()
        # XXX: Xapian databases don't actually implement transactions yet
        #if not self.transaction_active:
            #database.begin_transaction()
            #self.transaction_active = True

        # TODO: allow configuration of other languages
        stemmer = xapian.Stem("english")
        doc = xapian.Document()

        # Xapian doesn't actually seem to care what data is put in here, so
        # we use it to store the text identifier.
        doc.set_data('%s:%s:%s'%identifier)

        # index the stemmed form of every 2 to 25 character word that is
        # not a stopword, using the word's offset in the text as its
        # position
        for match in re.finditer(r'\b\w{2,25}\b', text.upper()):
            word = match.group(0)
            if is_stopword(word):
                continue
            term = stemmer.stem_word(word)
            doc.add_posting(term, match.start(0))
        database.add_document(doc)

    def find(self, wordlist):
        '''look up all the words in the wordlist.

        Returns a list of (classname, itemid, property) tuples, one for
        each matching document; only the first 10 matches are fetched.
        An empty "wordlist" returns an empty dictionary (kept as-is for
        existing callers).

        Words are upper-cased and stemmed the same way add_text() does.
        Words shorter than 3 or longer than 25 characters are ignored,
        and so are stopwords: add_text() never indexes stopwords, so
        requiring one in the AND query would make every search that
        contains one come back empty.
        '''
        if not wordlist:
            return {}

        stemmer = xapian.Stem("english")
        terms = []
        for word in wordlist:
            if not 2 < len(word) < 26:
                continue
            word = word.upper()
            if is_stopword(word):
                # never indexed, so it can never match
                continue
            terms.append(stemmer.stem_word(word))

        if not terms:
            # nothing searchable is left - don't bother opening the
            # database for an empty query
            return []

        database = self._get_database()
        enquire = xapian.Enquire(database)

        # every remaining term must be present in a matching document
        query = xapian.Query(xapian.Query.OP_AND, terms)
        enquire.set_query(query)
        matches = enquire.get_mset(0, 10)

        # the document data is the 'classname:itemid:property' string
        # stored by add_text()
        return [tuple(m[xapian.MSET_DOCUMENT].get_data().split(':'))
            for m in matches]
| 94 | + |
0 commit comments