Skip to content

Commit 565f3d8

Browse files
author
Richard Jones
committed
added Xapian indexer; replaces standard indexers if Xapian is available
1 parent b88c76a commit 565f3d8

File tree

9 files changed

+159
-28
lines changed

9 files changed

+159
-28
lines changed

CHANGES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ are given with the most recent entry first.
55
Feature:
66
- added "imapServer.py" script (sf patch 934567)
77
- added date selection popup windows (thanks Marcus Priesch)
8+
- added Xapian indexer; replaces standard indexers if Xapian is available
89

910

1011
2005-??-?? 0.8.3

roundup/admin.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
1717
# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
1818
#
19-
# $Id: admin.py,v 1.91 2005-04-13 07:01:05 richard Exp $
19+
# $Id: admin.py,v 1.92 2005-04-28 00:21:41 richard Exp $
2020

2121
'''Administration commands for maintaining Roundup trackers.
2222
'''
@@ -407,6 +407,7 @@ def do_install(self, tracker_home, args):
407407
init.write_select_db(tracker_home, backend)
408408

409409
print _("""
410+
---------------------------------------------------------------------------
410411
You should now edit the tracker configuration file:
411412
%(config_file)s""") % {"config_file": config_ini_file}
412413

@@ -427,6 +428,10 @@ def do_install(self, tracker_home, args):
427428
You may also change the database initialisation file:
428429
%(database_init_file)s
429430
... see the documentation on customizing for more information.
431+
432+
You MUST run the "roundup-admin initialise" command once you've performed
433+
the above steps.
434+
---------------------------------------------------------------------------
430435
""") % {
431436
'database_config_file': os.path.join(tracker_home, 'schema.py'),
432437
'database_init_file': os.path.join(tracker_home, 'initial_data.py'),

roundup/backends/back_anydbm.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
1616
# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
1717
#
18-
#$Id: back_anydbm.py,v 1.186 2005-03-03 22:16:32 richard Exp $
18+
#$Id: back_anydbm.py,v 1.187 2005-04-28 00:21:42 richard Exp $
1919
'''This module defines a backend that saves the hyperdatabase in a
2020
database chosen by anydbm. It is guaranteed to always be available in python
2121
versions >2.1.1 (the dumbdbm fallback in 2.1.1 and earlier has several
@@ -41,7 +41,11 @@
4141

4242
from blobfiles import FileStorage
4343
from sessions_dbm import Sessions, OneTimeKeys
44-
from indexer_dbm import Indexer
44+
45+
try:
46+
from indexer_xapian import Indexer
47+
except ImportError:
48+
from indexer_dbm import Indexer
4549

4650
def db_exists(config):
4751
# check for the user db
@@ -90,7 +94,7 @@ def __init__(self, config, journaltag=None):
9094
self.newnodes = {} # keep track of the new nodes by class
9195
self.destroyednodes = {}# keep track of the destroyed nodes by class
9296
self.transactions = []
93-
self.indexer = Indexer(self.dir)
97+
self.indexer = Indexer(self)
9498
self.security = security.Security(self)
9599
# ensure files are group readable and writable
96100
os.umask(0002)

roundup/backends/back_metakit.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# $Id: back_metakit.py,v 1.93 2005-03-03 22:16:32 richard Exp $
1+
# $Id: back_metakit.py,v 1.94 2005-04-28 00:21:42 richard Exp $
22
'''Metakit backend for Roundup, originally by Gordon McMillan.
33
44
Known Current Bugs:
@@ -89,7 +89,7 @@ def __init__(self, config, journaltag=None):
8989
self.dirty = 0
9090
self.lockfile = None
9191
self._db = self.__open()
92-
self.indexer = Indexer(self.config.DATABASE, self._db)
92+
self.indexer = Indexer(self)
9393
self.security = security.Security(self)
9494

9595
self.stats = {'cache_hits': 0, 'cache_misses': 0, 'get_items': 0,
@@ -2000,11 +2000,11 @@ def __init__(self, db, classname, **properties):
20002000

20012001
CURVERSION = 2
20022002

2003-
class Indexer(Indexer):
2004-
def __init__(self, path, datadb):
2005-
self.path = os.path.join(path, 'index.mk4')
2003+
class MetakitIndexer(Indexer):
2004+
def __init__(self, db):
2005+
self.path = os.path.join(db.config.DATABASE, 'index.mk4')
20062006
self.db = metakit.storage(self.path, 1)
2007-
self.datadb = datadb
2007+
self.datadb = db._db
20082008
self.reindex = 0
20092009
v = self.db.view('version')
20102010
if not v.structure():
@@ -2135,4 +2135,9 @@ def rollback(self):
21352135
self.db = metakit.storage(self.path, 1)
21362136
self.changed = 0
21372137

2138+
try:
2139+
from indexer_xapian import Indexer
2140+
except ImportError:
2141+
Indexer = MetakitIndexer
2142+
21382143
# vim: set et sts=4 sw=4 :

roundup/backends/indexer_dbm.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# that promote freedom, but obviously am giving up any rights
1515
# to compel such.
1616
#
17-
#$Id: indexer_dbm.py,v 1.5 2005-01-08 16:16:59 jlgijsbers Exp $
17+
#$Id: indexer_dbm.py,v 1.6 2005-04-28 00:21:42 richard Exp $
1818
'''This module provides an indexer class, RoundupIndexer, that stores text
1919
indices in a roundup instance. This class makes searching the content of
2020
messages, string properties and text files possible.
@@ -37,8 +37,8 @@ class Indexer(Indexer):
3737
3838
where identifier is (classname, nodeid, propertyname)
3939
'''
40-
def __init__(self, db_path):
41-
self.indexdb_path = os.path.join(db_path, 'indexes')
40+
def __init__(self, db):
41+
self.indexdb_path = os.path.join(db.config.DATABASE, 'indexes')
4242
self.indexdb = os.path.join(self.indexdb_path, 'index.db')
4343
self.reindex = 0
4444
self.quiet = 9

roundup/backends/indexer_rdbms.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#$Id: indexer_rdbms.py,v 1.8 2005-01-08 16:16:59 jlgijsbers Exp $
1+
#$Id: indexer_rdbms.py,v 1.9 2005-04-28 00:21:42 richard Exp $
22
''' This implements the full-text indexer over two RDBMS tables. The first
33
is a mapping of words to occurance IDs. The second maps the IDs to (Class,
44
propname, itemid) instances.
@@ -16,6 +16,11 @@ def close(self):
1616
'''close the indexing database'''
1717
# just nuke the circular reference
1818
self.db = None
19+
20+
def save_index(self):
21+
'''Save the changes to the index.'''
22+
# not necessary - the RDBMS connection will handle this for us
23+
pass
1924

2025
def force_reindex(self):
2126
'''Force a reindexing of the database. This essentially

roundup/backends/indexer_xapian.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
#$Id: indexer_xapian.py,v 1.1 2005-04-28 00:21:42 richard Exp $
2+
''' This implements the full-text indexer using the Xapian indexer.
3+
'''
4+
import re, os
5+
6+
import xapian
7+
8+
from indexer_common import Indexer, is_stopword
9+
10+
# TODO: we need to delete documents when a property is *reindexed*
11+
12+
class Indexer(Indexer):
    '''Full-text indexer backed by a Xapian database.

    The Xapian database is kept in the "text-index" directory underneath
    the tracker's database directory (db.config.DATABASE). Every indexed
    text becomes one Xapian document whose data is the string
    "classname:itemid:property" identifying where the text came from.
    '''

    # Inclusive bounds on the length of an indexable word. add_text() and
    # find() must apply the same bounds, otherwise words get indexed that
    # can never be searched for.
    minlength = 2
    maxlength = 25

    def __init__(self, db):
        '''Remember where the index lives; "db" must provide
        db.config.DATABASE (the tracker database directory).
        '''
        self.db_path = db.config.DATABASE
        self.reindex = 0
        self.transaction_active = False

    def _get_database(self):
        '''Open the Xapian database, creating it if it does not exist.'''
        index = os.path.join(self.db_path, 'text-index')
        return xapian.WritableDatabase(index, xapian.DB_CREATE_OR_OPEN)

    def save_index(self):
        '''Save the changes to the index.

        This is a no-op unless a transaction was marked active; add_text()
        currently never starts one (see the note there).
        '''
        if not self.transaction_active:
            return
        # XXX: Xapian databases don't actually implement transactions yet
        database = self._get_database()
        database.commit_transaction()
        self.transaction_active = False

    def close(self):
        '''close the indexing database'''
        # nothing to do: no database handle is held between calls
        pass

    def force_reindex(self):
        '''Flag the index as needing to be rebuilt; should_reindex() will
        report true from now on.
        '''
        self.reindex = 1

    def should_reindex(self):
        '''returns True if the indexes need to be rebuilt'''
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        '''Index "text" under "identifier".

        "identifier" is (classname, itemid, property). Only 'text/plain'
        content is indexed; any other mime type is silently ignored.
        '''
        if mime_type != 'text/plain':
            return

        database = self._get_database()
        # XXX: Xapian databases don't actually implement transactions yet,
        # so no transaction is begun here and self.transaction_active is
        # left untouched.

        # TODO: we need to delete documents when a property is *reindexed*
        # TODO: allow configuration of other languages
        stemmer = xapian.Stem("english")
        doc = xapian.Document()

        # Xapian doesn't actually seem to care what data is put in here, so
        # we use it to store the text identifier.
        doc.set_data('%s:%s:%s'%identifier)

        word_pattern = r'\b\w{%d,%d}\b'%(self.minlength, self.maxlength)
        for match in re.finditer(word_pattern, text.upper()):
            word = match.group(0)
            if is_stopword(word):
                continue
            term = stemmer.stem_word(word)
            # the posting position is the character offset of the word
            doc.add_posting(term, match.start(0))
        database.add_document(doc)

    def find(self, wordlist):
        '''Look up the items whose indexed text contains all the words.

        Words are normalised exactly as add_text() does: upper-cased,
        restricted to minlength..maxlength characters, stop words dropped,
        then stemmed. Returns a list of (classname, itemid, property)
        tuples, one per matching document. An empty "wordlist" returns an
        empty dictionary (kept for backwards compatibility); a wordlist
        with no searchable words returns an empty list.
        '''
        if not wordlist:
            return {}

        database = self._get_database()

        stemmer = xapian.Stem("english")
        terms = []
        for word in wordlist:
            if not self.minlength <= len(word) <= self.maxlength:
                continue
            term = word.upper()
            # stop words are never indexed, so requiring one in an AND
            # query would make every search containing it fail
            if is_stopword(term):
                continue
            terms.append(stemmer.stem_word(term))
        if not terms:
            return []

        enquire = xapian.Enquire(database)
        enquire.set_query(xapian.Query(xapian.Query.OP_AND, terms))
        # ask for every possible match rather than an arbitrary first 10
        matches = enquire.get_mset(0, database.get_doccount())

        return [tuple(m[xapian.MSET_DOCUMENT].get_data().split(':'))
            for m in matches]
94+

roundup/backends/rdbms_common.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# $Id: rdbms_common.py,v 1.151 2005-03-03 22:16:32 richard Exp $
1+
# $Id: rdbms_common.py,v 1.152 2005-04-28 00:21:42 richard Exp $
22
''' Relational database (SQL) backend common code.
33
44
Basics:
@@ -39,7 +39,10 @@
3939

4040
# support
4141
from blobfiles import FileStorage
42-
from indexer_rdbms import Indexer
42+
try:
43+
from indexer_xapian import Indexer
44+
except ImportError:
45+
from indexer_rdbms import Indexer
4346
from sessions_rdbms import Sessions, OneTimeKeys
4447
from roundup.date import Range
4548

@@ -321,6 +324,7 @@ def reindex(self, classname=None):
321324
for klass in classes:
322325
for nodeid in klass.list():
323326
klass.index(nodeid)
327+
self.indexer.save_index()
324328

325329
hyperdb_to_sql_datatypes = {
326330
hyperdb.String : 'TEXT',
@@ -1177,6 +1181,9 @@ def commit(self):
11771181
for method, args in self.transactions:
11781182
method(*args)
11791183

1184+
# save the indexer
1185+
self.indexer.save_index()
1186+
11801187
# clear out the transactions
11811188
self.transactions = []
11821189

test/test_indexer.py

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,39 +18,49 @@
1818
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1919
# SOFTWARE.
2020

21-
# $Id: test_indexer.py,v 1.6 2005-01-05 22:28:32 jlgijsbers Exp $
21+
# $Id: test_indexer.py,v 1.7 2005-04-28 00:21:42 richard Exp $
2222

2323
import os, unittest, shutil
2424

25-
from roundup.backends.indexer_dbm import Indexer
26-
2725
class IndexerTest(unittest.TestCase):
2826
def setUp(self):
2927
if os.path.exists('test-index'):
3028
shutil.rmtree('test-index')
3129
os.mkdir('test-index')
3230
os.mkdir('test-index/files')
31+
from roundup.backends.indexer_dbm import Indexer
3332
self.dex = Indexer('test-index')
3433
self.dex.load_index()
3534

3635
def test_basics(self):
37-
self.dex.add_text('testing1', 'a the hello world')
38-
self.assertEqual(self.dex.words, {'HELLO': {1: 1}, 'WORLD': {1: 1}})
39-
self.dex.add_text('testing2', 'blah blah the world')
40-
self.assertEqual(self.dex.words, {'BLAH': {2: 2}, 'HELLO': {1: 1},
41-
'WORLD': {2: 1, 1: 1}})
42-
self.assertEqual(self.dex.find(['world']), ['testing1',
43-
'testing2'])
44-
self.assertEqual(self.dex.find(['blah']), ['testing2'])
36+
self.dex.add_text(('test', '1', 'foo'), 'a the hello world')
37+
self.dex.add_text(('test', '2', 'foo'), 'blah blah the world')
38+
self.assertEqual(self.dex.find(['world']), [('test', '1', 'foo'),
39+
('test', '2', 'foo')])
40+
self.assertEqual(self.dex.find(['blah']), [('test', '2', 'foo')])
4541
self.assertEqual(self.dex.find(['blah', 'hello']), [])
46-
self.dex.save_index()
4742

4843
def tearDown(self):
4944
shutil.rmtree('test-index')
5045

46+
class XapianIndexerTest(IndexerTest):
    '''Run the inherited indexer tests against the Xapian indexer.'''

    class _StubDatabase:
        '''Minimal stand-in for a hyperdb database.

        The Xapian indexer reads db.config.DATABASE and keeps its index
        in the "text-index" directory below it; '.' puts the index at
        ./text-index, which setUp() and tearDown() clean up.
        '''
        class config:
            DATABASE = '.'

    def setUp(self):
        if os.path.exists('text-index'):
            shutil.rmtree('text-index')
        from roundup.backends.indexer_xapian import Indexer
        # the indexer takes a database object, not a path
        self.dex = Indexer(self._StubDatabase())

    def tearDown(self):
        # the index directory only exists once the indexer has opened its
        # database, so don't mask an earlier failure with a rmtree error
        if os.path.exists('text-index'):
            shutil.rmtree('text-index')
54+
5155
def test_suite():
5256
suite = unittest.TestSuite()
5357
suite.addTest(unittest.makeSuite(IndexerTest))
58+
try:
59+
import xapian
60+
suite.addTest(unittest.makeSuite(XapianIndexerTest))
61+
except ImportError:
62+
print "Skipping Xapian indexer tests"
63+
pass
5464
return suite
5565

5666
if __name__ == '__main__':

0 commit comments

Comments
 (0)