Skip to content

Commit caf4c97

Browse files
committed
Add indexer_language to change stemmer for xapian FTS indexer
Nagy Gabor asked how to enable the Hungarian stemmer in roundup. This required editing indexer_xapian.py, replacing the hardcoded "english" term. This value is now exposed in the config file under [main] indexer_language. This only works for xapian currently.
1 parent 5d24476 commit caf4c97

File tree

5 files changed

+45
-3
lines changed

5 files changed

+45
-3
lines changed

CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,9 @@ Features:
9999
customizing.txt. Result of mailing list question. (John Rouillard)
100100
- issue2551109 - Improve keyword editing in jinja2 template. (Cedric Krier)
101101
- issue2551117 - Add example systemd config
102+
- Allow admin to configure language used for stemming in xapian
103+
indexer. (John Rouillard request by Nagy Gabor)
104+
102105

103106
2020-07-13 2.0.0
104107

roundup/backends/indexer_common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def __init__(self, db):
2323
# gibberish (encoded text or somesuch) or shorter than 2 characters
2424
self.minlength = 2
2525
self.maxlength = 25
26+
self.language = db.config[('main','indexer_language')]
2627

2728
def is_stopword(self, word):
2829
return word in self.stopwords

roundup/backends/indexer_xapian.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from roundup.backends.indexer_common import Indexer as IndexerBase
88
from roundup.anypy.strings import b2s, s2b
9+
from roundup.i18n import _
910

1011
# TODO: we need to delete documents when a property is *reindexed*
1112

@@ -21,6 +22,18 @@ def __init__(self, db):
2122
self.reindex = 0
2223
self.transaction_active = False
2324

25+
# self.language defined in IndexerBase.__init__
26+
# validate it here
27+
try:
28+
xapian.Stem(self.language)
29+
except xapian.InvalidArgumentError:
30+
raise ValueError(
31+
_("Invalid indexer_language %(lang)s for xapian indexer\n"
32+
"Valid languages: %(valid)s") % {
33+
"lang": self.language,
34+
"valid": b2s(xapian.Stem.get_available_languages()) }
35+
)
36+
2437
def _get_database(self):
2538
index = os.path.join(self.db_path, 'text-index')
2639
for n in range(10):
@@ -80,8 +93,7 @@ def add_text(self, identifier, text, mime_type='text/plain'):
8093
#database.begin_transaction()
8194
#self.transaction_active = True
8295

83-
# TODO: allow configuration of other languages
84-
stemmer = xapian.Stem("english")
96+
stemmer = xapian.Stem(self.language)
8597

8698
# We use the identifier twice: once in the actual "text" being
8799
# indexed so we can search on it, and again as the "data" being
@@ -115,7 +127,7 @@ def find(self, wordlist):
115127
database = self._get_database()
116128

117129
enquire = xapian.Enquire(database)
118-
stemmer = xapian.Stem("english")
130+
stemmer = xapian.Stem(self.language)
119131
terms = []
120132
for term in [word.upper() for word in wordlist
121133
if self.minlength <= len(word) <= self.maxlength]:

roundup/configuration.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -746,6 +746,12 @@ def str2value(self, value):
746746
"If no indexer is supplied, the first available indexer\n"
747747
"will be used in the following order:\n"
748748
"Possible values: xapian, whoosh, native (internal)."),
749+
(Option, "indexer_language", "english",
750+
"Used to determine what language should be used by the\n"
751+
"indexer above. Currently only affects Xapian indexer. It\n"
752+
"sets the language for the stemmer.\n"
753+
"Possible values: must be a valid language for the indexer,\n"
754+
"see indexer documentation for details."),
749755
(WordListOption, "indexer_stopwords", "",
750756
"Additional stop-words for the full-text indexer specific to\n"
751757
"your tracker. See the indexer source for the default list of\n"

test/test_indexer.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ class config(dict):
5555
DATABASE = 'test-index'
5656
config = config()
5757
config[('main', 'indexer_stopwords')] = []
58+
config[('main', 'indexer_language')] = "english"
5859

5960
class IndexerTest(unittest.TestCase):
6061
def setUp(self):
@@ -194,7 +195,26 @@ def setUp(self):
194195
self.dex = Indexer(db)
195196
def tearDown(self):
196197
shutil.rmtree('test-index')
198+
def test_invalid_language(self):
199+
""" make sure we have a reasonable error message if
200+
invalid language is specified """
201+
l = db.config[('main', 'indexer_language')]
202+
db.config[('main', 'indexer_language')] = "NO_LANG"
203+
from roundup.backends.indexer_xapian import Indexer
204+
with self.assertRaises(ValueError) as cm:
205+
Indexer(db)
206+
# note if Indexer(db) doesn't raise ValueError
207+
# all Xapian tests after this point will fail.
208+
# because a valid language will not be set.
209+
# reset the valid language.
210+
db.config[('main', 'indexer_language')] = l
197211

212+
print(cm)
213+
self.assertIn("ValueError", repr(cm.exception))
214+
# look for failing language
215+
self.assertIn("NO_LANG", cm.exception.args[0])
216+
# look for supported language
217+
self.assertIn("english", cm.exception.args[0])
198218

199219
class RDBMSIndexerTest(object):
200220
def setUp(self):

0 commit comments

Comments
 (0)