Skip to content

Commit 47da89f

Browse files
committed
fixed encoding issues for Xapian indexer
1 parent 83895b5 commit 47da89f

File tree

1 file changed

+10
-4
lines changed

1 file changed

+10
-4
lines changed

roundup/backends/indexer_xapian.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,15 @@
55
import xapian
66

77
from roundup.backends.indexer_common import Indexer as IndexerBase
8+
from roundup.anypy.strings import b2s, s2b
89

910
# TODO: we need to delete documents when a property is *reindexed*
1011

12+
# Note that Xapian always uses UTF-8 encoded strings; see
13+
# https://xapian.org/docs/bindings/python3/introduction.html#strings:
14+
# "Where std::string is returned, it's always mapped to bytes in
15+
# Python..."
16+
1117
class Indexer(IndexerBase):
1218
def __init__(self, db):
1319
IndexerBase.__init__(self, db)
@@ -80,7 +86,7 @@ def add_text(self, identifier, text, mime_type='text/plain'):
8086
# We use the identifier twice: once in the actual "text" being
8187
# indexed so we can search on it, and again as the "data" being
8288
# indexed so we know what we're matching when we get results
83-
identifier = '%s:%s:%s'%identifier
89+
identifier = s2b('%s:%s:%s'%identifier)
8490

8591
# create the new document
8692
doc = xapian.Document()
@@ -93,7 +99,7 @@ def add_text(self, identifier, text, mime_type='text/plain'):
9399
word = match.group(0)
94100
if self.is_stopword(word):
95101
continue
96-
term = stemmer(word.lower())
102+
term = stemmer(s2b(word.lower()))
97103
doc.add_posting(term, match.start(0))
98104

99105
database.replace_document(identifier, doc)
@@ -114,12 +120,12 @@ def find(self, wordlist):
114120
for term in [word.upper() for word in wordlist
115121
if self.minlength <= len(word) <= self.maxlength]:
116122
if not self.is_stopword(term):
117-
terms.append(stemmer(term.lower()))
123+
terms.append(stemmer(s2b(term.lower())))
118124
query = xapian.Query(xapian.Query.OP_AND, terms)
119125

120126
enquire.set_query(query)
121127
matches = enquire.get_mset(0, database.get_doccount())
122128

123-
return [tuple(m.document.get_data().split(':'))
129+
return [tuple(b2s(m.document.get_data()).split(':'))
124130
for m in matches]
125131

0 commit comments

Comments
 (0)