@@ -5,9 +5,15 @@
 import xapian
 
 from roundup.backends.indexer_common import Indexer as IndexerBase
+from roundup.anypy.strings import b2s, s2b
 
 # TODO: we need to delete documents when a property is *reindexed*
 
+# Note that Xapian always uses UTF-8 encoded string, see
+# https://xapian.org/docs/bindings/python3/introduction.html#strings:
+# "Where std::string is returned, it's always mapped to bytes in
+# Python..."
+
 class Indexer(IndexerBase):
     def __init__(self, db):
         IndexerBase.__init__(self, db)
@@ -80,7 +86,7 @@ def add_text(self, identifier, text, mime_type='text/plain'):
         # We use the identifier twice: once in the actual "text" being
         # indexed so we can search on it, and again as the "data" being
         # indexed so we know what we're matching when we get results
-        identifier = '%s:%s:%s' % identifier
+        identifier = s2b('%s:%s:%s' % identifier)
 
         # create the new document
         doc = xapian.Document()
@@ -93,7 +99,7 @@ def add_text(self, identifier, text, mime_type='text/plain'):
             word = match.group(0)
             if self.is_stopword(word):
                 continue
-            term = stemmer(word.lower())
+            term = stemmer(s2b(word.lower()))
             doc.add_posting(term, match.start(0))
 
         database.replace_document(identifier, doc)
@@ -114,12 +120,12 @@ def find(self, wordlist):
         for term in [word.upper() for word in wordlist
                      if self.minlength <= len(word) <= self.maxlength]:
             if not self.is_stopword(term):
-                terms.append(stemmer(term.lower()))
+                terms.append(stemmer(s2b(term.lower())))
         query = xapian.Query(xapian.Query.OP_AND, terms)
 
         enquire.set_query(query)
         matches = enquire.get_mset(0, database.get_doccount())
 
-        return [tuple(m.document.get_data().split(':'))
+        return [tuple(b2s(m.document.get_data()).split(':'))
                 for m in matches]
 
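A minimal sketch (not part of the commit) of the behaviour this change accounts for: under the Python 3 Xapian bindings, anything the C++ API returns as std::string comes back as bytes, so stemmer output and Document.get_data() must be converted at the boundary. It assumes roundup.anypy.strings.s2b/b2s are UTF-8 encode/decode helpers that are no-ops on Python 2.

# Illustrative sketch only; s2b/b2s assumed to UTF-8 encode/decode
# (identity on Python 2).
import xapian
from roundup.anypy.strings import b2s, s2b

stemmer = xapian.Stem('english')

# The stemmer returns bytes under Python 3, so it is fed bytes as well.
term = stemmer(s2b('content'))
assert isinstance(term, bytes)

# Document data round-trips through bytes the same way:
# set_data() accepts bytes and get_data() returns bytes.
doc = xapian.Document()
doc.set_data(s2b('123:files:content'))
assert b2s(doc.get_data()) == '123:files:content'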