Perform word splitting in Unicode to support national characters

Alexander Smishlajev · Alexander Smishlajev · commit 95e3b8dccd2b · 2005-05-22T18:02:00.000Z
[SF#1195739]
diff --git a/roundup/backends/indexer_rdbms.py b/roundup/backends/indexer_rdbms.py
@@ -15,7 +15,7 @@ def close(self):
         '''close the indexing database'''
         # just nuke the circular reference
         self.db = None
-  
+
     def force_reindex(self):
         '''Force a reindexing of the database.  This essentially
         empties the tables ids and index and sets a flag so
@@ -51,7 +51,9 @@ def add_text(self, identifier, text, mime_type='text/plain'):
             self.db.cursor.execute(sql, (id, ))
 
         # ok, find all the words in the text
-        wordlist = re.findall(r'\b\w{2,25}\b', str(text).upper())
+        text = unicode(text, "utf-8", "replace").upper()
+        wordlist = [w.encode("utf-8", "replace")
+                for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
         words = {}
         for word in wordlist:
             if is_stopword(word):
@@ -73,7 +75,7 @@ def find(self, wordlist):
         '''look up all the words in the wordlist.
         If none are found return an empty dictionary
         * more rules here
-        '''        
+        '''
         if not wordlist:
             return {}