Skip to content

Commit 95e3b8d

Browse files
author
Alexander Smishlajev
committed
Perform word splitting in Unicode to support national (non-ASCII) characters
[SF#1195739]
1 parent ba3204d commit 95e3b8d

File tree

1 file changed

+5
-3
lines changed

1 file changed

+5
-3
lines changed

roundup/backends/indexer_rdbms.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ def close(self):
1515
'''close the indexing database'''
1616
# just nuke the circular reference
1717
self.db = None
18-
18+
1919
def force_reindex(self):
2020
'''Force a reindexing of the database. This essentially
2121
empties the tables ids and index and sets a flag so
@@ -51,7 +51,9 @@ def add_text(self, identifier, text, mime_type='text/plain'):
5151
self.db.cursor.execute(sql, (id, ))
5252

5353
# ok, find all the words in the text
54-
wordlist = re.findall(r'\b\w{2,25}\b', str(text).upper())
54+
text = unicode(text, "utf-8", "replace").upper()
55+
wordlist = [w.encode("utf-8", "replace")
56+
for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
5557
words = {}
5658
for word in wordlist:
5759
if is_stopword(word):
@@ -73,7 +75,7 @@ def find(self, wordlist):
7375
'''look up all the words in the wordlist.
7476
If none are found return an empty dictionary
7577
* more rules here
76-
'''
78+
'''
7779
if not wordlist:
7880
return {}
7981

0 commit comments

Comments
 (0)