Skip to content

Commit 446f065

Browse files
author
Alexander Smishlajev
committed
Perform word splitting in Unicode to support national characters
[SF#1195739]
1 parent aabbaa2 commit 446f065

File tree

1 file changed

+6
-4
lines changed

1 file changed

+6
-4
lines changed

roundup/backends/indexer_rdbms.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#$Id: indexer_rdbms.py,v 1.9 2005-04-28 00:21:42 richard Exp $
1+
#$Id: indexer_rdbms.py,v 1.10 2005-05-22 17:55:00 a1s Exp $
22
''' This implements the full-text indexer over two RDBMS tables. The first
33
is a mapping of words to occurrence IDs. The second maps the IDs to (Class,
44
propname, itemid) instances.
@@ -21,7 +21,7 @@ def save_index(self):
2121
'''Save the changes to the index.'''
2222
# not necessary - the RDBMS connection will handle this for us
2323
pass
24-
24+
2525
def force_reindex(self):
2626
'''Force a reindexing of the database. This essentially
2727
empties the tables ids and index and sets a flag so
@@ -57,7 +57,9 @@ def add_text(self, identifier, text, mime_type='text/plain'):
5757
self.db.cursor.execute(sql, (id, ))
5858

5959
# ok, find all the words in the text
60-
wordlist = re.findall(r'\b\w{2,25}\b', str(text).upper())
60+
text = unicode(text, "utf-8", "replace").upper()
61+
wordlist = [w.encode("utf-8", "replace")
62+
for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
6163
words = {}
6264
for word in wordlist:
6365
if is_stopword(word):
@@ -79,7 +81,7 @@ def find(self, wordlist):
7981
'''look up all the words in the wordlist.
8082
If none are found return an empty dictionary
8183
* more rules here
82-
'''
84+
'''
8385
if not wordlist:
8486
return {}
8587

0 commit comments

Comments
 (0)