1- #$Id: indexer_rdbms.py,v 1.9 2005-04-28 00:21:42 richard Exp $
1+ #$Id: indexer_rdbms.py,v 1.10 2005-05-22 17:55:00 a1s Exp $
22''' This implements the full-text indexer over two RDBMS tables. The first
33is a mapping of words to occurrence IDs. The second maps the IDs to (Class,
44propname, itemid) instances.
@@ -21,7 +21,7 @@ def save_index(self):
2121 '''Save the changes to the index.'''
2222 # not necessary - the RDBMS connection will handle this for us
2323 pass
24-
24+
2525 def force_reindex (self ):
2626 '''Force a reindexing of the database. This essentially
2727 empties the tables ids and index and sets a flag so
@@ -57,7 +57,9 @@ def add_text(self, identifier, text, mime_type='text/plain'):
5757 self .db .cursor .execute (sql , (id , ))
5858
5959 # ok, find all the words in the text
60- wordlist = re .findall (r'\b\w{2,25}\b' , str (text ).upper ())
60+ text = unicode (text , "utf-8" , "replace" ).upper ()
61+ wordlist = [w .encode ("utf-8" , "replace" )
62+ for w in re .findall (r'(?u)\b\w{2,25}\b' , text )]
6163 words = {}
6264 for word in wordlist :
6365 if is_stopword (word ):
@@ -79,7 +81,7 @@ def find(self, wordlist):
7981 '''look up all the words in the wordlist.
8082 If none are found return an empty dictionary
8183 * more rules here
82- '''
84+ '''
8385 if not wordlist :
8486 return {}
8587
0 commit comments