|
1 | | -#$Id: indexer_rdbms.py,v 1.12 2006-02-06 21:00:47 richard Exp $ |
| 1 | +#$Id: indexer_rdbms.py,v 1.13 2006-04-27 06:33:18 richard Exp $ |
2 | 2 | ''' This implements the full-text indexer over two RDBMS tables. The first |
3 | 3 | is a mapping of words to occurrence IDs. The second maps the IDs to (Class, |
4 | 4 | propname, itemid) instances. |
5 | 5 | ''' |
6 | | -import re |
| 6 | +import re, sets |
7 | 7 |
|
8 | 8 | from roundup.backends.indexer_common import Indexer as IndexerBase |
9 | 9 |
|
@@ -45,38 +45,31 @@ def add_text(self, identifier, text, mime_type='text/plain'): |
45 | 45 | self.db.cursor.execute(sql, identifier) |
46 | 46 | r = self.db.cursor.fetchone() |
47 | 47 | if not r: |
| 48 | + # not previously indexed |
48 | 49 | id = self.db.newid('__textids') |
49 | 50 | sql = 'insert into __textids (_textid, _class, _itemid, _prop)'\ |
50 | 51 | ' values (%s, %s, %s, %s)'%(a, a, a, a) |
51 | 52 | self.db.cursor.execute(sql, (id, ) + identifier) |
52 | | - self.db.cursor.execute('select max(_textid) from __textids') |
53 | | - id = self.db.cursor.fetchone()[0] |
54 | 53 | else: |
55 | 54 | id = int(r[0]) |
56 | 55 | # clear out any existing indexed values |
57 | 56 | sql = 'delete from __words where _textid=%s'%a |
58 | 57 | self.db.cursor.execute(sql, (id, )) |
59 | 58 |
|
60 | | - # ok, find all the words in the text |
| 59 | + # ok, find all the unique words in the text |
61 | 60 | text = unicode(text, "utf-8", "replace").upper() |
62 | 61 | wordlist = [w.encode("utf-8", "replace") |
63 | 62 | for w in re.findall(r'(?u)\b\w{2,25}\b', text)] |
64 | | - words = {} |
| 63 | + words = sets.Set() |
65 | 64 | for word in wordlist: |
66 | 65 | if self.is_stopword(word): continue |
67 | 66 | if len(word) > 25: continue |
68 | | - words[word] = 1 |
69 | | - words = words.keys() |
| 67 | + words.add(word) |
70 | 68 |
|
71 | 69 | # for each word, add an entry in the db |
72 | | - for word in words: |
73 | | - # don't dupe |
74 | | - sql = 'select * from __words where _word=%s and _textid=%s'%(a, a) |
75 | | - self.db.cursor.execute(sql, (word, id)) |
76 | | - if self.db.cursor.fetchall(): |
77 | | - continue |
78 | | - sql = 'insert into __words (_word, _textid) values (%s, %s)'%(a, a) |
79 | | - self.db.cursor.execute(sql, (word, id)) |
| 70 | + sql = 'insert into __words (_word, _textid) values (%s, %s)'%(a, a) |
| 71 | + words = [(word, id) for word in words] |
| 72 | + self.db.cursor.execute(sql, words) |
80 | 73 |
|
81 | 74 | def find(self, wordlist): |
82 | 75 | '''look up all the words in the wordlist. |
|
0 commit comments