That's the last of the RDBMS migration steps done! Yay!

Richard Jones · Richard Jones · commit cd53fa1eb368 · 2004-03-21T23:39:08.000Z
Note that the code currently has some unit testing issues:

- Metakit needs some attention in a couple of areas
- RDBMS backends are having trouble ordering their journal entries
  correctly. I'm going to be migrating them to use TIMESTAMP for the date
  column, but that's not necessarily going to fix things as mysql and
  postgresql both appear to have second granularity. Sqlite will ignore
  the data type as usual ;)

Next up is the datatype-ification of the RDBMS backends. Part of that will
involve the migration to numeric IDs, which will also be done in the *dbm
backends (already done in metakit). The ID exposed *above* the hyperdb
will be String, since so many things assume a string ID now.
diff --git a/TODO.txt b/TODO.txt
@@ -1,10 +1,9 @@
 This file has been re-purposed to contain specifically the items that need
 doing before the next release:
 
-- indexing in RDBMSes
-- add tests for group-by-multilink so I finally implement it for the RDBMSes
-- s/getnode/getitem in backends (and s/Node/Item)
 - have rdbms backends look up the journal for actor if it's not set
-- ensure index creation is triggered by the version 1->2 update
-
+- migrate rdbms backends to use typed columns
 - migrate to numeric ID values (fixes bug 817217)
+
+- ensure index creation is triggered by the version 1->2 update
+  (and other upgrade tests)
diff --git a/roundup/admin.py b/roundup/admin.py
@@ -16,7 +16,7 @@
 # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
 # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
 # 
-# $Id: admin.py,v 1.62 2004-02-11 23:55:08 richard Exp $
+# $Id: admin.py,v 1.63 2004-03-21 23:39:08 richard Exp $
 
 '''Administration commands for maintaining Roundup trackers.
 '''
@@ -748,7 +748,7 @@ def do_create(self, args, pwre = re.compile(r'{(\w+)}(.+)')):
         # convert types
         for propname, value in props.items():
             try:
-                props[key] = hyperdb.rawToHyperdb(self.db, cl, None,
+                props[propname] = hyperdb.rawToHyperdb(self.db, cl, None,
                     propname, value)
             except hyperdb.HyperdbValueError, message:
                 raise UsageError, message
diff --git a/roundup/backends/back_metakit.py b/roundup/backends/back_metakit.py
@@ -1,4 +1,4 @@
-# $Id: back_metakit.py,v 1.65 2004-03-19 05:27:55 richard Exp $
+# $Id: back_metakit.py,v 1.66 2004-03-21 23:39:08 richard Exp $
 '''Metakit backend for Roundup, originally by Gordon McMillan.
 
 Known Current Bugs:
@@ -1873,12 +1873,12 @@ def add_text(self, identifier, text, mime_type='text/plain'):
             ids[oldpos].ignore = 1
             self.changed = 1
         pos = ids.append(tblid=tblid,nodeid=nodeid,propid=propid)
-        
+
         wordlist = re.findall(r'\b\w{2,25}\b', text.upper())
         words = {}
         for word in wordlist:
-	    if not self.disallows.has_key(word):
-            	words[word] = 1
+            if not self.disallows.has_key(word):
+                words[word] = 1
         words = words.keys()
         
         index = self.db.view('index').ordered(1)
diff --git a/roundup/backends/back_mysql.py b/roundup/backends/back_mysql.py
@@ -124,14 +124,29 @@ def open_connection(self):
             self.create_version_2_tables()
 
     def create_version_2_tables(self):
+        # OTK store
         self.cursor.execute('CREATE TABLE otks (otk_key VARCHAR(255), '
-            'otk_value VARCHAR(255), otk_time FLOAT(20))')
+            'otk_value VARCHAR(255), otk_time FLOAT(20)) '
+            'TYPE=%s'%self.mysql_backend)
         self.cursor.execute('CREATE INDEX otks_key_idx ON otks(otk_key)')
+
+        # Sessions store
         self.cursor.execute('CREATE TABLE sessions (session_key VARCHAR(255), '
-            'session_time FLOAT(20), session_value VARCHAR(255))')
+            'session_time FLOAT(20), session_value VARCHAR(255)) '
+            'TYPE=%s'%self.mysql_backend)
         self.cursor.execute('CREATE INDEX sessions_key_idx ON '
             'sessions(session_key)')
 
+        # full-text indexing store
+        self.cursor.execute('CREATE TABLE _textids (_class VARCHAR(255), '
+            '_itemid VARCHAR(255), _prop VARCHAR(255), _textid INT) '
+            'TYPE=%s'%self.mysql_backend)
+        self.cursor.execute('CREATE TABLE _words (_word VARCHAR(30), '
+            '_textid INT) TYPE=%s'%self.mysql_backend)
+        self.cursor.execute('CREATE INDEX words_word_ids ON _words(_word)')
+        sql = 'insert into ids (name, num) values (%s,%s)'%(self.arg, self.arg)
+        self.cursor.execute(sql, ('_textids', 1))
+
     def add_actor_column(self):
         # update existing tables to have the new actor column
         tables = self.database_schema['tables']
diff --git a/roundup/backends/back_postgresql.py b/roundup/backends/back_postgresql.py
@@ -113,14 +113,26 @@ def open_connection(self):
             self.create_version_2_tables()
 
     def create_version_2_tables(self):
+        # OTK store
         self.cursor.execute('CREATE TABLE otks (otk_key VARCHAR(255), '
             'otk_value VARCHAR(255), otk_time FLOAT(20))')
         self.cursor.execute('CREATE INDEX otks_key_idx ON otks(otk_key)')
+
+        # Sessions store
         self.cursor.execute('CREATE TABLE sessions (session_key VARCHAR(255), '
             'session_time FLOAT(20), session_value VARCHAR(255))')
         self.cursor.execute('CREATE INDEX sessions_key_idx ON '
             'sessions(session_key)')
 
+        # full-text indexing store
+        self.cursor.execute('CREATE TABLE _textids (_class VARCHAR(255), '
+            '_itemid VARCHAR(255), _prop VARCHAR(255), _textid INT4) ')
+        self.cursor.execute('CREATE TABLE _words (_word VARCHAR(30), '
+            '_textid INT4)')
+        self.cursor.execute('CREATE INDEX words_word_ids ON _words(_word)')
+        sql = 'insert into ids (name, num) values (%s,%s)'%(self.arg, self.arg)
+        self.cursor.execute(sql, ('_textids', 1))
+
     def add_actor_column(self):
         # update existing tables to have the new actor column
         tables = self.database_schema['tables']
diff --git a/roundup/backends/back_sqlite.py b/roundup/backends/back_sqlite.py
@@ -1,4 +1,4 @@
-# $Id: back_sqlite.py,v 1.17 2004-03-18 01:58:45 richard Exp $
+# $Id: back_sqlite.py,v 1.18 2004-03-21 23:39:08 richard Exp $
 '''Implements a backend for SQLite.
 
 See https://pysqlite.sourceforge.net/ for pysqlite info
@@ -65,6 +65,15 @@ def create_version_2_tables(self):
         self.cursor.execute('create index sessions_key_idx on '
                 'sessions(session_key)')
 
+        # full-text indexing store
+        self.cursor.execute('CREATE TABLE _textids (_class varchar, '
+            '_itemid varchar, _prop varchar, _textid integer) ')
+        self.cursor.execute('CREATE TABLE _words (_word varchar, '
+            '_textid integer)')
+        self.cursor.execute('CREATE INDEX words_word_ids ON _words(_word)')
+        sql = 'insert into ids (name, num) values (%s,%s)'%(self.arg, self.arg)
+        self.cursor.execute(sql, ('_textids', 1))
+
     def add_actor_column(self):
         # update existing tables to have the new actor column
         tables = self.database_schema['tables']
diff --git a/roundup/backends/indexer_rdbms.py b/roundup/backends/indexer_rdbms.py
@@ -0,0 +1,104 @@
+''' This implements the full-text indexer over two RDBMS tables. The first
+is a mapping of words to occurance IDs. The second maps the IDs to (Class,
+propname, itemid) instances.
+'''
+import re
+
+from indexer_dbm import Indexer
+
+class Indexer(Indexer):
+    disallows = {'THE':1, 'THIS':1, 'ZZZ':1, 'THAT':1, 'WITH':1}
+    def __init__(self, db):
+        self.db = db
+        self.reindex = 0
+
+    def close(self):
+        '''close the indexing database'''
+        # just nuke the circular reference
+        self.db = None
+  
+    def force_reindex(self):
+        '''Force a reindexing of the database.  This essentially
+        empties the tables ids and index and sets a flag so
+        that the databases are reindexed'''
+        self.reindex = 1
+
+    def should_reindex(self):
+        '''returns True if the indexes need to be rebuilt'''
+        return self.reindex
+
+    def add_text(self, identifier, text, mime_type='text/plain'):
+        ''' "identifier" is  (classname, itemid, property) '''
+        if mime_type != 'text/plain':
+            return
+
+        # first, find the id of the (classname, itemid, property)
+        a = self.db.arg
+        sql = 'select _textid from _textids where _class=%s and '\
+            '_itemid=%s and _prop=%s'%(a, a, a)
+        self.db.cursor.execute(sql, identifier)
+        r = self.db.cursor.fetchone()
+        if not r:
+            id = self.db.newid('_textids')
+            sql = 'insert into _textids (_textid, _class, _itemid, _prop)'\
+                ' values (%s, %s, %s, %s)'%(a, a, a, a)
+            self.db.cursor.execute(sql, (id, ) + identifier)
+        else:
+            id = int(r[0])
+            # clear out any existing indexed values
+            sql = 'delete from _words where _textid=%s'%a
+            self.db.cursor.execute(sql, (id, ))
+
+        # ok, find all the words in the text
+        wordlist = re.findall(r'\b\w{2,25}\b', str(text).upper())
+        words = {}
+        for word in wordlist:
+            if not self.disallows.has_key(word):
+                words[word] = 1
+        words = words.keys()
+
+        # for each word, add an entry in the db
+        for word in words:
+            # don't dupe
+            sql = 'select * from _words where _word=%s and _textid=%s'%(a, a)
+            self.db.cursor.execute(sql, (word, id))
+            if self.db.cursor.fetchall():
+                continue
+            sql = 'insert into _words (_word, _textid) values (%s, %s)'%(a, a)
+            self.db.cursor.execute(sql, (word, id))
+
+    def find(self, wordlist):
+        '''look up all the words in the wordlist.
+        If none are found return an empty dictionary
+        * more rules here
+        '''        
+        l = [word.upper() for word in wordlist if 26 > len(word) > 2]
+
+        a = ','.join([self.db.arg] * len(l))
+        sql = 'select distinct(_textid) from _words where _word in (%s)'%a
+        self.db.cursor.execute(sql, tuple(l))
+        r = self.db.cursor.fetchall()
+        if not r:
+            return {}
+        a = ','.join([self.db.arg] * len(r))
+        sql = 'select _class, _itemid, _prop from _textids '\
+            'where _textid in (%s)'%a
+        self.db.cursor.execute(sql, tuple([int(id) for (id,) in r]))
+        # self.search_index has the results as {some id: identifier} ...
+        # sigh
+        r = {}
+        k = 0
+        for c,n,p in self.db.cursor.fetchall():
+            key = (str(c), str(n), str(p))
+            r[k] = key
+            k += 1
+        return r
+
+    def save_index(self):
+        # the normal RDBMS backend transaction mechanisms will handle this
+        pass
+
+    def rollback(self):
+        # the normal RDBMS backend transaction mechanisms will handle this
+        pass
+
diff --git a/roundup/backends/rdbms_common.py b/roundup/backends/rdbms_common.py
@@ -1,4 +1,4 @@
-# $Id: rdbms_common.py,v 1.82 2004-03-19 04:47:59 richard Exp $
+# $Id: rdbms_common.py,v 1.83 2004-03-21 23:39:08 richard Exp $
 ''' Relational database (SQL) backend common code.
 
 Basics:
@@ -39,7 +39,7 @@
 
 # support
 from blobfiles import FileStorage
-from indexer_dbm import Indexer
+from indexer_rdbms import Indexer
 from sessions_rdbms import Sessions, OneTimeKeys
 from roundup.date import Range
 
@@ -59,7 +59,7 @@ def __init__(self, config, journaltag=None):
         self.config, self.journaltag = config, journaltag
         self.dir = config.DATABASE
         self.classes = {}
-        self.indexer = Indexer(self.dir)
+        self.indexer = Indexer(self)
         self.security = security.Security(self)
 
         # additional transaction support for external files and the like
@@ -177,7 +177,7 @@ def post_init(self):
             self.reindex()
 
         # commit
-        self.conn.commit()
+        self.sql_commit()
 
     # update this number when we need to make changes to the SQL structure
     # of the backen database
@@ -591,7 +591,7 @@ def newid(self, classname):
         if __debug__:
             print >>hyperdb.DEBUG, 'newid', (self, sql, classname)
         self.cursor.execute(sql, (classname, ))
-        newid = self.cursor.fetchone()[0]
+        newid = int(self.cursor.fetchone()[0])
 
         # update the counter
         sql = 'update ids set num=%s where name=%s'%(self.arg, self.arg)
@@ -1066,6 +1066,7 @@ def sql_close(self):
     def close(self):
         ''' Close off the connection.
         '''
+        self.indexer.close()
         self.sql_close()
 
 #