1414# that promote freedom, but obviously am giving up any rights
1515# to compel such.
1616#
17- #$Id: indexer.py,v 1.4 2002-07-09 03:02:52 richard Exp $
17+ #$Id: indexer.py,v 1.5 2002-07-09 04:19:09 richard Exp $
1818'''
1919This module provides an indexer class, RoundupIndexer, that stores text
2020indices in a roundup instance. This class makes searching the content of
2525class Indexer :
2626 ''' Indexes information from roundup's hyperdb to allow efficient
2727 searching.
28+
29+ Three structures are created by the indexer:
30+ files {identifier: (fileid, wordcount)}
31+ words {word: {fileid: count}}
32+ fileids {fileid: identifier}
2833 '''
2934 def __init__ (self , db_path ):
30- indexdb_path = os .path .join (db_path , 'indexes' )
31- self .indexdb = os .path .join (indexdb_path , 'index.db' )
35+ self . indexdb_path = os .path .join (db_path , 'indexes' )
36+ self .indexdb = os .path .join (self . indexdb_path , 'index.db' )
3237 self .reindex = 0
3338 self .casesensitive = 0
3439 self .quiet = 9
3540
3641 # see if we need to reindex because of a change in code
37- if (not os .path .exists (indexdb_path ) or
38- not os .path .exists (os .path .join (indexdb_path , 'version' ))):
42+ if (not os .path .exists (self . indexdb_path ) or
43+ not os .path .exists (os .path .join (self . indexdb_path , 'version' ))):
3944 # TODO: if the version file exists (in the future) we'll want to
4045 # check the value in it - for now the file itself is a flag
41- if os .path .exists (indexdb_path ):
42- shutil .rmtree (indexdb_path )
43- os .makedirs (indexdb_path )
44- os .chmod (indexdb_path , 0775 )
45- open (os .path .join (indexdb_path , 'version' ), 'w' ).write ('1\n ' )
46-
47- # we need to reindex
48- self .reindex = 1
49- else :
50- self .reindex = 0
46+ self .force_reindex ()
47+
48+ def force_reindex (self ):
49+ '''Force a reindex condition
50+ '''
51+ if os .path .exists (self .indexdb_path ):
52+ shutil .rmtree (self .indexdb_path )
53+ os .makedirs (self .indexdb_path )
54+ os .chmod (self .indexdb_path , 0775 )
55+ open (os .path .join (self .indexdb_path , 'version' ), 'w' ).write ('1\n ' )
56+ self .reindex = 1
5157
5258 def should_reindex (self ):
5359 '''Should we reindex?
@@ -61,16 +67,9 @@ def add_text(self, identifier, text, mime_type='text/plain'):
6167 # make sure the index is loaded
6268 self .load_index ()
6369
64- # Is file eligible for (re)indexing?
70+ # remove old entries for this identifier
6571 if self .files .has_key (identifier ):
66- # Reindexing enabled, cleanup dicts
67- if self .reindex :
68- self .purge_entry (identifier , self .files , self .words )
69- else :
70- # DO NOT reindex this file
71- if self .quiet < 5 :
72- print "Not reindexing" , identifier
73- return 0
72+ self .purge_entry (identifier )
7473
7574 # split into words
7675 words = self .splitter (text , mime_type )
@@ -281,26 +280,43 @@ def save_index(self):
281280 pickle_fh .write (zlib .compress (pickle_str ))
282281 os .chmod (filename , 0664 )
283282
284- def purge_entry (self , fname , file_dct , word_dct ):
283+ def purge_entry (self , identifier ):
285284 ''' Remove a file from file index and word index
286285 '''
287- try : # The easy part, cleanup the file index
288- file_index = file_dct [fname ]
289- del file_dct [fname ]
290- except KeyError :
291- pass # We'll assume we only encounter KeyError's
286+ if not self .files .has_key (identifier ):
287+ return
288+
289+ file_index = self .files [identifier ][0 ]
290+ del self .files [identifier ]
291+ del self .fileids [file_index ]
292+
292293 # The much harder part, cleanup the word index
293- for word , occurs in word_dct .items ():
294+ for key , occurs in self . words .items ():
294295 if occurs .has_key (file_index ):
295296 del occurs [file_index ]
296- word_dct [word ] = occurs
297297
298298 def index_loaded (self ):
299299 return (hasattr (self ,'fileids' ) and hasattr (self ,'files' ) and
300300 hasattr (self ,'words' ))
301301
302302#
303303#$Log: not supported by cvs2svn $
304+ #Revision 1.4 2002/07/09 03:02:52 richard
305+ #More indexer work:
306+ #- all String properties may now be indexed too. Currently there's a bit of
307+ # "issue" specific code in the actual searching which needs to be
308+ # addressed. In a nutshell:
309+ # + pass 'indexme="yes"' as a String() property initialisation arg, eg:
310+ # file = FileClass(db, "file", name=String(), type=String(),
311+ # comment=String(indexme="yes"))
312+ # + the comment will then be indexed and be searchable, with the results
313+ # related back to the issue that the file is linked to
314+ #- as a result of this work, the FileClass has a default MIME type that may
315+ # be overridden in a subclass, or by the use of a "type" property as is
316+ # done in the default templates.
317+ #- the regeneration of the indexes (if necessary) is done once the schema is
318+ # set up in the dbinit.
319+ #
304320#Revision 1.3 2002/07/08 06:58:15 richard
305321#cleaned up the indexer code:
306322# - it splits more words out (much simpler, faster splitter)
0 commit comments