Skip to content

Commit 29394ac

Browse files
author
Richard Jones
committed
Features and fixes.
Feature:
- trackers may configure custom stop-words for the full-text indexer

Fixed:
- fixes in scripts/import_sf.py
- fix some unicode bugs in roundup-admin import
- Xapian indexer wasn't actually being used
- fix indexing of message content on roundup-admin import
1 parent c221336 commit 29394ac

File tree

14 files changed

+172
-107
lines changed

14 files changed

+172
-107
lines changed

CHANGES.txt

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
This file contains the changes to the Roundup system over time. The entries
22
are given with the most recent entry first.
33

4-
2006-??-?? 1.0.2
4+
2006-02-06 1.0.2
5+
Feature:
6+
- trackers may configure custom stop-words for the full-text indexer
7+
58
Fixed:
6-
- silly typo in scripts/import_sf.py
9+
- fixes in scripts/import_sf.py
10+
- fix some unicode bugs in roundup-admin import
11+
- Xapian indexer wasn't actually being used
12+
- fix indexing of message content on roundup-admin import
713

814

915
2006-02-03 1.0.1

doc/customizing.txt

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Customising Roundup
33
===================
44

5-
:Version: $Revision: 1.191 $
5+
:Version: $Revision: 1.192 $
66

77
.. This document borrows from the ZopeBook section on ZPT. The original is at:
88
http://www.zope.org/Documentation/Books/ZopeBook/current/ZPT.stx
@@ -140,6 +140,15 @@ Section **main**
140140
email?
141141
Allowed values: ``yes``, ``no``
142142

143+
email_registration_confirmation -- ``yes``
144+
Offer registration confirmation by email or only through the web?
145+
Allowed values: ``yes``, ``no``
146+
147+
indexer_stopwords -- default *blank*
148+
Additional stop-words for the full-text indexer specific to
149+
your tracker. See the indexer source for the default list of
150+
stop-words (e.g. ``A,AND,ARE,AS,AT,BE,BUT,BY, ...``).
151+
143152
Section **tracker**
144153
name -- ``Roundup issue tracker``
145154
A descriptive name for your roundup instance.

roundup/admin.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
1717
# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
1818
#
19-
# $Id: admin.py,v 1.97 2005-12-03 11:26:08 a1s Exp $
19+
# $Id: admin.py,v 1.98 2006-02-06 21:00:44 richard Exp $
2020

2121
'''Administration commands for maintaining Roundup trackers.
2222
'''
@@ -1155,16 +1155,19 @@ class colon_separated(csv.excel):
11551155
file_props = None
11561156
maxid = 1
11571157
# loop through the file and create a node for each entry
1158-
for r in reader:
1158+
for n, r in enumerate(reader):
11591159
if file_props is None:
11601160
file_props = r
11611161
continue
1162+
sys.stdout.write('Importing %s - %d\r'%(classname, n))
1163+
sys.stdout.flush()
1164+
11621165
# do the import and figure the current highest nodeid
11631166
nodeid = int(cl.import_list(file_props, r))
11641167
if hasattr(cl, 'import_files'):
11651168
cl.import_files(dir, nodeid)
11661169
maxid = max(maxid, nodeid)
1167-
1170+
print
11681171
f.close()
11691172

11701173
# import the journals
@@ -1238,7 +1241,7 @@ def do_reindex(self, args, desre=re.compile('([A-Za-z]+)([0-9]+)')):
12381241
cl = self.get_class(arg)
12391242
self.db.reindex(arg)
12401243
else:
1241-
self.db.reindex()
1244+
self.db.reindex(show_progress=True)
12421245
return 0
12431246

12441247
def do_security(self, args):

roundup/backends/back_anydbm.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
1616
# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
1717
#
18-
#$Id: back_anydbm.py,v 1.194 2006-01-24 08:26:09 a1s Exp $
18+
#$Id: back_anydbm.py,v 1.195 2006-02-06 21:00:46 richard Exp $
1919
'''This module defines a backend that saves the hyperdatabase in a
2020
database chosen by anydbm. It is guaranteed to always be available in python
2121
versions >2.1.1 (the dumbdbm fallback in 2.1.1 and earlier has several
@@ -43,8 +43,7 @@
4343
from sessions_dbm import Sessions, OneTimeKeys
4444

4545
try:
46-
# re-enable once Xapian is fixed
47-
from indexer_xapian import Indexer_disabled
46+
from indexer_xapian import Indexer
4847
except ImportError:
4948
from indexer_dbm import Indexer
5049

roundup/backends/back_metakit.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# $Id: back_metakit.py,v 1.103 2006-01-27 05:22:46 richard Exp $
1+
# $Id: back_metakit.py,v 1.104 2006-02-06 21:00:47 richard Exp $
22
'''Metakit backend for Roundup, originally by Gordon McMillan.
33
44
Known Current Bugs:
@@ -1816,6 +1816,14 @@ def import_files(self, dirname, nodeid):
18161816
os.makedirs(os.path.dirname(dest))
18171817
shutil.copyfile(source, dest)
18181818

1819+
mime_type = None
1820+
if self.getprops().has_key('type'):
1821+
mime_type = propvalues.get('type', self.get(itemid, 'type'))
1822+
if not mime_type:
1823+
mime_type = self.default_mime_type
1824+
self.db.indexer.add_text((self.classname, nodeid, 'content'),
1825+
self.get(nodeid, 'content'), mime_type)
1826+
18191827
def get(self, nodeid, propname, default=_marker, cache=1):
18201828
if propname == 'content':
18211829
poss_msg = 'Possibly an access right configuration problem.'
@@ -2078,8 +2086,7 @@ def rollback(self):
20782086
self.changed = 0
20792087

20802088
try:
2081-
# re-enable once Xapian is fixed
2082-
from indexer_xapian import Indexer_disabled
2089+
from indexer_xapian import Indexer
20832090
except ImportError:
20842091
Indexer = MetakitIndexer
20852092

roundup/backends/indexer_common.py

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,29 @@
1-
#$Id: indexer_common.py,v 1.4 2005-01-08 16:16:59 jlgijsbers Exp $
2-
import re
1+
#$Id: indexer_common.py,v 1.5 2006-02-06 21:00:47 richard Exp $
2+
import re, sets
33

44
from roundup import hyperdb
55

6-
stopwords = [
7-
"A", "AND", "ARE", "AS", "AT", "BE", "BUT", "BY",
8-
"FOR", "IF", "IN", "INTO", "IS", "IT",
9-
"NO", "NOT", "OF", "ON", "OR", "SUCH",
10-
"THAT", "THE", "THEIR", "THEN", "THERE", "THESE",
11-
"THEY", "THIS", "TO", "WAS", "WILL", "WITH"
6+
STOPWORDS = [
7+
"A", "AND", "ARE", "AS", "AT", "BE", "BUT", "BY",
8+
"FOR", "IF", "IN", "INTO", "IS", "IT",
9+
"NO", "NOT", "OF", "ON", "OR", "SUCH",
10+
"THAT", "THE", "THEIR", "THEN", "THERE", "THESE",
11+
"THEY", "THIS", "TO", "WAS", "WILL", "WITH"
1212
]
1313

14-
is_stopword = {}
15-
for word in stopwords:
16-
is_stopword[word] = None
17-
is_stopword = is_stopword.has_key
18-
1914
def _isLink(propclass):
2015
return (isinstance(propclass, hyperdb.Link) or
2116
isinstance(propclass, hyperdb.Multilink))
2217

2318
class Indexer:
19+
def __init__(self, db):
20+
self.stopwords = sets.Set(STOPWORDS)
21+
for word in db.config[('main', 'indexer_stopwords')]:
22+
self.stopwords.add(word)
23+
24+
def is_stopword(self, word):
25+
return word in self.stopwords
26+
2427
def getHits(self, search_terms, klass):
2528
return self.find(search_terms)
2629

roundup/backends/indexer_dbm.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# that promote freedom, but obviously am giving up any rights
1515
# to compel such.
1616
#
17-
#$Id: indexer_dbm.py,v 1.6 2005-04-28 00:21:42 richard Exp $
17+
#$Id: indexer_dbm.py,v 1.7 2006-02-06 21:00:47 richard Exp $
1818
'''This module provides an indexer class, RoundupIndexer, that stores text
1919
indices in a roundup instance. This class makes searching the content of
2020
messages, string properties and text files possible.
@@ -23,9 +23,9 @@
2323

2424
import os, shutil, re, mimetypes, marshal, zlib, errno
2525
from roundup.hyperdb import Link, Multilink
26-
from roundup.backends.indexer_common import Indexer, is_stopword
26+
from roundup.backends.indexer_common import Indexer as IndexerBase
2727

28-
class Indexer(Indexer):
28+
class Indexer(IndexerBase):
2929
'''Indexes information from roundup's hyperdb to allow efficient
3030
searching.
3131
@@ -38,6 +38,7 @@ class Indexer(Indexer):
3838
where identifier is (classname, nodeid, propertyname)
3939
'''
4040
def __init__(self, db):
41+
IndexerBase.__init__(self, db)
4142
self.indexdb_path = os.path.join(db.config.DATABASE, 'indexes')
4243
self.indexdb = os.path.join(self.indexdb_path, 'index.db')
4344
self.reindex = 0
@@ -96,7 +97,7 @@ def add_text(self, identifier, text, mime_type='text/plain'):
9697
# find the unique words
9798
filedict = {}
9899
for word in words:
99-
if is_stopword(word):
100+
if self.is_stopword(word):
100101
continue
101102
if filedict.has_key(word):
102103
filedict[word] = filedict[word]+1

roundup/backends/indexer_rdbms.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
1-
#$Id: indexer_rdbms.py,v 1.11 2005-09-28 05:42:23 richard Exp $
1+
#$Id: indexer_rdbms.py,v 1.12 2006-02-06 21:00:47 richard Exp $
22
''' This implements the full-text indexer over two RDBMS tables. The first
33
is a mapping of words to occurrence IDs. The second maps the IDs to (Class,
44
propname, itemid) instances.
55
'''
66
import re
77

8-
from indexer_common import Indexer, is_stopword
8+
from roundup.backends.indexer_common import Indexer as IndexerBase
99

10-
class Indexer(Indexer):
10+
class Indexer(IndexerBase):
1111
def __init__(self, db):
12+
IndexerBase.__init__(self, db)
1213
self.db = db
1314
self.reindex = 0
1415

@@ -62,7 +63,7 @@ def add_text(self, identifier, text, mime_type='text/plain'):
6263
for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
6364
words = {}
6465
for word in wordlist:
65-
if is_stopword(word): continue
66+
if self.is_stopword(word): continue
6667
if len(word) > 25: continue
6768
words[word] = 1
6869
words = words.keys()

roundup/backends/indexer_xapian.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
1-
#$Id: indexer_xapian.py,v 1.1 2005-04-28 00:21:42 richard Exp $
1+
#$Id: indexer_xapian.py,v 1.2 2006-02-06 21:00:47 richard Exp $
22
''' This implements the full-text indexer using the Xapian indexer.
33
'''
44
import re, os
55

66
import xapian
77

8-
from indexer_common import Indexer, is_stopword
8+
from roundup.backends.indexer_common import Indexer as IndexerBase
99

1010
# TODO: we need to delete documents when a property is *reindexed*
1111

12-
class Indexer(Indexer):
12+
class Indexer(IndexerBase):
1313
def __init__(self, db):
14+
IndexerBase.__init__(self, db)
1415
self.db_path = db.config.DATABASE
1516
self.reindex = 0
1617
self.transaction_active = False
@@ -63,7 +64,7 @@ def add_text(self, identifier, text, mime_type='text/plain'):
6364
doc.set_data('%s:%s:%s'%identifier)
6465
for match in re.finditer(r'\b\w{2,25}\b', text.upper()):
6566
word = match.group(0)
66-
if is_stopword(word):
67+
if self.is_stopword(word):
6768
continue
6869
term = stemmer.stem_word(word)
6970
doc.add_posting(term, match.start(0))

roundup/backends/rdbms_common.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# $Id: rdbms_common.py,v 1.164 2006-01-30 00:36:26 richard Exp $
1+
# $Id: rdbms_common.py,v 1.165 2006-02-06 21:00:47 richard Exp $
22
''' Relational database (SQL) backend common code.
33
44
Basics:
@@ -32,16 +32,15 @@
3232
import sys, os, time, re, errno, weakref, copy, logging
3333

3434
# roundup modules
35-
from roundup import hyperdb, date, password, roundupdb, security
35+
from roundup import hyperdb, date, password, roundupdb, security, support
3636
from roundup.hyperdb import String, Password, Date, Interval, Link, \
3737
Multilink, DatabaseError, Boolean, Number, Node
3838
from roundup.backends import locking
3939

4040
# support
4141
from blobfiles import FileStorage
4242
try:
43-
# re-enable once Xapian is fixed
44-
from indexer_xapian import Indexer_disabled
43+
from indexer_xapian import Indexer
4544
except ImportError:
4645
from indexer_rdbms import Indexer
4746
from sessions_rdbms import Sessions, OneTimeKeys
@@ -317,14 +316,19 @@ def refresh_database(self):
317316
self.post_init()
318317

319318

320-
def reindex(self, classname=None):
319+
def reindex(self, classname=None, show_progress=False):
321320
if classname:
322321
classes = [self.getclass(classname)]
323322
else:
324323
classes = self.classes.values()
325324
for klass in classes:
326-
for nodeid in klass.list():
327-
klass.index(nodeid)
325+
if show_progress:
326+
for nodeid in support.Progress('Reindex %s'%klass.classname,
327+
klass.list()):
328+
klass.index(nodeid)
329+
else:
330+
for nodeid in klass.list():
331+
klass.index(nodeid)
328332
self.indexer.save_index()
329333

330334
hyperdb_to_sql_datatypes = {
@@ -2371,15 +2375,17 @@ def import_list(self, propnames, proplist):
23712375
pwd = password.Password()
23722376
pwd.unpack(value)
23732377
value = pwd
2374-
d[propname] = value
2375-
if isinstance(prop, String):
2376-
if type(value) != type('') and type(value) != type(u''):
2378+
elif isinstance(prop, String):
2379+
if isinstance(value, unicode):
2380+
value = value.encode('utf8')
2381+
if not isinstance(value, str):
23772382
raise TypeError, \
23782383
'new property "%(propname)s" not a string: %(value)r' \
23792384
% locals()
23802385
if prop.indexme:
23812386
self.db.indexer.add_text((self.classname, newid, propname),
23822387
value)
2388+
d[propname] = value
23832389

23842390
# get a new id if necessary
23852391
if newid is None:

0 commit comments

Comments
 (0)