Skip to content

Commit b08e58b

Browse files
author
Richard Jones
committed
merge from HEAD
1 parent 92e3f6a commit b08e58b

File tree

4 files changed

+25
-7
lines changed

4 files changed

+25
-7
lines changed

CHANGES.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ Fixed:
1111
- applied patch fixing some form handling issues in ZRoundup (thanks Chris
1212
Withers)
1313
- enforce View Permission when serving file content (sf bug 1050470)
14+
- don't index common words (sf bug 1046612)
1415

1516

1617
2004-10-15 0.7.8

roundup/backends/indexer_dbm.py

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
# that promote freedom, but obviously am giving up any rights
1515
# to compel such.
1616
#
17-
#$Id: indexer_dbm.py,v 1.1 2004-03-19 04:47:59 richard Exp $
17+
#$Id: indexer_dbm.py,v 1.1.2.1 2004-11-05 05:11:25 richard Exp $
1818
'''This module provides an indexer class, RoundupIndexer, that stores text
1919
indices in a roundup instance. This class makes searching the content of
2020
messages, string properties and text files possible.
@@ -24,6 +24,18 @@
2424
import os, shutil, re, mimetypes, marshal, zlib, errno
2525
from roundup.hyperdb import Link, Multilink
2626

27+
stopwords = [
28+
"A", "AND", "ARE", "AS", "AT", "BE", "BUT", "BY",
29+
"FOR", "IF", "IN", "INTO", "IS", "IT",
30+
"NO", "NOT", "OF", "ON", "OR", "SUCH",
31+
"THAT", "THE", "THEIR", "THEN", "THERE", "THESE",
32+
"THEY", "THIS", "TO", "WAS", "WILL", "WITH"
33+
]
34+
is_stopword = {}
35+
for word in stopwords:
36+
is_stopword[word] = None
37+
is_stopword = is_stopword.has_key
38+
2739
class Indexer:
2840
'''Indexes information from roundup's hyperdb to allow efficient
2941
searching.
@@ -95,6 +107,8 @@ def add_text(self, identifier, text, mime_type='text/plain'):
95107
# find the unique words
96108
filedict = {}
97109
for word in words:
110+
if is_stopword(word):
111+
continue
98112
if filedict.has_key(word):
99113
filedict[word] = filedict[word]+1
100114
else:

roundup/backends/indexer_rdbms.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4,10 +4,9 @@
44
'''
55
import re
66

7-
from indexer_dbm import Indexer
7+
from indexer_dbm import Indexer, is_stopword
88

99
class Indexer(Indexer):
10-
disallows = {'THE':1, 'THIS':1, 'ZZZ':1, 'THAT':1, 'WITH':1}
1110
def __init__(self, db):
1211
self.db = db
1312
self.reindex = 0
@@ -55,8 +54,9 @@ def add_text(self, identifier, text, mime_type='text/plain'):
5554
wordlist = re.findall(r'\b\w{2,25}\b', str(text).upper())
5655
words = {}
5756
for word in wordlist:
58-
if not self.disallows.has_key(word):
59-
words[word] = 1
57+
if is_stopword(word):
58+
continue
59+
words[word] = 1
6060
words = words.keys()
6161

6262
# for each word, add an entry in the db

test/db_test_base.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
1616
# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
1717
#
18-
# $Id: db_test_base.py,v 1.27.2.11 2004-10-08 00:21:31 richard Exp $
18+
# $Id: db_test_base.py,v 1.27.2.12 2004-11-05 05:11:25 richard Exp $
1919

2020
import unittest, os, shutil, errno, imp, sys, time, pprint
2121

@@ -654,7 +654,7 @@ def testIndexerSearching(self):
654654
f2 = self.db.file.create(content='world', type="text/frozz",
655655
comment='blah blah')
656656
i1 = self.db.issue.create(files=[f1, f2], title="flebble plop")
657-
i2 = self.db.issue.create(title="flebble frooz")
657+
i2 = self.db.issue.create(title="flebble the frooz")
658658
self.db.commit()
659659
self.assertEquals(self.db.indexer.search(['hello'], self.db.issue),
660660
{i1: {'files': [f1]}})
@@ -664,6 +664,9 @@ def testIndexerSearching(self):
664664
self.assertEquals(self.db.indexer.search(['flebble'], self.db.issue),
665665
{i1: {}, i2: {}})
666666

667+
# unindexed stopword
668+
self.assertEquals(self.db.indexer.search(['the'], self.db.issue), {})
669+
667670
def testReindexing(self):
668671
search = self.db.indexer.search
669672
issue = self.db.issue

0 commit comments

Comments
 (0)