Skip to content

Commit 4de931c

Browse files
author
Richard Jones
committed
don't index common words [SF#1046612]
1 parent 74c5c0a commit 4de931c

File tree

4 files changed

+25
-7
lines changed

4 files changed

+25
-7
lines changed

CHANGES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ Fixed:
5050
- s/Modifed/Modified (thanks donfu)
5151
- applied patch fixing some form handling issues in ZRoundup (sf bug 995565)
5252
- enforce View Permission when serving file content (sf bug 1050470)
53+
- don't index common words (sf bug 1046612)
5354

5455

5556
2004-10-15 0.7.8

roundup/backends/indexer_dbm.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# that promote freedom, but obviously am giving up any rights
1515
# to compel such.
1616
#
17-
#$Id: indexer_dbm.py,v 1.1 2004-03-19 04:47:59 richard Exp $
17+
#$Id: indexer_dbm.py,v 1.2 2004-11-05 05:10:07 richard Exp $
1818
'''This module provides an indexer class, RoundupIndexer, that stores text
1919
indices in a roundup instance. This class makes searching the content of
2020
messages, string properties and text files possible.
@@ -24,6 +24,18 @@
2424
import os, shutil, re, mimetypes, marshal, zlib, errno
2525
from roundup.hyperdb import Link, Multilink
2626

27+
stopwords = [
28+
"A", "AND", "ARE", "AS", "AT", "BE", "BUT", "BY",
29+
"FOR", "IF", "IN", "INTO", "IS", "IT",
30+
"NO", "NOT", "OF", "ON", "OR", "SUCH",
31+
"THAT", "THE", "THEIR", "THEN", "THERE", "THESE",
32+
"THEY", "THIS", "TO", "WAS", "WILL", "WITH"
33+
]
34+
is_stopword = {}
35+
for word in stopwords:
36+
is_stopword[word] = None
37+
is_stopword = is_stopword.has_key
38+
2739
class Indexer:
2840
'''Indexes information from roundup's hyperdb to allow efficient
2941
searching.
@@ -95,6 +107,8 @@ def add_text(self, identifier, text, mime_type='text/plain'):
95107
# find the unique words
96108
filedict = {}
97109
for word in words:
110+
if is_stopword(word):
111+
continue
98112
if filedict.has_key(word):
99113
filedict[word] = filedict[word]+1
100114
else:

roundup/backends/indexer_rdbms.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,9 @@
44
'''
55
import re
66

7-
from indexer_dbm import Indexer
7+
from indexer_dbm import Indexer, is_stopword
88

99
class Indexer(Indexer):
10-
disallows = {'THE':1, 'THIS':1, 'ZZZ':1, 'THAT':1, 'WITH':1}
1110
def __init__(self, db):
1211
self.db = db
1312
self.reindex = 0
@@ -55,8 +54,9 @@ def add_text(self, identifier, text, mime_type='text/plain'):
5554
wordlist = re.findall(r'\b\w{2,25}\b', str(text).upper())
5655
words = {}
5756
for word in wordlist:
58-
if not self.disallows.has_key(word):
59-
words[word] = 1
57+
if is_stopword(word):
58+
continue
59+
words[word] = 1
6060
words = words.keys()
6161

6262
# for each word, add an entry in the db

test/db_test_base.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
1616
# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
1717
#
18-
# $Id: db_test_base.py,v 1.51 2004-10-24 09:57:32 a1s Exp $
18+
# $Id: db_test_base.py,v 1.52 2004-11-05 05:10:07 richard Exp $
1919

2020
import unittest, os, shutil, errno, imp, sys, time, pprint
2121

@@ -684,7 +684,7 @@ def testIndexerSearching(self):
684684
f2 = self.db.file.create(content='world', type="text/frozz",
685685
comment='blah blah')
686686
i1 = self.db.issue.create(files=[f1, f2], title="flebble plop")
687-
i2 = self.db.issue.create(title="flebble frooz")
687+
i2 = self.db.issue.create(title="flebble the frooz")
688688
self.db.commit()
689689
self.assertEquals(self.db.indexer.search(['hello'], self.db.issue),
690690
{i1: {'files': [f1]}})
@@ -694,6 +694,9 @@ def testIndexerSearching(self):
694694
self.assertEquals(self.db.indexer.search(['flebble'], self.db.issue),
695695
{i1: {}, i2: {}})
696696

697+
# unindexed stopword
698+
self.assertEquals(self.db.indexer.search(['the'], self.db.issue), {})
699+
697700
def testReindexing(self):
698701
search = self.db.indexer.search
699702
issue = self.db.issue

0 commit comments

Comments
 (0)