Commit 0a0b817

Bernhard Reiter authored and committed

Indexers behaviour made more consistent regarding length of indexed words
and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter)(issue 2550584)
1 parent 5d96fb9 commit 0a0b817

5 files changed, 27 additions and 16 deletions

CHANGES.txt  (2 additions, 1 deletion)

@@ -4,7 +4,8 @@ are given with the most recent entry first.
 2009-xx-xx 1.4.X
 
 Fixes:
-
+- Indexers behaviour made more consistent regarding length of indexed words
+  and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter)(issue 2550584)
 - fixed typos in the installation instructions (thanks Thomas Arendsen Hein)
   (issue 2550573)

roundup/backends/indexer_common.py  (4 additions, 0 deletions)

@@ -22,6 +22,10 @@ def __init__(self, db):
         self.stopwords = set(STOPWORDS)
         for word in db.config[('main', 'indexer_stopwords')]:
             self.stopwords.add(word)
+        # Do not index anything longer than 25 characters since that'll be
+        # gibberish (encoded text or somesuch) or shorter than 2 characters
+        self.minlength = 2
+        self.maxlength = 25
 
     def is_stopword(self, word):
         return word in self.stopwords
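For orientation, here is a minimal, self-contained sketch of how the shared
minlength/maxlength attributes and is_stopword() added above are meant to be
combined by the backends. It is not part of the commit; the indexable_words
helper and the example stopword list are hypothetical.

import re

class Indexer:
    # Simplified stand-in for roundup.backends.indexer_common.Indexer
    def __init__(self, stopwords=()):
        self.stopwords = set(stopwords)
        # Do not index anything longer than 25 characters since that'll be
        # gibberish (encoded text or somesuch) or shorter than 2 characters
        self.minlength = 2
        self.maxlength = 25

    def is_stopword(self, word):
        return word in self.stopwords

    def indexable_words(self, text):
        # hypothetical helper: split exactly like the backends now do,
        # using the shared bounds, then drop stopwords
        pattern = r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength)
        return [w for w in re.findall(pattern, str(text).upper())
                if not self.is_stopword(w)]

indexer = Indexer(stopwords=('THE', 'AND'))
print(indexer.indexable_words('the quick brown fox and a ' + 'x' * 30))
# -> ['QUICK', 'BROWN', 'FOX']  (stopwords, 1-char and 30-char words dropped)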

roundup/backends/indexer_dbm.py  (9 additions, 9 deletions)

@@ -135,14 +135,12 @@ def text_splitter(self, text):
         # case insensitive
         text = str(text).upper()
 
-        # Split the raw text, losing anything longer than 25 characters
-        # since that'll be gibberish (encoded text or somesuch) or shorter
-        # than 3 characters since those short words appear all over the
-        # place
-        return re.findall(r'\b\w{2,25}\b', text)
-
-    # we override this to ignore not 2 < word < 25 and also to fix a bug -
-    # the (fail) case.
+        # Split the raw text
+        return re.findall(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength),
+                          text)
+
+    # we override this to ignore too short and too long words
+    # and also to fix a bug - the (fail) case.
     def find(self, wordlist):
         '''Locate files that match ALL the words in wordlist
         '''

@@ -152,10 +150,12 @@ def find(self, wordlist):
         entries = {}
         hits = None
         for word in wordlist:
-            if not 2 < len(word) < 25:
+            if not self.minlength <= len(word) <= self.maxlength:
                 # word outside the bounds of what we index - ignore
                 continue
             word = word.upper()
+            if self.is_stopword(word):
+                continue
             entry = self.words.get(word)    # For each word, get index
             entries[word] = entry           #   of matching files
             if not entry:                   # Nothing for this one word (fail)
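Note that the old query-side check (not 2 < len(word) < 25) was stricter than
the splitter's \b\w{2,25}\b pattern, so 2-character and 25-character words
could be indexed but never found; the inclusive comparison above makes both
sides agree. A quick interpreter check of the interpolated pattern, using the
default bounds set in indexer_common.py above:

>>> import re
>>> minlength, maxlength = 2, 25
>>> pattern = r'\b\w{%d,%d}\b' % (minlength, maxlength)
>>> pattern
'\\b\\w{2,25}\\b'
>>> re.findall(pattern, 'GO HELLO ' + 'X' * 26)
['GO', 'HELLO']
>>> not 2 < len('GO') < 25      # the old find() test would have skipped 'GO'
True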

roundup/backends/indexer_rdbms.py  (5 additions, 3 deletions)

@@ -66,11 +66,11 @@ def add_text(self, identifier, text, mime_type='text/plain'):
         # ok, find all the unique words in the text
         text = unicode(text, "utf-8", "replace").upper()
         wordlist = [w.encode("utf-8")
-            for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
+            for w in re.findall(r'(?u)\b\w{%d,%d}\b'
+                                % (self.minlength, self.maxlength), text)]
         words = set()
         for word in wordlist:
             if self.is_stopword(word): continue
-            if len(word) > 25: continue
             words.add(word)
 
         # for each word, add an entry in the db

@@ -86,7 +86,9 @@ def find(self, wordlist):
         if not wordlist:
             return []
 
-        l = [word.upper() for word in wordlist if 26 > len(word) > 2]
+        l = [word.upper() for word in wordlist
+             if self.minlength <= len(word) <= self.maxlength]
+        l = [word for word in l if not self.is_stopword(word)]
 
         if not l:
             return []
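A minimal illustration of the two-step query-side filter used in find() above;
the word list and stopword set are made-up example values:

>>> stopwords = set(['THE', 'AND'])
>>> minlength, maxlength = 2, 25
>>> wordlist = ['the', 'on', 'indexer', 'a', 'x' * 30]
>>> l = [word.upper() for word in wordlist
...      if minlength <= len(word) <= maxlength]
>>> l = [word for word in l if word not in stopwords]
>>> l
['ON', 'INDEXER']

With this change the same bounds and stopword set apply in both add_text() and
find(), so a term is either indexed and searchable or consistently ignored on
both sides.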

roundup/backends/indexer_xapian.py  (7 additions, 3 deletions)

@@ -88,7 +88,9 @@ def add_text(self, identifier, text, mime_type='text/plain'):
         doc.set_data(identifier)
         doc.add_posting(identifier, 0)
 
-        for match in re.finditer(r'\b\w{2,25}\b', text.upper()):
+        for match in re.finditer(r'\b\w{%d,%d}\b'
+                                 % (self.minlength, self.maxlength),
+                                 text.upper()):
             word = match.group(0)
             if self.is_stopword(word):
                 continue

@@ -112,8 +114,10 @@ def find(self, wordlist):
         enquire = xapian.Enquire(database)
         stemmer = xapian.Stem("english")
         terms = []
-        for term in [word.upper() for word in wordlist if 26 > len(word) > 2]:
-            terms.append(stemmer(term.upper()))
+        for term in [word.upper() for word in wordlist
+                     if self.minlength <= len(word) <= self.maxlength]:
+            if not self.is_stopword(term):
+                terms.append(stemmer(term))
         query = xapian.Query(xapian.Query.OP_AND, terms)
 
         enquire.set_query(query)
