Commit 0a0b817

Bernhard Reiter authored and committed

Indexers behaviour made more consistent regarding length of indexed words
and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter)(issue 2550584)
1 parent 5d96fb9 commit 0a0b817

5 files changed, 27 additions and 16 deletions

CHANGES.txt  (2 additions, 1 deletion)

@@ -4,7 +4,8 @@ are given with the most recent entry first.
 2009-xx-xx 1.4.X
 
 Fixes:
-
+- Indexers behaviour made more consistent regarding length of indexed words
+  and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter)(issue 2550584)
 - fixed typos in the installation instructions (thanks Thomas Arendsen Hein)
   (issue 2550573)

roundup/backends/indexer_common.py  (4 additions, 0 deletions)

@@ -22,6 +22,10 @@ def __init__(self, db):
         self.stopwords = set(STOPWORDS)
         for word in db.config[('main', 'indexer_stopwords')]:
             self.stopwords.add(word)
+        # Do not index anything longer than 25 characters since that'll be
+        # gibberish (encoded text or somesuch) or shorter than 2 characters
+        self.minlength = 2
+        self.maxlength = 25
 
     def is_stopword(self, word):
         return word in self.stopwords
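For orientation, here is a minimal, self-contained sketch of how the shared
minlength/maxlength attributes and is_stopword() added above are meant to be
combined by the backends. It is not part of the commit; the indexable_words
helper and the example stopword list are hypothetical.

import re

class Indexer:
    # Simplified stand-in for roundup.backends.indexer_common.Indexer
    def __init__(self, stopwords=()):
        self.stopwords = set(stopwords)
        # Do not index anything longer than 25 characters since that'll be
        # gibberish (encoded text or somesuch) or shorter than 2 characters
        self.minlength = 2
        self.maxlength = 25

    def is_stopword(self, word):
        return word in self.stopwords

    def indexable_words(self, text):
        # hypothetical helper: split exactly like the backends now do,
        # using the shared bounds, then drop stopwords
        pattern = r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength)
        return [w for w in re.findall(pattern, str(text).upper())
                if not self.is_stopword(w)]

indexer = Indexer(stopwords=('THE', 'AND'))
print(indexer.indexable_words('the quick brown fox and a ' + 'x' * 30))
# -> ['QUICK', 'BROWN', 'FOX']  (stopwords, 1-char and 30-char words dropped)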

roundup/backends/indexer_dbm.py  (9 additions, 9 deletions)

@@ -135,14 +135,12 @@ def text_splitter(self, text):
         # case insensitive
         text = str(text).upper()
 
-        # Split the raw text, losing anything longer than 25 characters
-        # since that'll be gibberish (encoded text or somesuch) or shorter
-        # than 3 characters since those short words appear all over the
-        # place
-        return re.findall(r'\b\w{2,25}\b', text)
-
-    # we override this to ignore not 2 < word < 25 and also to fix a bug -
-    # the (fail) case.
+        # Split the raw text
+        return re.findall(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength),
+                          text)
+
+    # we override this to ignore too short and too long words
+    # and also to fix a bug - the (fail) case.
     def find(self, wordlist):
         '''Locate files that match ALL the words in wordlist
         '''

@@ -152,10 +150,12 @@ def find(self, wordlist):
         entries = {}
         hits = None
         for word in wordlist:
-            if not 2 < len(word) < 25:
+            if not self.minlength <= len(word) <= self.maxlength:
                 # word outside the bounds of what we index - ignore
                 continue
             word = word.upper()
+            if self.is_stopword(word):
+                continue
             entry = self.words.get(word)    # For each word, get index
             entries[word] = entry           #   of matching files
             if not entry:                   # Nothing for this one word (fail)
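Note that the old query-side check (not 2 < len(word) < 25) was stricter than
the splitter's \b\w{2,25}\b pattern, so 2-character and 25-character words
could be indexed but never found; the inclusive comparison above makes both
sides agree. A quick interpreter check of the interpolated pattern, using the
default bounds set in indexer_common.py above:

>>> import re
>>> minlength, maxlength = 2, 25
>>> pattern = r'\b\w{%d,%d}\b' % (minlength, maxlength)
>>> pattern
'\\b\\w{2,25}\\b'
>>> re.findall(pattern, 'GO HELLO ' + 'X' * 26)
['GO', 'HELLO']
>>> not 2 < len('GO') < 25      # the old find() test would have skipped 'GO'
True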

roundup/backends/indexer_rdbms.py  (5 additions, 3 deletions)

@@ -66,11 +66,11 @@ def add_text(self, identifier, text, mime_type='text/plain'):
         # ok, find all the unique words in the text
         text = unicode(text, "utf-8", "replace").upper()
         wordlist = [w.encode("utf-8")
-            for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
+            for w in re.findall(r'(?u)\b\w{%d,%d}\b'
+                                % (self.minlength, self.maxlength), text)]
         words = set()
         for word in wordlist:
             if self.is_stopword(word): continue
-            if len(word) > 25: continue
             words.add(word)
 
         # for each word, add an entry in the db

@@ -86,7 +86,9 @@ def find(self, wordlist):
         if not wordlist:
             return []
 
-        l = [word.upper() for word in wordlist if 26 > len(word) > 2]
+        l = [word.upper() for word in wordlist
+             if self.minlength <= len(word) <= self.maxlength]
+        l = [word for word in l if not self.is_stopword(word)]
 
         if not l:
             return []
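A minimal illustration of the two-step query-side filter used in find() above;
the word list and stopword set are made-up example values:

>>> stopwords = set(['THE', 'AND'])
>>> minlength, maxlength = 2, 25
>>> wordlist = ['the', 'on', 'indexer', 'a', 'x' * 30]
>>> l = [word.upper() for word in wordlist
...      if minlength <= len(word) <= maxlength]
>>> l = [word for word in l if word not in stopwords]
>>> l
['ON', 'INDEXER']

With this change the same bounds and stopword set apply in both add_text() and
find(), so a term is either indexed and searchable or consistently ignored on
both sides.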

roundup/backends/indexer_xapian.py  (7 additions, 3 deletions)

@@ -88,7 +88,9 @@ def add_text(self, identifier, text, mime_type='text/plain'):
         doc.set_data(identifier)
         doc.add_posting(identifier, 0)
 
-        for match in re.finditer(r'\b\w{2,25}\b', text.upper()):
+        for match in re.finditer(r'\b\w{%d,%d}\b'
+                                 % (self.minlength, self.maxlength),
+                                 text.upper()):
             word = match.group(0)
             if self.is_stopword(word):
                 continue

@@ -112,8 +114,10 @@ def find(self, wordlist):
         enquire = xapian.Enquire(database)
         stemmer = xapian.Stem("english")
         terms = []
-        for term in [word.upper() for word in wordlist if 26 > len(word) > 2]:
-            terms.append(stemmer(term.upper()))
+        for term in [word.upper() for word in wordlist
+                     if self.minlength <= len(word) <= self.maxlength]:
+            if not self.is_stopword(term):
+                terms.append(stemmer(term))
         query = xapian.Query(xapian.Query.OP_AND, terms)
 
         enquire.set_query(query)
