Skip to content

Commit b54ef4f

Browse files
committed
issue2551189 - increase size of words in full text index.
Increased indexed word maxlength to 50 DB migration code is written and tests work. Restructured some tests to allow for code reuse. Docs. If this passes CI without errors 2551189 should be done. However, testing on my system generates errors. Encoding (indexer unicode russian unicode string invalid) and collation errors (utf8_bin not valid) when running under python2. No issues with python3 and I haven't changed code that should cause these since the last successful build in CI. So if this fails in CI we will have more checkins.
1 parent cf48923 commit b54ef4f

File tree

12 files changed

+254
-50
lines changed

12 files changed

+254
-50
lines changed

CHANGES.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,11 @@ Features:
127127
- Add better error display to the user. Needed to expose errors in fts5
128128
search syntax to the user while also displaying the template page
129129
structure. (John Rouillard)
130+
- issue2551189 - increase size of words in full text index.
131+
Many terms (like exception names or symbolic constants) are larger
132+
than 25. Also German words are long. Since there is little chance
133+
of fixing German to shorten their words, change indexer maxlength to
134+
50. (Thomas Hein provided patch; patch reworked John Rouillard)
130135

131136
2021-07-13 2.1.0
132137

doc/upgrading.txt

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,15 @@ Migrating from 2.1.0 to 2.x.y
3838
Rdbms version change from 6 to 7 (required)
3939
-------------------------------------------
4040

41-
To add support for database supported full-text search (FTS), the
42-
rdbms database schema has to be upgraded.
41+
This release includes two changes that require updates to the database
42+
schema:
43+
44+
1. The size of words included in the Roundup FTS indexers have been
45+
increased from 25 to 50. This requires changes to the database
46+
columns used by the native indexer. This also affect the whoosh
47+
and xapian indexers.
48+
2. Some databases that include native full-text search (native-fts
49+
indexer) searching are now supported.
4350

4451
You should run the ``roundup-admin migrate`` command for your
4552
tracker once you've installed the latest codebase.
@@ -52,6 +59,16 @@ updated" (if you've not previously run it on an RDBMS backend) or
5259
"No migration action required" (if you have run it, or have used
5360
another interface to the tracker, or are using anydbm).
5461

62+
See `below for directions on enabling native-fts`_.
63+
64+
.. _below for directions on enabling native-fts: \
65+
#enhanced-full-text-search-optional
66+
67+
The increase in indexed word length also affects whoosh and xapian
68+
backends. You may want to run ``roundup-admin -i tracker_home
69+
reindex`` if you want to index or search for longer words in your full
70+
text searches. Re-indexing make take some time.
71+
5572
Check compression settings (optional)
5673
-------------------------------------
5774

roundup/backends/back_mysql.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,15 @@ def fix_version_5_tables(self):
412412
else:
413413
self.log_info('No changes needed.')
414414

415+
def fix_version_6_tables(self):
416+
# Modify length for __words._word column.
417+
c = self.cursor
418+
sql = "alter table __words change column _word _word varchar(%s)" % (
419+
self.arg)
420+
# Why magic number 5? It was the original offset between
421+
# column length and maxlength.
422+
c.execute(sql, (self.indexer.maxlength + 5,))
423+
415424
def __repr__(self):
416425
return '<myroundsql 0x%x>'%id(self)
417426

roundup/backends/back_postgresql.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,15 @@ def fix_version_3_tables(self):
264264
self.sql('''CREATE INDEX words_both_idx ON public.__words
265265
USING btree (_word, _textid)''')
266266

267+
def fix_version_6_tables(self):
268+
# Modify length for __words._word column.
269+
c = self.cursor
270+
sql = 'alter table __words alter column _word type varchar(%s)' % (
271+
self.arg)
272+
# Why magic number 5? It was the original offset between
273+
# column length and maxlength.
274+
c.execute(sql, (self.indexer.maxlength + 5,))
275+
267276
def add_actor_column(self):
268277
# update existing tables to have the new actor column
269278
tables = self.database_schema['tables']

roundup/backends/back_sqlite.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,8 @@ def _add_fts5_table(self):
219219
)
220220

221221
def fix_version_6_tables(self):
222+
# note sqlite has no limit on column size so v6 fixes
223+
# to __words._word length are not needed.
222224
# Add native full-text indexing table
223225
self._add_fts5_table()
224226

roundup/backends/indexer_common.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,11 @@ def __init__(self, db):
1919
self.stopwords = set(STOPWORDS)
2020
for word in db.config[('main', 'indexer_stopwords')]:
2121
self.stopwords.add(word)
22-
# Do not index anything longer than 25 characters since that'll be
23-
# gibberish (encoded text or somesuch) or shorter than 2 characters
22+
# Do not index anything longer than maxlength characters since
23+
# that'll be gibberish (encoded text or somesuch) or shorter
24+
# than 2 characters
2425
self.minlength = 2
25-
self.maxlength = 25
26+
self.maxlength = 50
2627
self.language = db.config[('main','indexer_language')]
2728
# Some indexers have a query language. If that is the case,
2829
# we don't parse the user supplied query into a wordlist.

roundup/backends/rdbms_common.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -416,9 +416,9 @@ def fix_version_5_tables(self):
416416
pass
417417

418418
def fix_version_6_tables(self):
419-
# Default (used by mysql): NOOP
420-
# sqlite/postgres override this to add fts
421-
# full text search tables.
419+
# Default (used by nobody): NOOP
420+
# Each backend mysql, postgres, sqlite overrides this
421+
# You would think ALTER commands would be the same but nooo.
422422
pass
423423

424424
def _convert_journal_tables(self):

test/test_cgi.py

Lines changed: 90 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,66 @@ def testAddMessageNoEscape(self):
100100
self.assertEqual(cm([],'<i>x</i>\n<b>x</b>',False),
101101
['<i>x</i><br />\n<b>x</b>'])
102102

103-
class FormTestCase(FormTestParent, StringFragmentCmpHelper, unittest.TestCase):
103+
class testCsvExport(object):
104+
105+
def testCSVExport(self):
106+
cl = self._make_client(
107+
{'@columns': 'id,title,status,keyword,assignedto,nosy'},
108+
nodeid=None, userid='1')
109+
cl.classname = 'issue'
110+
111+
demo_id=self.db.user.create(username='demo', address='[email protected]',
112+
roles='User', realname='demo')
113+
key_id1=self.db.keyword.create(name='keyword1')
114+
key_id2=self.db.keyword.create(name='keyword2')
115+
self.db.issue.create(title='foo1', status='2', assignedto='4', nosy=['3',demo_id])
116+
self.db.issue.create(title='bar2', status='1', assignedto='3', keyword=[key_id1,key_id2])
117+
self.db.issue.create(title='baz32', status='4')
118+
output = io.BytesIO()
119+
cl.request = MockNull()
120+
cl.request.wfile = output
121+
# call export version that outputs names
122+
actions.ExportCSVAction(cl).handle()
123+
should_be=(s2b('"id","title","status","keyword","assignedto","nosy"\r\n'
124+
'"1","foo1","deferred","","Contrary, Mary","Bork, Chef;Contrary, Mary;demo"\r\n'
125+
'"2","bar2","unread","keyword1;keyword2","Bork, Chef","Bork, Chef"\r\n'
126+
'"3","baz32","need-eg","","",""\r\n'))
127+
#print(should_be)
128+
print(output.getvalue())
129+
self.assertEqual(output.getvalue(), should_be)
130+
output = io.BytesIO()
131+
cl.request = MockNull()
132+
cl.request.wfile = output
133+
# call export version that outputs id numbers
134+
actions.ExportCSVWithIdAction(cl).handle()
135+
should_be = s2b('"id","title","status","keyword","assignedto","nosy"\r\n'
136+
"\"1\",\"foo1\",\"2\",\"[]\",\"4\",\"['3', '4', '5']\"\r\n"
137+
"\"2\",\"bar2\",\"1\",\"['1', '2']\",\"3\",\"['3']\"\r\n"
138+
'\"3\","baz32",\"4\","[]","None","[]"\r\n')
139+
#print(should_be)
140+
print(output.getvalue())
141+
self.assertEqual(output.getvalue(), should_be)
142+
143+
# test full text search
144+
cl = self._make_client(
145+
{'@columns': 'id,title,status,keyword,assignedto,nosy',
146+
"@search_text": "bar2"}, nodeid=None, userid='1')
147+
cl.classname = 'issue'
148+
output = io.BytesIO()
149+
cl.request = MockNull()
150+
cl.request.wfile = output
151+
152+
# call export version that outputs names
153+
actions.ExportCSVAction(cl).handle()
154+
should_be=(s2b('"id","title","status","keyword","assignedto","nosy"\r\n'
155+
'"2","bar2","unread","keyword1;keyword2","Bork, Chef","Bork, Chef"\r\n'))
156+
157+
# call export version that outputs id numbers
158+
actions.ExportCSVWithIdAction(cl).handle()
159+
should_be = s2b('"id","title","status","keyword","assignedto","nosy"\r\n'
160+
"\"2\",\"bar2\",\"1\",\"['1', '2']\",\"3\",\"['3']\"\r\n")
161+
162+
class FormTestCase(FormTestParent, StringFragmentCmpHelper, testCsvExport, unittest.TestCase):
104163

105164
def setUp(self):
106165
FormTestParent.setUp(self)
@@ -1810,44 +1869,6 @@ def testRoles(self):
18101869
self.db.user.set('1', roles='')
18111870
self.assertTrue(not item.hasRole(''))
18121871

1813-
def testCSVExport(self):
1814-
cl = self._make_client(
1815-
{'@columns': 'id,title,status,keyword,assignedto,nosy'},
1816-
nodeid=None, userid='1')
1817-
cl.classname = 'issue'
1818-
1819-
demo_id=self.db.user.create(username='demo', address='[email protected]',
1820-
roles='User', realname='demo')
1821-
key_id1=self.db.keyword.create(name='keyword1')
1822-
key_id2=self.db.keyword.create(name='keyword2')
1823-
self.db.issue.create(title='foo1', status='2', assignedto='4', nosy=['3',demo_id])
1824-
self.db.issue.create(title='bar2', status='1', assignedto='3', keyword=[key_id1,key_id2])
1825-
self.db.issue.create(title='baz32', status='4')
1826-
output = io.BytesIO()
1827-
cl.request = MockNull()
1828-
cl.request.wfile = output
1829-
# call export version that outputs names
1830-
actions.ExportCSVAction(cl).handle()
1831-
should_be=(s2b('"id","title","status","keyword","assignedto","nosy"\r\n'
1832-
'"1","foo1","deferred","","Contrary, Mary","Bork, Chef;Contrary, Mary;demo"\r\n'
1833-
'"2","bar2","unread","keyword1;keyword2","Bork, Chef","Bork, Chef"\r\n'
1834-
'"3","baz32","need-eg","","",""\r\n'))
1835-
#print(should_be)
1836-
print(output.getvalue())
1837-
self.assertEqual(output.getvalue(), should_be)
1838-
output = io.BytesIO()
1839-
cl.request = MockNull()
1840-
cl.request.wfile = output
1841-
# call export version that outputs id numbers
1842-
actions.ExportCSVWithIdAction(cl).handle()
1843-
should_be = s2b('"id","title","status","keyword","assignedto","nosy"\r\n'
1844-
"\"1\",\"foo1\",\"2\",\"[]\",\"4\",\"['3', '4', '5']\"\r\n"
1845-
"\"2\",\"bar2\",\"1\",\"['1', '2']\",\"3\",\"['3']\"\r\n"
1846-
'\"3\","baz32",\"4\","[]","None","[]"\r\n')
1847-
#print(should_be)
1848-
print(output.getvalue())
1849-
self.assertEqual(output.getvalue(), should_be)
1850-
18511872
def testCSVExportCharset(self):
18521873
cl = self._make_client(
18531874
{'@columns': 'id,title,status,keyword,assignedto,nosy'},
@@ -2364,7 +2385,7 @@ def testTemplateSubdirectory(self):
23642385
r = t.selectTemplate("user", "subdir/item")
23652386
self.assertEqual("subdir/user.item", r)
23662387

2367-
class SqliteNativeFtsCgiTest(unittest.TestCase, testFtsQuery):
2388+
class SqliteNativeFtsCgiTest(unittest.TestCase, testFtsQuery, testCsvExport):
23682389
"""All of the rest of the tests use anydbm as the backend.
23692390
In addtion to normal fts test, this class tests renderError
23702391
when renderContext fails.
@@ -2382,6 +2403,11 @@ def setUp(self):
23822403
# open the database
23832404
self.db = self.instance.open('admin')
23842405
self.db.tx_Source = "web"
2406+
self.db.user.create(username='Chef', address='[email protected]',
2407+
realname='Bork, Chef', roles='User')
2408+
self.db.user.create(username='mary', address='[email protected]',
2409+
roles='User', realname='Contrary, Mary')
2410+
self.db.post_init()
23852411

23862412
# create a client instance and hijack write_html
23872413
self.client = client.Client(self.instance, "user",
@@ -2429,6 +2455,30 @@ def testRenderContextBadFtsQuery(self):
24292455
self.assertEqual(result, expected)
24302456
self.assertEqual(self.client.response_code, 400)
24312457

2458+
#
2459+
# SECURITY
2460+
#
2461+
# XXX test all default permissions
2462+
def _make_client(self, form, classname='user', nodeid='1',
2463+
userid='2', template='item'):
2464+
cl = client.Client(self.instance, None, {'PATH_INFO':'/',
2465+
'REQUEST_METHOD':'POST'}, db_test_base.makeForm(form))
2466+
cl.classname = classname
2467+
if nodeid is not None:
2468+
cl.nodeid = nodeid
2469+
cl.db = self.db
2470+
#cl.db.Otk = MockNull()
2471+
#cl.db.Otk.data = {}
2472+
#cl.db.Otk.getall = self.data_get
2473+
#cl.db.Otk.set = self.data_set
2474+
cl.userid = userid
2475+
cl.language = ('en',)
2476+
cl._error_message = []
2477+
cl._ok_message = []
2478+
cl.template = template
2479+
return cl
2480+
2481+
24322482
class SqliteNativeCgiTest(unittest.TestCase, testFtsQuery):
24332483
"""All of the rest of the tests use anydbm as the backend.
24342484
This class tests renderContext for fulltext search.

test/test_indexer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def test_extremewords(self):
123123
pytest.skip("extremewords not tested for native FTS backends")
124124

125125
short = "b"
126-
long = "abcdefghijklmnopqrstuvwxyz"
126+
long = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"
127127
self.dex.add_text(('test', '1', 'a'), '%s hello world' % short)
128128
self.dex.add_text(('test', '2', 'a'), 'blah a %s world' % short)
129129
self.dex.add_text(('test', '3', 'a'), 'blah Blub river')

test/test_mysql.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,44 @@ def setUp(self):
6565
mysqlOpener.setUp(self)
6666
DBTest.setUp(self)
6767

68+
def testUpgrade_6_to_7(self):
69+
70+
# load the database
71+
self.db.issue.create(title="flebble frooz")
72+
self.db.commit()
73+
74+
if self.db.database_schema['version'] != 7:
75+
self.skipTest("This test only runs for database version 7")
76+
77+
self.db.database_schema['version'] = 6
78+
79+
# test by shrinking _words and trying to insert a long value
80+
# it should fail.
81+
# run post-init
82+
# same test should succeed.
83+
84+
self.db.sql("alter table __words change column "
85+
"_word _word varchar(10)")
86+
87+
long_string = "a" * (self.db.indexer.maxlength + 5)
88+
89+
with self.assertRaises(MySQLdb.DataError) as ctx:
90+
# DataError : Data too long for column '_word' at row 1
91+
self.db.sql("insert into __words VALUES('%s',1)" % long_string)
92+
93+
self.assertIn("Data too long for column '_word'",
94+
ctx.exception.args[1])
95+
96+
# test upgrade altering row
97+
self.db.post_init()
98+
99+
# This insert with text of expected column size should succeed
100+
self.db.sql("insert into __words VALUES('%s',1)" % long_string)
101+
102+
# Verify it fails at one more than the expected column size
103+
too_long_string = "a" * (self.db.indexer.maxlength + 6)
104+
with self.assertRaises(MySQLdb.DataError) as ctx:
105+
self.db.sql("insert into __words VALUES('%s',1)" % too_long_string)
68106

69107
@skip_mysql
70108
class mysqlROTest(mysqlOpener, ROTest, unittest.TestCase):

0 commit comments

Comments
 (0)