@shared_task
def rebuild_searchindex_task(*, batchsize=40, drop_collection=False):
    """Re-index every RFC document in the search index

    Both arguments are keyword-only.

    batchsize: passed through to searchindex.update_or_create_rfc_entries()
    drop_collection: when true, call searchindex.delete_collection() followed by
        searchindex.create_collection() before any documents are indexed
    """
    if drop_collection:
        # Drop whatever is there, then recreate the collection before indexing
        searchindex.delete_collection()
        searchindex.create_collection()

    # Highest RFC number first
    all_rfcs = Document.objects.filter(type_id="rfc").order_by("-rfc_number")
    searchindex.update_or_create_rfc_entries(all_rfcs, batchsize=batchsize)
WgRfcFactory.create_batch(10) + rebuild_searchindex_task() + self.assertFalse(mock_delete.called) + self.assertFalse(mock_create.called) + self.assertTrue(mock_update.called) + self.assertQuerysetEqual( + mock_update.call_args.args[0], + sorted(rfcs, key=lambda doc: -doc.rfc_number), + ordered=True, + ) + + mock_delete.reset_mock() + mock_create.reset_mock() + mock_update.reset_mock() + rebuild_searchindex_task(drop_collection=True) + self.assertTrue(mock_delete.called) + self.assertTrue(mock_create.called) + self.assertTrue(mock_update.called) + self.assertQuerysetEqual( + mock_update.call_args.args[0], + sorted(rfcs, key=lambda doc: -doc.rfc_number), + ordered=True, + ) + + mock_delete.reset_mock() + mock_create.reset_mock() + mock_update.reset_mock() + rebuild_searchindex_task(drop_collection=True, batchsize=3) + self.assertTrue(mock_delete.called) + self.assertTrue(mock_create.called) + self.assertTrue(mock_update.called) + self.assertQuerysetEqual( + mock_update.call_args.args[0], + sorted(rfcs, key=lambda doc: -doc.rfc_number), + ordered=True, + ) + self.assertEqual(mock_update.call_args.kwargs["batchsize"], 3) + class Idnits2SupportTests(TestCase): settings_temp_path_overrides = TestCase.settings_temp_path_overrides + [ diff --git a/ietf/utils/searchindex.py b/ietf/utils/searchindex.py index e4427b88b5..a47e6d2f12 100644 --- a/ietf/utils/searchindex.py +++ b/ietf/utils/searchindex.py @@ -2,12 +2,15 @@ """Search indexing utilities""" import re +from itertools import batched from math import floor +from typing import Iterable import httpx # just for exceptions import typesense import typesense.exceptions from django.conf import settings +from typesense.types.document import DocumentSchema from ietf.doc.models import Document, StoredObject from ietf.doc.storage_utils import retrieve_str @@ -42,6 +45,24 @@ def enabled(): return _settings["TYPESENSE_API_URL"] != "" +def get_typesense_client() -> typesense.Client: + _settings = get_settings() + client = 
def update_or_create_rfc_entry(rfc: Document):
    """Update/create index entries for one RFC

    Builds the index document with typesense_doc_from_rfc() and upserts it into
    the collection named by get_collection_name().
    """
    ts_document = typesense_doc_from_rfc(rfc)
    client = get_typesense_client()
    client.collections[get_collection_name()].documents.upsert(ts_document)


def update_or_create_rfc_entries(
    rfcs: Iterable[Document], batchsize: int | None = None
):
    """Update/create index entries for RFCs in bulk

    If batchsize is None (the default), index data for all of rfcs is computed up
    front and sent in a single API call. If batchsize is set, index data is computed
    and sent one batch at a time with at most batchsize RFCs per batch, for a total
    of ceil(len(rfcs) / batchsize) API calls. batchsize must be at least 1;
    itertools.batched() raises ValueError otherwise.

    No API call is made for a batch that contains no RFCs.

    Per-RFC failures reported in the import results are logged and counted rather
    than raised, and a summary line is logged at the end. Exceptions raised by the
    typesense client itself are not caught here.

    N.b. that typesense has a server-side batch size that defaults to 40, which should
    "almost never be changed from the default." This does not change that. Further,
    the python client library's import_ method has a batch_size parameter that does
    client-side batching. We don't use that, either.
    """
    success_count = 0
    fail_count = 0
    client = get_typesense_client()
    # The target collection is the same for every batch, so resolve it once
    documents = client.collections[get_collection_name()].documents
    batches = [rfcs] if batchsize is None else batched(rfcs, batchsize)
    for batch in batches:
        tdoc_batch = [typesense_doc_from_rfc(rfc) for rfc in batch]
        if not tdoc_batch:
            # Nothing to index, so skip the API call
            continue
        results = documents.import_(tdoc_batch, {"action": "upsert"})
        # Results are paired with the submitted documents by position
        for tdoc, result in zip(tdoc_batch, results):
            if result["success"]:
                success_count += 1
            else:
                fail_count += 1
                log(f"Failed to index RFC {tdoc['rfcNumber']}: {result['error']}")
    log(f"Added {success_count} RFCs to the index, failed to add {fail_count}")
extension, e.g. "rfc1234" + # or "draft-ietf-abc-def-02") + {"name": "filename", "type": "string", "facet": False, "infix": True}, + # Title of the draft / rfc + {"name": "title", "type": "string", "facet": False}, + # Abstract of the draft / rfc + {"name": "abstract", "type": "string", "facet": False}, + # A list of search keywords if relevant, set to empty array otherwise + {"name": "keywords", "type": "string[]", "facet": True}, + # Type of the document + # Accepted values: "draft" or "rfc" + {"name": "type", "type": "string", "facet": True}, + # State(s) of the document (e.g. "Published", "Adopted by a WG", etc.) + # Use the full name, not the slug + {"name": "state", "type": "string[]", "facet": True, "optional": True}, + # Status (Standard Level Name) + # Object with properties "slug" and "name" + # e.g.: { slug: "std", "name": "Internet Standard" } + {"name": "status", "type": "object", "facet": True, "optional": True}, + # The subseries it is part of. (e.g. "BCP") + # Omit otherwise. + { + "name": "subseries.acronym", + "type": "string", + "facet": True, + "optional": True, + }, + # The subseries number it is part of. (e.g. 123) + # Omit otherwise. + { + "name": "subseries.number", + "type": "int32", + "facet": True, + "sort": True, + "optional": True, + }, + # The total of RFCs in the subseries + # Omit if not part of a subseries + { + "name": "subseries.total", + "type": "int32", + "facet": False, + "sort": False, + "optional": True, + }, + # Date of the document, in unix epoch seconds (can be negative for < 1970) + {"name": "date", "type": "int64", "facet": False}, + # Expiration date of the document, in unix epoch seconds (can be negative + # for < 1970). Omit field for RFCs + {"name": "expires", "type": "int64", "facet": False, "optional": True}, + # Publication date of the RFC, in unix epoch seconds (can be negative + # for < 1970). 
Omit field for drafts + { + "name": "publicationDate", + "type": "int64", + "facet": True, + "optional": True, + }, + # Working Group + # Object with properties "acronym", "name" and "full" + # e.g.: + # { + # "acronym": "ntp", + # "name": "Network Time Protocols", + # "full": "ntp - Network Time Protocols", + # } + {"name": "group", "type": "object", "facet": True, "optional": True}, + # Area + # Object with properties "acronym", "name" and "full" + # e.g.: + # { + # "acronym": "mpls", + # "name": "Multiprotocol Label Switching", + # "full": "mpls - Multiprotocol Label Switching", + # } + {"name": "area", "type": "object", "facet": True, "optional": True}, + # Stream + # Object with properties "slug" and "name" + # e.g.: { slug: "ietf", "name": "IETF" } + {"name": "stream", "type": "object", "facet": True, "optional": True}, + # List of authors + # Array of objects with properties "name" and "affiliation" + # e.g.: + # [ + # {"name": "John Doe", "affiliation": "ACME Inc."}, + # {"name": "Ada Lovelace", "affiliation": "Babbage Corps."}, + # ] + {"name": "authors", "type": "object[]", "facet": True, "optional": True}, + # Area Director Name (e.g. "Leonardo DaVinci") + {"name": "adName", "type": "string", "facet": True, "optional": True}, + # Whether the document should be hidden by default in search results or not. + {"name": "flags.hiddenDefault", "type": "bool", "facet": True}, + # Whether the document is obsoleted by another document or not. + {"name": "flags.obsoleted", "type": "bool", "facet": True}, + # Whether the document is updated by another document or not. + {"name": "flags.updated", "type": "bool", "facet": True}, + # List of documents that obsolete this document. + # Array of strings. Use RFC number for RFCs. (e.g. ["123", "456"]) + # Omit if none. Must be provided if "flags.obsoleted" is set to True. + { + "name": "obsoletedBy", + "type": "string[]", + "facet": False, + "optional": True, + }, + # List of documents that update this document. 
def create_collection():
    """Create the search index collection

    The collection is named by get_collection_name() and uses DOCS_SCHEMA as its
    schema. Exceptions raised by the typesense client are not caught here.
    """
    collection_name = get_collection_name()
    log(f"Creating '{collection_name}' collection")
    client = get_typesense_client()
    # Reuse the name that was just logged instead of looking it up a second time
    client.collections.create({"name": collection_name} | DOCS_SCHEMA)


def delete_collection():
    """Delete the search index collection, if it exists

    The collection is named by get_collection_name(). A typesense ObjectNotFound
    error from the delete call is ignored; any other exception propagates.
    """
    collection_name = get_collection_name()
    log(f"Deleting '{collection_name}' collection")
    client = get_typesense_client()
    try:
        client.collections[collection_name].delete()
    except typesense.exceptions.ObjectNotFound:
        # Deliberately ignored: this error from delete() is not treated as a failure
        pass
searchindex.update_or_create_rfc_entry(not_rfc) - self.assertFalse(mock_ts_client_constructor.called) + searchindex.typesense_doc_from_rfc(not_rfc) invalid_rfc = WgRfcFactory(name="rfc1000000", rfc_number=None) assert isinstance(invalid_rfc, Document) with self.assertRaises(AssertionError): - searchindex.update_or_create_rfc_entry(invalid_rfc) - self.assertFalse(mock_ts_client_constructor.called) + searchindex.typesense_doc_from_rfc(invalid_rfc) rfc = PublishedRfcDocEventFactory().doc assert isinstance(rfc, Document) - searchindex.update_or_create_rfc_entry(rfc) - self.assertTrue(mock_ts_client_constructor.called) - # walk the tree down to the method we expected to be called... - mock_upsert = mock_ts_client_constructor.return_value.collections[ - "frogs" - ].documents.upsert # matches value in override_settings above - self.assertTrue(mock_upsert.called) - upserted_dict = mock_upsert.call_args[0][0] + result = searchindex.typesense_doc_from_rfc(rfc) # Check a few values, not exhaustive - self.assertEqual(upserted_dict["id"], f"doc-{rfc.pk}") - self.assertEqual(upserted_dict["rfcNumber"], rfc.rfc_number) - self.assertEqual( - upserted_dict["abstract"], searchindex._sanitize_text(rfc.abstract) - ) - self.assertNotIn("adName", upserted_dict) - self.assertNotIn("content", upserted_dict) # no blob - self.assertNotIn("subseries", upserted_dict) + self.assertEqual(result["id"], f"doc-{rfc.pk}") + self.assertEqual(result["rfcNumber"], rfc.rfc_number) + self.assertEqual(result["abstract"], searchindex._sanitize_text(rfc.abstract)) + self.assertNotIn("adName", result) + self.assertNotIn("content", result) # no blob + self.assertNotIn("subseries", result) # repeat, this time with contents, an AD, and subseries docs - mock_upsert.reset_mock() store_str( kind="rfc", name=f"txt/{rfc.name}.txt", @@ -99,17 +87,15 @@ def test_update_or_create_rfc_entry(self, mock_ts_client_constructor): # (the typesense schema does not support this for real at the moment) 
@override_settings(
    SEARCHINDEX_CONFIG={
        "TYPESENSE_API_URL": "http://ts.example.com",
        "TYPESENSE_API_KEY": "test-api-key",
        "TYPESENSE_COLLECTION_NAME": "frogs",
    }
)
@mock.patch("ietf.utils.searchindex.typesense_doc_from_rfc")
@mock.patch("ietf.utils.searchindex.typesense.Client")
def test_update_or_create_rfc_entry(
    self, mock_ts_client_constructor, mock_tdoc_from_rfc
):
    """update_or_create_rfc_entry() upserts the converted doc into the collection"""
    # Stand-in for the converted document; the test only checks that this exact
    # object is what gets passed to upsert()
    sentinel_tdoc = object()
    mock_tdoc_from_rfc.return_value = sentinel_tdoc
    rfc = WgRfcFactory()
    assert isinstance(rfc, Document)

    searchindex.update_or_create_rfc_entry(rfc)

    self.assertTrue(mock_ts_client_constructor.called)
    # Follow the same attribute / item path the code under test uses.
    # "frogs" matches TYPESENSE_COLLECTION_NAME in override_settings above.
    mock_client = mock_ts_client_constructor.return_value
    mock_collection = mock_client.collections["frogs"]
    mock_upsert = mock_collection.documents.upsert
    self.assertTrue(mock_upsert.called)
    self.assertEqual(mock_upsert.call_args, mock.call(sentinel_tdoc))
@override_settings(
    SEARCHINDEX_CONFIG={
        "TYPESENSE_API_URL": "http://ts.example.com",
        "TYPESENSE_API_KEY": "test-api-key",
        "TYPESENSE_COLLECTION_NAME": "frogs",
    }
)
@mock.patch("ietf.utils.searchindex.typesense.Client")
def test_create_collection(self, mock_ts_client_constructor):
    """create_collection() builds one client and creates the configured collection"""
    searchindex.create_collection()
    self.assertEqual(mock_ts_client_constructor.call_count, 1)
    mock_collections = mock_ts_client_constructor.return_value.collections
    self.assertTrue(mock_collections.create.called)
    # "frogs" matches TYPESENSE_COLLECTION_NAME in override_settings above
    self.assertEqual(mock_collections.create.call_args[0][0]["name"], "frogs")


@override_settings(
    SEARCHINDEX_CONFIG={
        "TYPESENSE_API_URL": "http://ts.example.com",
        "TYPESENSE_API_KEY": "test-api-key",
        "TYPESENSE_COLLECTION_NAME": "frogs",
    }
)
@mock.patch("ietf.utils.searchindex.typesense.Client")
def test_delete_collection(self, mock_ts_client_constructor):
    """delete_collection() deletes the collection and ignores ObjectNotFound"""
    searchindex.delete_collection()
    self.assertEqual(mock_ts_client_constructor.call_count, 1)
    mock_collections = mock_ts_client_constructor.return_value.collections
    # "frogs" matches TYPESENSE_COLLECTION_NAME in override_settings above
    mock_delete = mock_collections["frogs"].delete
    self.assertTrue(mock_delete.called)

    # The side effect has to be set on delete() itself. Setting it on the
    # collection mock would only affect calling the collection object, which the
    # code under test never does, so the exception would never be raised.
    mock_delete.reset_mock()
    mock_delete.side_effect = typesense.exceptions.ObjectNotFound
    searchindex.delete_collection()  # should ignore the exception
    self.assertTrue(mock_delete.called)