diff --git a/.gitignore b/.gitignore index 84bc800e3b..ccc7a46b08 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .DS_store datatracker.sublime-project datatracker.sublime-workspace +/.claude /.coverage /.factoryboy_random_state /.mypy_cache diff --git a/docker-compose.yml b/docker-compose.yml index 4c3f2f6b8e..073d04b896 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -132,6 +132,18 @@ services: volumes: - blobdb-data:/var/lib/postgresql/data +# typesense: +# image: typesense/typesense:30.1 +# restart: on-failure +# ports: +# - "8108:8108" +# volumes: +# - ./typesense-data:/data +# command: +# - '--data-dir=/data' +# - '--api-key=typesense-api-key' +# - '--enable-cors' + # Celery Beat is a periodic task runner. It is not normally needed for development, # but can be enabled by uncommenting the following. # diff --git a/ietf/api/tests_views_rpc.py b/ietf/api/tests_views_rpc.py index 0db67e126f..180221cffc 100644 --- a/ietf/api/tests_views_rpc.py +++ b/ietf/api/tests_views_rpc.py @@ -1,4 +1,5 @@ # Copyright The IETF Trust 2025, All Rights Reserved +import datetime from io import StringIO from pathlib import Path from tempfile import TemporaryDirectory @@ -10,12 +11,15 @@ from django.test.utils import override_settings from django.urls import reverse as urlreverse import mock +from django.utils import timezone from ietf.blobdb.models import Blob from ietf.doc.factories import IndividualDraftFactory, RfcFactory, WgDraftFactory, WgRfcFactory from ietf.doc.models import RelatedDocument, Document from ietf.group.factories import RoleFactory, GroupFactory from ietf.person.factories import PersonFactory +from ietf.sync.rfcindex import rfcindex_is_dirty +from ietf.utils.models import DirtyBits from ietf.utils.test_utils import APITestCase, reload_db_objects @@ -408,8 +412,13 @@ def _valid_post_data(): ) @override_settings(APP_API_TOKENS={"ietf.api.views_rpc": ["valid-token"]}) - @mock.patch("ietf.api.views_rpc.create_rfc_index_task") - def test_refresh_rfc_index(self, mock_task): + def test_refresh_rfc_index(self): + DirtyBits.objects.create( + slug=DirtyBits.Slugs.RFCINDEX, + dirty_time=timezone.now() - datetime.timedelta(days=1), + processed_time=timezone.now() - datetime.timedelta(hours=12), + ) + self.assertFalse(rfcindex_is_dirty()) url = urlreverse("ietf.api.purple_api.refresh_rfc_index") response = self.client.get(url) self.assertEqual(response.status_code, 403) @@ -417,7 +426,7 @@ def test_refresh_rfc_index(self, mock_task): self.assertEqual(response.status_code, 403) response = self.client.get(url, headers={"X-Api-Key": "valid-token"}) self.assertEqual(response.status_code, 405) - self.assertFalse(mock_task.delay.called) + self.assertFalse(rfcindex_is_dirty()) response = self.client.post(url, headers={"X-Api-Key": "valid-token"}) self.assertEqual(response.status_code, 202) - self.assertTrue(mock_task.delay.called) + self.assertTrue(rfcindex_is_dirty()) diff --git a/ietf/api/views_rpc.py b/ietf/api/views_rpc.py index 1e96118e58..6bc45fe3da 100644 --- a/ietf/api/views_rpc.py +++ b/ietf/api/views_rpc.py @@ -32,7 +32,9 @@ EmailPersonSerializer, RfcWithAuthorsSerializer, DraftWithAuthorsSerializer, - NotificationAckSerializer, RfcPubSerializer, RfcFileSerializer, + NotificationAckSerializer, + RfcPubSerializer, + RfcFileSerializer, EditableRfcSerializer, ) from ietf.doc.models import Document, DocHistory, RfcAuthor, DocEvent @@ -45,7 +47,7 @@ update_rfc_searchindex_task, ) from ietf.person.models import Email, Person -from ietf.sync.tasks import create_rfc_index_task +from ietf.sync.rfcindex import mark_rfcindex_as_dirty class Conflict(APIException): @@ -344,9 +346,10 @@ def post(self, request): class RfcAuthorViewSet(viewsets.ReadOnlyModelViewSet): """ViewSet for RfcAuthor model - + Router needs to provide rfc_number as a kwarg """ + api_key_endpoint = "ietf.api.views_rpc" queryset = RfcAuthor.objects.all() @@ -407,7 +410,7 @@ class RfcPubFilesView(APIView): def _fs_destination(self, filename: str | Path) -> Path: """Destination for an uploaded RFC file in the filesystem - + Strips any path components in filename and returns an absolute Path. """ rfc_path = Path(settings.RFC_PATH) @@ -419,7 +422,7 @@ def _fs_destination(self, filename: str | Path) -> Path: def _blob_destination(self, filename: str | Path) -> str: """Destination name for an uploaded RFC file in the blob store - + Strips any path components in filename and returns an absolute Path. """ filename = Path(filename) # could potentially have directory components @@ -472,9 +475,7 @@ def post(self, request): code="files-exist", ) for possible_existing_blob in possible_rfc_blobs: - if exists_in_storage( - kind=blob_kind, name=possible_existing_blob - ): + if exists_in_storage(kind=blob_kind, name=possible_existing_blob): raise Conflict( "Blob(s) already exist for this RFC", code="blobs-exist", @@ -523,7 +524,9 @@ def post(self, request): # Trigger red precomputer needs_updating = [rfc.rfc_number] - for rel in rfc.relateddocument_set.filter(relationship_id__in=["obs","updates"]): + for rel in rfc.relateddocument_set.filter( + relationship_id__in=["obs", "updates"] + ): needs_updating.append(rel.target.rfc_number) trigger_red_precomputer_task.delay(rfc_number_list=sorted(needs_updating)) # Trigger search index update @@ -540,10 +543,10 @@ class RfcIndexView(APIView): @extend_schema( operation_id="refresh_rfc_index", summary="Refresh rfc-index files", - description="Requests creation of rfc-index.xml and rfc-index.txt files", + description="Requests creation of various index files.", responses={202: None}, request=None, ) def post(self, request): - create_rfc_index_task.delay() + mark_rfcindex_as_dirty() return Response(status=202) diff --git a/ietf/doc/feeds.py b/ietf/doc/feeds.py index afe96cf0df..0269906fcf 100644 --- a/ietf/doc/feeds.py +++ b/ietf/doc/feeds.py @@ -5,6 +5,7 @@ import datetime import unicodedata +from django.conf import settings from django.contrib.syndication.views import Feed, FeedDoesNotExist from django.utils.feedgenerator import Atom1Feed, Rss201rev2Feed from django.urls import reverse as urlreverse @@ -223,7 +224,7 @@ def item_extra_kwargs(self, item): extra.update({"dcterms_accessRights": "gratis"}) extra.update({"dcterms_format": "text/html"}) media_contents = [] - if item.rfc_number < 8650: + if item.rfc_number < settings.FIRST_V3_RFC: if item.rfc_number not in [8, 9, 51, 418, 500, 530, 589]: for fmt, media_type in [("txt", "text/plain"), ("html", "text/html")]: media_contents.append( diff --git a/ietf/doc/models.py b/ietf/doc/models.py index 972f0a34e8..cc79b73831 100644 --- a/ietf/doc/models.py +++ b/ietf/doc/models.py @@ -52,6 +52,7 @@ from ietf.person.utils import get_active_balloters from ietf.utils import log from ietf.utils.decorators import memoize +from ietf.utils.text import decode_document_content from ietf.utils.validators import validate_no_control_chars from ietf.utils.mail import formataddr from ietf.utils.models import ForeignKey @@ -640,19 +641,7 @@ def text(self, size = -1): except IOError as e: log.log(f"Error reading text for {path}: {e}") return None - text = None - try: - text = raw.decode('utf-8') - except UnicodeDecodeError: - for back in range(1,4): - try: - text = raw[:-back].decode('utf-8') - break - except UnicodeDecodeError: - pass - if text is None: - text = raw.decode('latin-1') - return text + return decode_document_content(raw) def text_or_error(self): return self.text() or "Error; cannot read '%s'"%self.get_base_name() diff --git a/ietf/doc/storage.py b/ietf/doc/storage.py index 375620ccaf..ee1e76c4fa 100644 --- a/ietf/doc/storage.py +++ b/ietf/doc/storage.py @@ -114,7 +114,6 @@ def _get_write_parameters(self, name, content=None): class StoredObjectBlobdbStorage(BlobdbStorage): - ietf_log_blob_timing = True warn_if_missing = True # TODO-BLOBSTORE make this configurable (or remove it) def _save_stored_object(self, name, content) -> StoredObject: diff --git a/ietf/doc/storage_utils.py b/ietf/doc/storage_utils.py index 81588c83ec..9c18bb8a8a 100644 --- a/ietf/doc/storage_utils.py +++ b/ietf/doc/storage_utils.py @@ -10,6 +10,7 @@ from django.core.files.storage import storages, Storage from ietf.utils.log import log +from ietf.utils.text import decode_document_content class StorageUtilsError(Exception): @@ -164,34 +165,30 @@ def store_str( def retrieve_bytes(kind: str, name: str) -> bytes: from ietf.doc.storage import maybe_log_timing - content = b"" - if settings.ENABLE_BLOBSTORAGE: - try: - store = _get_storage(kind) - with store.open(name) as f: - with maybe_log_timing( - hasattr(store, "ietf_log_blob_timing") and store.ietf_log_blob_timing, - "read", - bucket_name=store.bucket_name if hasattr(store, "bucket_name") else "", - name=name, - ): - content = f.read() - except Exception as err: - log(f"Blobstore Error: Failed to read bytes from {kind}:{name}: {repr(err)}") - if settings.SERVER_MODE == "development": - raise + if not settings.ENABLE_BLOBSTORAGE: + return b"" + try: + store = _get_storage(kind) + with store.open(name) as f: + with maybe_log_timing( + hasattr(store, "ietf_log_blob_timing") and store.ietf_log_blob_timing, + "read", + bucket_name=store.bucket_name if hasattr(store, "bucket_name") else "", + name=name, + ): + content = f.read() + except Exception as err: + log(f"Blobstore Error: Failed to read bytes from {kind}:{name}: {repr(err)}") + raise return content def retrieve_str(kind: str, name: str) -> str: - content = "" - if settings.ENABLE_BLOBSTORAGE: - try: - content_bytes = retrieve_bytes(kind, name) - # TODO-BLOBSTORE: try to decode all the different ways doc.text() does - content = content_bytes.decode("utf-8") - except Exception as err: - log(f"Blobstore Error: Failed to read string from {kind}:{name}: {repr(err)}") - if settings.SERVER_MODE == "development": - raise + if not settings.ENABLE_BLOBSTORAGE: + return "" + try: + content = decode_document_content(retrieve_bytes(kind, name)) + except Exception as err: + log(f"Blobstore Error: Failed to read string from {kind}:{name}: {repr(err)}") + raise return content diff --git a/ietf/doc/tasks.py b/ietf/doc/tasks.py index 19edb39014..273242e35f 100644 --- a/ietf/doc/tasks.py +++ b/ietf/doc/tasks.py @@ -209,3 +209,14 @@ def update_rfc_searchindex_task(self, rfc_number: int): countdown=searchindex_settings["TASK_RETRY_DELAY"], max_retries=searchindex_settings["TASK_MAX_RETRIES"], ) + + +@shared_task +def rebuild_searchindex_task(*, batchsize=40, drop_collection=False): + if drop_collection: + searchindex.delete_collection() + searchindex.create_collection() + searchindex.update_or_create_rfc_entries( + Document.objects.filter(type_id="rfc").order_by("-rfc_number"), + batchsize=batchsize, + ) diff --git a/ietf/doc/tests_notprepped.py b/ietf/doc/tests_notprepped.py new file mode 100644 index 0000000000..f417aa7931 --- /dev/null +++ b/ietf/doc/tests_notprepped.py @@ -0,0 +1,122 @@ +# Copyright The IETF Trust 2026, All Rights Reserved + +from django.conf import settings +from django.utils import timezone +from django.urls import reverse as urlreverse + +from pyquery import PyQuery + +from ietf.doc.factories import WgRfcFactory +from ietf.doc.models import StoredObject +from ietf.doc.storage_utils import store_bytes +from ietf.utils.test_utils import TestCase + + +class NotpreppedRfcXmlTests(TestCase): + def test_editor_source_button_visibility(self): + pre_v3 = WgRfcFactory(rfc_number=settings.FIRST_V3_RFC - 1) + first_v3 = WgRfcFactory(rfc_number=settings.FIRST_V3_RFC) + post_v3 = WgRfcFactory(rfc_number=settings.FIRST_V3_RFC + 1) + + for rfc, expect_button in [(pre_v3, False), (first_v3, True), (post_v3, True)]: + r = self.client.get( + urlreverse( + "ietf.doc.views_doc.document_main", kwargs=dict(name=rfc.name) + ) + ) + self.assertEqual(r.status_code, 200) + buttons = PyQuery(r.content)('a.btn:contains("Get editor source")') + if expect_button: + self.assertEqual(len(buttons), 1, msg=f"rfc_number={rfc.rfc_number}") + expected_href = urlreverse( + "ietf.doc.views_doc.rfcxml_notprepped_wrapper", + kwargs=dict(number=rfc.rfc_number), + ) + self.assertEqual( + buttons.attr("href"), + expected_href, + msg=f"rfc_number={rfc.rfc_number}", + ) + else: + self.assertEqual(len(buttons), 0, msg=f"rfc_number={rfc.rfc_number}") + + def test_rfcxml_notprepped(self): + number = settings.FIRST_V3_RFC + stored_name = f"notprepped/rfc{number}.notprepped.xml" + url = f"/doc/rfc{number}/notprepped/" + + # 404 for pre-v3 RFC numbers (no document needed) + r = self.client.get(f"/doc/rfc{number - 1}/notprepped/") + self.assertEqual(r.status_code, 404) + + # 404 when no RFC document exists in the database + r = self.client.get(url) + self.assertEqual(r.status_code, 404) + + # 404 when RFC document exists but has no StoredObject + WgRfcFactory(rfc_number=number) + r = self.client.get(url) + self.assertEqual(r.status_code, 404) + + # 404 when StoredObject exists but backing storage is missing (FileNotFoundError) + now = timezone.now() + StoredObject.objects.create( + store="rfc", + name=stored_name, + sha384="a" * 96, + len=0, + store_created=now, + created=now, + modified=now, + ) + r = self.client.get(url) + self.assertEqual(r.status_code, 404) + + # 200 with correct content-type, attachment disposition, and body when object is fully stored + xml_content = b"test" + store_bytes("rfc", stored_name, xml_content, allow_overwrite=True) + r = self.client.get(url) + self.assertEqual(r.status_code, 200) + self.assertEqual(r["Content-Type"], "application/xml") + self.assertEqual( + r["Content-Disposition"], + f'attachment; filename="rfc{number}.notprepped.xml"', + ) + self.assertEqual(b"".join(r.streaming_content), xml_content) + + def test_rfcxml_notprepped_wrapper(self): + number = settings.FIRST_V3_RFC + + # 404 for pre-v3 RFC numbers (no document needed) + r = self.client.get( + urlreverse( + "ietf.doc.views_doc.rfcxml_notprepped_wrapper", + kwargs=dict(number=number - 1), + ) + ) + self.assertEqual(r.status_code, 404) + + # 404 when no RFC document exists in the database + r = self.client.get( + urlreverse( + "ietf.doc.views_doc.rfcxml_notprepped_wrapper", + kwargs=dict(number=number), + ) + ) + self.assertEqual(r.status_code, 404) + + # 200 with rendered template when RFC document exists + rfc = WgRfcFactory(rfc_number=number) + r = self.client.get( + urlreverse( + "ietf.doc.views_doc.rfcxml_notprepped_wrapper", + kwargs=dict(number=number), + ) + ) + self.assertEqual(r.status_code, 200) + q = PyQuery(r.content) + self.assertIn(str(rfc.rfc_number), q("h1").text()) + download_url = urlreverse( + "ietf.doc.views_doc.rfcxml_notprepped", kwargs=dict(number=number) + ) + self.assertEqual(len(q(f'a.btn[href="{download_url}"]')), 1) diff --git a/ietf/doc/tests_tasks.py b/ietf/doc/tests_tasks.py index 728d21f131..2e2d65463f 100644 --- a/ietf/doc/tests_tasks.py +++ b/ietf/doc/tests_tasks.py @@ -24,6 +24,7 @@ generate_idnits2_rfc_status_task, investigate_fragment_task, notify_expirations_task, + rebuild_searchindex_task, update_rfc_searchindex_task, ) @@ -144,6 +145,48 @@ def test_update_rfc_searchindex_task( with self.assertRaises(Retry): update_rfc_searchindex_task(rfc_number=rfc.rfc_number) + @mock.patch("ietf.doc.tasks.searchindex.update_or_create_rfc_entries") + @mock.patch("ietf.doc.tasks.searchindex.create_collection") + @mock.patch("ietf.doc.tasks.searchindex.delete_collection") + def test_rebuild_searchindex_task(self, mock_delete, mock_create, mock_update): + rfcs = WgRfcFactory.create_batch(10) + rebuild_searchindex_task() + self.assertFalse(mock_delete.called) + self.assertFalse(mock_create.called) + self.assertTrue(mock_update.called) + self.assertQuerysetEqual( + mock_update.call_args.args[0], + sorted(rfcs, key=lambda doc: -doc.rfc_number), + ordered=True, + ) + + mock_delete.reset_mock() + mock_create.reset_mock() + mock_update.reset_mock() + rebuild_searchindex_task(drop_collection=True) + self.assertTrue(mock_delete.called) + self.assertTrue(mock_create.called) + self.assertTrue(mock_update.called) + self.assertQuerysetEqual( + mock_update.call_args.args[0], + sorted(rfcs, key=lambda doc: -doc.rfc_number), + ordered=True, + ) + + mock_delete.reset_mock() + mock_create.reset_mock() + mock_update.reset_mock() + rebuild_searchindex_task(drop_collection=True, batchsize=3) + self.assertTrue(mock_delete.called) + self.assertTrue(mock_create.called) + self.assertTrue(mock_update.called) + self.assertQuerysetEqual( + mock_update.call_args.args[0], + sorted(rfcs, key=lambda doc: -doc.rfc_number), + ordered=True, + ) + self.assertEqual(mock_update.call_args.kwargs["batchsize"], 3) + class Idnits2SupportTests(TestCase): settings_temp_path_overrides = TestCase.settings_temp_path_overrides + [ diff --git a/ietf/doc/urls.py b/ietf/doc/urls.py index 61e94b2231..0c13503b78 100644 --- a/ietf/doc/urls.py +++ b/ietf/doc/urls.py @@ -99,6 +99,8 @@ url(r'^%(name)s(?:/%(rev)s)?/$' % settings.URL_REGEXPS, views_doc.document_main), url(r'^%(name)s(?:/%(rev)s)?/bibtex/$' % settings.URL_REGEXPS, views_doc.document_bibtex), + url(r'^rfc(?P[0-9]+)/notprepped/$' , views_doc.rfcxml_notprepped), + url(r'^rfc(?P[0-9]+)/notprepped-wrapper/$', views_doc.rfcxml_notprepped_wrapper), url(r'^%(name)s(?:/%(rev)s)?/idnits2-state/$' % settings.URL_REGEXPS, views_doc.idnits2_state), url(r'^bibxml3/reference.I-D.%(name)s(?:-%(rev)s)?.xml$' % settings.URL_REGEXPS, views_doc.document_bibxml_ref), url(r'^bibxml3/%(name)s(?:-%(rev)s)?.xml$' % settings.URL_REGEXPS, views_doc.document_bibxml), diff --git a/ietf/doc/views_doc.py b/ietf/doc/views_doc.py index c1f6352ac3..5b57a62074 100644 --- a/ietf/doc/views_doc.py +++ b/ietf/doc/views_doc.py @@ -1,4 +1,4 @@ -# Copyright The IETF Trust 2009-2024, All Rights Reserved +# Copyright The IETF Trust 2009-2026, All Rights Reserved # -*- coding: utf-8 -*- # # Parts Copyright (C) 2009-2010 Nokia Corporation and/or its subsidiary(-ies). @@ -43,9 +43,10 @@ from celery.result import AsyncResult from django.core.cache import caches +from django.core.files.base import ContentFile from django.core.exceptions import PermissionDenied from django.db.models import Max -from django.http import HttpResponse, Http404, HttpResponseBadRequest, JsonResponse +from django.http import FileResponse, HttpResponse, Http404, HttpResponseBadRequest, JsonResponse from django.shortcuts import render, get_object_or_404, redirect from django.template.loader import render_to_string from django.urls import reverse as urlreverse @@ -57,7 +58,7 @@ import debug # pyflakes:ignore from ietf.doc.models import ( Document, DocHistory, DocEvent, BallotDocEvent, BallotType, - ConsensusDocEvent, NewRevisionDocEvent, TelechatDocEvent, WriteupDocEvent, IanaExpertDocEvent, + ConsensusDocEvent, NewRevisionDocEvent, StoredObject, TelechatDocEvent, WriteupDocEvent, IanaExpertDocEvent, IESG_BALLOT_ACTIVE_STATES, STATUSCHANGE_RELATIONS, DocumentActionHolder, DocumentAuthor, RelatedDocument, RelatedDocHistory) from ietf.doc.tasks import investigate_fragment_task @@ -86,6 +87,7 @@ from ietf.review.models import ReviewAssignment from ietf.review.utils import can_request_review_of_doc, review_assignments_to_list_for_docs, review_requests_to_list_for_docs from ietf.review.utils import no_review_from_teams_on_doc +from ietf.doc.storage_utils import retrieve_bytes from ietf.utils import markup_txt, log, markdown from ietf.utils.draft import get_status_from_draft_text from ietf.utils.meetecho import MeetechoAPIError, SlidesManager @@ -2356,3 +2358,29 @@ def investigate(request): "results": results, }, ) + +def rfcxml_notprepped(request, number): + number = int(number) + if number < settings.FIRST_V3_RFC: + raise Http404 + rfc = Document.objects.filter(type="rfc", rfc_number=number).first() + if rfc is None: + raise Http404 + name = f"notprepped/rfc{number}.notprepped.xml" + if not StoredObject.objects.filter(name=name).exists(): + raise Http404 + try: + bytes = retrieve_bytes("rfc", name) + except FileNotFoundError: + raise Http404 + return FileResponse(ContentFile(bytes, name=f"rfc{number}.notprepped.xml"), as_attachment=True) + + +def rfcxml_notprepped_wrapper(request, number): + number = int(number) + if number < settings.FIRST_V3_RFC: + raise Http404 + rfc = Document.objects.filter(type="rfc", rfc_number=number).first() + if rfc is None: + raise Http404 + return render(request, "doc/notprepped_wrapper.html", context={"rfc": rfc}) diff --git a/ietf/group/tests_review.py b/ietf/group/tests_review.py index 89c755bb26..bb9b79a416 100644 --- a/ietf/group/tests_review.py +++ b/ietf/group/tests_review.py @@ -888,10 +888,10 @@ def test_requests_history_filter_page(self): self.assertEqual(r.status_code, 200) self.assertContains(r, review_req.doc.name) self.assertContains(r, review_req2.doc.name) - self.assertContains(r, 'Assigned') - self.assertContains(r, 'Accepted') - self.assertContains(r, 'Completed') - self.assertContains(r, 'Ready') + self.assertContains(r, 'data-text="Assigned"') + self.assertContains(r, 'data-text="Accepted"') + self.assertContains(r, 'data-text="Completed"') + self.assertContains(r, 'data-text="Ready"') self.assertContains(r, escape(assignment.reviewer.person.name)) self.assertContains(r, escape(assignment2.reviewer.person.name)) @@ -907,10 +907,10 @@ def test_requests_history_filter_page(self): self.assertEqual(r.status_code, 200) self.assertContains(r, review_req.doc.name) self.assertNotContains(r, review_req2.doc.name) - self.assertContains(r, 'Assigned') - self.assertNotContains(r, 'Accepted') - self.assertNotContains(r, 'Completed') - self.assertNotContains(r, 'Ready') + self.assertContains(r, 'data-text="Assigned"') + self.assertNotContains(r, 'data-text="Accepted"') + self.assertNotContains(r, 'data-text="Completed"') + self.assertNotContains(r, 'data-text="Ready"') self.assertContains(r, escape(assignment.reviewer.person.name)) self.assertNotContains(r, escape(assignment2.reviewer.person.name)) @@ -926,10 +926,10 @@ def test_requests_history_filter_page(self): self.assertEqual(r.status_code, 200) self.assertNotContains(r, review_req.doc.name) self.assertContains(r, review_req2.doc.name) - self.assertNotContains(r, 'Assigned') - self.assertContains(r, 'Accepted') - self.assertContains(r, 'Completed') - self.assertContains(r, 'Ready') + self.assertNotContains(r, 'data-text="Assigned"') + self.assertContains(r, 'data-text="Accepted"') + self.assertContains(r, 'data-text="Completed"') + self.assertContains(r, 'data-text="Ready"') self.assertNotContains(r, escape(assignment.reviewer.person.name)) self.assertContains(r, escape(assignment2.reviewer.person.name)) @@ -940,9 +940,9 @@ def test_requests_history_filter_page(self): r = self.client.get(url) self.assertEqual(r.status_code, 200) self.assertNotContains(r, review_req.doc.name) - self.assertNotContains(r, 'Assigned') - self.assertNotContains(r, 'Accepted') - self.assertNotContains(r, 'Completed') + self.assertNotContains(r, 'data-text="Assigned"') + self.assertNotContains(r, 'data-text="Accepted"') + self.assertNotContains(r, 'data-text="Completed"') def test_requests_history_invalid_filter_parameters(self): # First assignment as assigned diff --git a/ietf/meeting/tests_session_requests.py b/ietf/meeting/tests_session_requests.py index 0cb092d2f8..42dbee5f23 100644 --- a/ietf/meeting/tests_session_requests.py +++ b/ietf/meeting/tests_session_requests.py @@ -236,7 +236,7 @@ def test_edit(self): self.assertRedirects(r, redirect_url) # Check whether updates were stored in the database - sessions = Session.objects.filter(meeting=meeting, group=mars) + sessions = Session.objects.filter(meeting=meeting, group=mars).order_by("id") self.assertEqual(len(sessions), 2) session = sessions[0] self.assertFalse(session.constraints().filter(name='time_relation')) diff --git a/ietf/meeting/tests_views.py b/ietf/meeting/tests_views.py index 258ffe554c..17988e50be 100644 --- a/ietf/meeting/tests_views.py +++ b/ietf/meeting/tests_views.py @@ -33,6 +33,7 @@ from django.http import QueryDict, FileResponse from django.template import Context, Template from django.utils import timezone +from django.utils.html import escape from django.utils.safestring import mark_safe from django.utils.text import slugify @@ -9491,7 +9492,7 @@ def test_session_attendance(self): self.assertEqual(r.status_code, 200) self.assertContains(r, '3 attendees') for person in persons: - self.assertContains(r, person.plain_name()) + self.assertContains(r, escape(person.plain_name())) # Test for the "I was there" button. def _test_button(person, expected): @@ -9511,14 +9512,14 @@ def _test_button(person, expected): # attempt to POST anyway is ignored r = self.client.post(attendance_url) self.assertEqual(r.status_code, 200) - self.assertNotContains(r, persons[3].plain_name()) + self.assertNotContains(r, escape(persons[3].plain_name())) self.assertEqual(session.attended_set.count(), 3) # button is shown, and POST is accepted meeting.importantdate_set.update(name_id='revsub',date=date_today() + datetime.timedelta(days=20)) _test_button(persons[3], True) r = self.client.post(attendance_url) self.assertEqual(r.status_code, 200) - self.assertContains(r, persons[3].plain_name()) + self.assertContains(r, escape(persons[3].plain_name())) self.assertEqual(session.attended_set.count(), 4) # When the meeting is finalized, a bluesheet file is generated, diff --git a/ietf/meeting/utils.py b/ietf/meeting/utils.py index bdf3d3d3d3..10ae0d3667 100644 --- a/ietf/meeting/utils.py +++ b/ietf/meeting/utils.py @@ -1025,9 +1025,18 @@ def resolve_materials_for_one_meeting(meeting: Meeting): ) def resolve_uploaded_material(meeting: Meeting, doc: Document): - resolved = [] + resolved: list[ResolvedMaterial] = [] + remove = ResolvedMaterial.objects.none() blob = resolve_one_material(doc, rev=None, ext=None) - if blob is not None: + if blob is None: + # Versionless file does not exist. Remove the versionless ResolvedMaterial + # if it existed. This is to avoid leaving behind a stale link to a replaced + # version. This comes up e.g. if a ProceedingsMaterial is changed from having + # an uploaded file to being an external URL. + remove = ResolvedMaterial.objects.filter( + name=doc.name, meeting_number=meeting.number + ) + else: resolved.append( ResolvedMaterial( name=doc.name, @@ -1047,12 +1056,15 @@ def resolve_uploaded_material(meeting: Meeting, doc: Document): blob=blob.name, ) ) + # Create the new record(s) ResolvedMaterial.objects.bulk_create( resolved, update_conflicts=True, unique_fields=["name", "meeting_number"], update_fields=["bucket", "blob"], ) + # and remove one if necessary (will be a none() queryset if not) + remove.delete() def store_blob_for_one_material_file(doc: Document, rev: str, filepath: Path): diff --git a/ietf/meeting/views_proceedings.py b/ietf/meeting/views_proceedings.py index d1169bff2d..639efa1da4 100644 --- a/ietf/meeting/views_proceedings.py +++ b/ietf/meeting/views_proceedings.py @@ -14,7 +14,7 @@ from ietf.meeting.models import Meeting, MeetingHost from ietf.meeting.helpers import get_meeting from ietf.name.models import ProceedingsMaterialTypeName -from ietf.meeting.utils import handle_upload_file +from ietf.meeting.utils import handle_upload_file, resolve_uploaded_material from ietf.utils.text import xslugify class UploadProceedingsMaterialForm(FileUploadForm): @@ -150,7 +150,7 @@ def save_proceedings_material_doc(meeting, material_type, title, request, file=N if events: doc.save_with_history(events) - + resolve_uploaded_material(meeting, doc) return doc diff --git a/ietf/settings.py b/ietf/settings.py index 40a4cb5c56..3aa45a453c 100644 --- a/ietf/settings.py +++ b/ietf/settings.py @@ -235,7 +235,9 @@ AUTHENTICATION_BACKENDS = ( 'ietf.ietfauth.backends.CaseInsensitiveModelBackend', ) -FILE_UPLOAD_PERMISSIONS = 0o644 +FILE_UPLOAD_PERMISSIONS = 0o644 + +FIRST_V3_RFC = 8650 # diff --git a/ietf/sync/errata.py b/ietf/sync/errata.py new file mode 100644 index 0000000000..113d987291 --- /dev/null +++ b/ietf/sync/errata.py @@ -0,0 +1,184 @@ +# Copyright The IETF Trust 2026, All Rights Reserved +import datetime +import json +from collections import defaultdict +from typing import DefaultDict + +from django.conf import settings +from django.core.files.storage import storages +from django.db import transaction +from django.db.models import Q + +from ietf.doc.models import Document, DocEvent +from ietf.name.models import DocTagName +from ietf.person.models import Person +from ietf.utils.log import log +from ietf.utils.models import DirtyBits + + +DEFAULT_ERRATA_JSON_BLOB_NAME = "other/errata.json" + +type ErrataJsonEntry = dict[str, str] + +def get_errata_last_updated() -> datetime.datetime: + """Get timestamp of the last errata.json update + + May raise FileNotFoundError or other storage/S3 exceptions. Be prepared. + """ + red_bucket = storages["red_bucket"] + return red_bucket.get_modified_time( + getattr(settings, "ERRATA_JSON_BLOB_NAME", DEFAULT_ERRATA_JSON_BLOB_NAME) + ) + + +def get_errata_data() -> list[ErrataJsonEntry]: + red_bucket = storages["red_bucket"] + with red_bucket.open( + getattr(settings, "ERRATA_JSON_BLOB_NAME", DEFAULT_ERRATA_JSON_BLOB_NAME), "r" + ) as f: + errata_data = json.load(f) + return errata_data + + +def errata_map_from_json(errata_data: list[ErrataJsonEntry]): + """Create a dict mapping RFC number to a list of applicable errata records""" + errata = defaultdict(list) + for item in errata_data: + doc_id = item["doc-id"] + if doc_id.upper().startswith("RFC"): + rfc_number = int(doc_id[3:]) + errata[rfc_number].append(item) + return dict(errata) + + +def update_errata_tags(errata_data: list[ErrataJsonEntry]): + tag_has_errata = DocTagName.objects.get(slug="errata") + tag_has_verified_errata = DocTagName.objects.get(slug="verified-errata") + system = Person.objects.get(name="(System)") + + errata_map = errata_map_from_json(errata_data) + nums_with_errata = [ + num + for num, errata in errata_map.items() + if any(er["errata_status_code"] != "Rejected" for er in errata) + ] + nums_with_verified_errata = [ + num + for num, errata in errata_map.items() + if any(er["errata_status_code"] == "Verified" for er in errata) + ] + + rfcs_gaining_errata_tag = Document.objects.filter( + type_id="rfc", rfc_number__in=nums_with_errata + ).exclude(tags=tag_has_errata) + + rfcs_gaining_verified_errata_tag = Document.objects.filter( + type_id="rfc", rfc_number__in=nums_with_verified_errata + ).exclude(tags=tag_has_verified_errata) + + rfcs_losing_errata_tag = Document.objects.filter( + type_id="rfc", tags=tag_has_errata + ).exclude(rfc_number__in=nums_with_errata) + + rfcs_losing_verified_errata_tag = Document.objects.filter( + type_id="rfc", tags=tag_has_verified_errata + ).exclude(rfc_number__in=nums_with_verified_errata) + + # map rfc_number to add/remove lists + changes: DefaultDict[Document, dict[str, list[DocTagName]]] = defaultdict( + lambda: {"add": [], "remove": []} + ) + for rfc in rfcs_gaining_errata_tag: + changes[rfc]["add"].append(tag_has_errata) + for rfc in rfcs_gaining_verified_errata_tag: + changes[rfc]["add"].append(tag_has_verified_errata) + for rfc in rfcs_losing_errata_tag: + changes[rfc]["remove"].append(tag_has_errata) + for rfc in rfcs_losing_verified_errata_tag: + changes[rfc]["remove"].append(tag_has_verified_errata) + + for rfc, changeset in changes.items(): + # Update in a transaction per RFC to keep tags and DocEvents consistent. + # With this in place, an interrupted task will be cleanly completed on the + # next run. + with transaction.atomic(): + change_descs = [] + for tag in changeset["add"]: + rfc.tags.add(tag) + change_descs.append(f"added {tag.slug} tag") + for tag in changeset["remove"]: + rfc.tags.remove(tag) + change_descs.append(f"removed {tag.slug} tag") + summary = "Update from RFC Editor: " + ", ".join(change_descs) + if rfc.rfc_number in errata_map and all( + er["errata_status_code"] == "Rejected" + for er in errata_map[rfc.rfc_number] + ): + summary += " (all errata rejected)" + DocEvent.objects.create( + doc=rfc, + rev=rfc.rev, # expect no rev + by=system, + type="sync_from_rfc_editor", + desc=summary, + ) + + +def update_errata_from_rfceditor(): + errata_data = get_errata_data() + update_errata_tags(errata_data) + + +## DirtyBits management for the errata tags + +ERRATA_SLUG = DirtyBits.Slugs.ERRATA + + +def update_errata_dirty_time() -> DirtyBits | None: + try: + last_update = get_errata_last_updated() + except Exception as err: + log(f"Error in get_errata_last_updated: {err}") + return None + else: + dirty_work, created = DirtyBits.objects.update_or_create( + slug=ERRATA_SLUG, defaults={"dirty_time": last_update} + ) + if created: + log(f"Created DirtyBits(slug='{ERRATA_SLUG}')") + return dirty_work + + +def mark_errata_as_processed(when: datetime.datetime): + n_updated = DirtyBits.objects.filter( + Q(processed_time__isnull=True) | Q(processed_time__lt=when), + slug=ERRATA_SLUG, + ).update(processed_time=when) + if n_updated > 0: + log(f"processed_time is now {when.isoformat()}") + else: + log("processed_time not updated, no matching record found") + + +def errata_are_dirty(): + """Does the rfc index need to be updated?""" + dirty_work = update_errata_dirty_time() # creates DirtyBits if needed + if dirty_work is None: + # A None indicates we could not check the timestamp of errata.json. In that + # case, we are not likely to be able to read the blob either, so don't try + # to process it. An error was already logged. + return False + display_processed_time = ( + dirty_work.processed_time.isoformat() + if dirty_work.processed_time is not None + else "never" + ) + log( + f"DirtyBits(slug='{ERRATA_SLUG}'): " + f"dirty_time={dirty_work.dirty_time.isoformat()} " + f"processed_time={display_processed_time}" + ) + return ( + dirty_work.processed_time is None + or dirty_work.dirty_time >= dirty_work.processed_time + ) diff --git a/ietf/sync/rfcindex.py b/ietf/sync/rfcindex.py index 0ea6fb939f..d1a0ed432f 100644 --- a/ietf/sync/rfcindex.py +++ b/ietf/sync/rfcindex.py @@ -1,4 +1,5 @@ # Copyright The IETF Trust 2026, All Rights Reserved +import datetime import json from collections import defaultdict from collections.abc import Container @@ -11,6 +12,7 @@ from django.conf import settings from django.core.files.base import ContentFile +from django.db.models import Q from lxml import etree from django.core.files.storage import storages @@ -22,6 +24,7 @@ from ietf.doc.models import Document from ietf.name.models import StdLevelName from ietf.utils.log import log +from ietf.utils.models import DirtyBits FORMATS_FOR_INDEX = ["txt", "html", "pdf", "xml", "ps"] SS_TXT_MARGIN = 3 @@ -150,7 +153,7 @@ def get_publication_std_levels() -> dict[int, StdLevelName]: def format_ordering(rfc_number): - if rfc_number < 8650: + if rfc_number < settings.FIRST_V3_RFC: ordering = ["txt", "ps", "pdf", "html", "xml"] else: ordering = ["html", "txt", "ps", "pdf", "xml"] @@ -739,3 +742,50 @@ def create_fyi_txt_index(): }, ) save_to_red_bucket("fyi-index.txt", index) + + +## DirtyBits management for the RFC index + +RFCINDEX_SLUG = DirtyBits.Slugs.RFCINDEX + + +def mark_rfcindex_as_dirty(): + _, created = DirtyBits.objects.update_or_create( + slug=RFCINDEX_SLUG, defaults={"dirty_time": timezone.now()} + ) + if created: + log(f"Created DirtyBits(slug='{RFCINDEX_SLUG}')") + + +def mark_rfcindex_as_processed(when: datetime.datetime): + n_updated = DirtyBits.objects.filter( + Q(processed_time__isnull=True) | Q(processed_time__lt=when), + slug=RFCINDEX_SLUG, + ).update(processed_time=when) + if n_updated > 0: + log(f"processed_time is now {when.isoformat()}") + else: + log("processed_time not updated, no matching record found") + + +def rfcindex_is_dirty(): + """Does the rfc index need to be updated?""" + dirty_work, created = DirtyBits.objects.get_or_create( + slug=RFCINDEX_SLUG, defaults={"dirty_time": timezone.now()} + ) + if created: + log(f"Created DirtyBits(slug='{RFCINDEX_SLUG}')") + display_processed_time = ( + dirty_work.processed_time.isoformat() + if dirty_work.processed_time is not None + else "never" + ) + log( + f"DirtyBits(slug='{RFCINDEX_SLUG}'): " + f"dirty_time={dirty_work.dirty_time.isoformat()} " + f"processed_time={display_processed_time}" + ) + return ( + dirty_work.processed_time is None + or dirty_work.dirty_time >= dirty_work.processed_time + ) diff --git a/ietf/sync/tasks.py b/ietf/sync/tasks.py index 4c84dc581e..34b2efeb5c 100644 --- a/ietf/sync/tasks.py +++ b/ietf/sync/tasks.py @@ -17,8 +17,20 @@ from ietf.doc.tasks import rebuild_reference_relations_task from ietf.sync import iana from ietf.sync import rfceditor +from ietf.sync.errata import ( + errata_are_dirty, + mark_errata_as_processed, + update_errata_from_rfceditor, +) from ietf.sync.rfceditor import MIN_QUEUE_RESULTS, parse_queue, update_drafts_from_queue -from ietf.sync.rfcindex import create_rfc_txt_index, create_rfc_xml_index +from ietf.sync.rfcindex import ( + create_bcp_txt_index, + create_fyi_txt_index, + create_rfc_txt_index, + create_rfc_xml_index, + create_std_txt_index, + rfcindex_is_dirty, mark_rfcindex_as_processed, mark_rfcindex_as_dirty, +) from ietf.sync.utils import build_from_file_content, load_rfcs_into_blobdb, rsync_helper from ietf.utils import log from ietf.utils.timezone import date_today @@ -27,13 +39,13 @@ @shared_task def rfc_editor_index_update_task(full_index=False): """Update metadata from the RFC index - + Default is to examine only changes in the past 365 days. Call with full_index=True to update the full RFC index. - + According to comments on the original script, a year's worth took about 20s on production as of August 2022 - + The original rfc-editor-index-update script had a long-disabled provision for running the rebuild_reference_relations scripts after the update. That has not been brought over at all because it should be implemented as its own task if it is needed. @@ -51,7 +63,7 @@ def rfc_editor_index_update_task(full_index=False): timeout=30, # seconds ) except requests.Timeout as exc: - log.log(f'GET request timed out retrieving RFC editor index: {exc}') + log.log(f"GET request timed out retrieving RFC editor index: {exc}") return # failed rfc_index_xml = response.text index_data = rfceditor.parse_index(io.StringIO(rfc_index_xml)) @@ -61,9 +73,9 @@ def rfc_editor_index_update_task(full_index=False): timeout=30, # seconds ) except requests.Timeout as exc: - log.log(f'GET request timed out retrieving RFC editor errata: {exc}') + log.log(f"GET request timed out retrieving RFC editor errata: {exc}") return # failed - errata_data = response.json() + errata_data = response.json() if len(index_data) < rfceditor.MIN_INDEX_RESULTS: log.log("Not enough index entries, only %s" % len(index_data)) return # failed @@ -96,15 +108,15 @@ def rfc_editor_queue_updates_task(): drafts, warnings = parse_queue(io.StringIO(response.text)) for w in warnings: log.log(f"Warning: {w}") - + if len(drafts) < MIN_QUEUE_RESULTS: log.log("Not enough results, only %s" % len(drafts)) return # failed - + changed, warnings = update_drafts_from_queue(drafts) for w in warnings: log.log(f"Warning: {w}") - + for c in changed: log.log(f"Updated {c}") @@ -120,9 +132,11 @@ def iana_changes_update_task(): MAX_INTERVAL_ACCEPTED_BY_IANA = datetime.timedelta(hours=23) start = ( - timezone.now() - - datetime.timedelta(hours=23) - + datetime.timedelta(seconds=CLOCK_SKEW_COMPENSATION,) + timezone.now() + - datetime.timedelta(hours=23) + + datetime.timedelta( + seconds=CLOCK_SKEW_COMPENSATION, + ) ) end = start + datetime.timedelta(hours=23) @@ -133,7 +147,9 @@ def iana_changes_update_task(): # requests if necessary text = iana.fetch_changes_json( - settings.IANA_SYNC_CHANGES_URL, t, min(end, t + MAX_INTERVAL_ACCEPTED_BY_IANA) + settings.IANA_SYNC_CHANGES_URL, + t, + min(end, t + MAX_INTERVAL_ACCEPTED_BY_IANA), ) log.log(f"Retrieved the JSON: {text}") @@ -159,9 +175,9 @@ def iana_protocols_update_task(): # "this needs to be the date where this tool is first deployed" in the original # iana-protocols-updates script)" rfc_must_published_later_than = datetime.datetime( - 2012, - 11, - 26, + 2012, + 11, + 26, tzinfo=datetime.UTC, ) @@ -171,17 +187,17 @@ def iana_protocols_update_task(): timeout=30, ) except requests.Timeout as exc: - log.log(f'GET request timed out retrieving IANA protocols page: {exc}') + log.log(f"GET request timed out retrieving IANA protocols page: {exc}") return rfc_numbers = iana.parse_protocol_page(response.text) def batched(l, n): """Split list l up in batches of max size n. - + For Python 3.12 or later, replace this with itertools.batched() """ - return (l[i:i + n] for i in range(0, len(l), n)) + return (l[i : i + n] for i in range(0, len(l), n)) for batch in batched(rfc_numbers, 100): updated = iana.update_rfc_log_from_protocol_page( @@ -192,6 +208,7 @@ def batched(l, n): for d in updated: log.log("Added history entry for %s" % d.display_name()) + @shared_task def fix_subseries_docevents_task(): """Repairs DocEvents related to bugs around removing docs from subseries @@ -233,6 +250,7 @@ def fix_subseries_docevents_task(): time=obsoleting_time ) + @shared_task def rsync_rfcs_from_rfceditor_task(rfc_numbers: list[int]): log.log(f"Rsyncing rfcs from rfc-editor: {rfc_numbers}") @@ -276,7 +294,51 @@ def load_rfcs_into_blobdb_task(start: int, end: int): @shared_task -def create_rfc_index_task(): - create_rfc_txt_index() - create_rfc_xml_index() +def update_errata_from_rfceditor_task(): + if errata_are_dirty(): + # new_processed_time is the *start* of processing so that any changes after + # this point will trigger another refresh + new_processed_time = timezone.now() + update_errata_from_rfceditor() + mark_errata_as_processed(new_processed_time) + mark_rfcindex_as_dirty() # ensure any changes are reflected in the indexes + +@shared_task +def refresh_rfc_index_task(): + if rfcindex_is_dirty(): + # new_processed_time is the *start* of processing so that any changes after + # this point will trigger another refresh + new_processed_time = timezone.now() + + try: + create_rfc_txt_index() + except Exception as e: + log.log(f"Error: failure in creating rfc-index.txt. {e}") + pass + + try: + create_rfc_xml_index() + except Exception as e: + log.log(f"Error: failure in creating rfc-index.xml. {e}") + pass + + try: + create_bcp_txt_index() + except Exception as e: + log.log(f"Error: failure in creating bcp-index.txt. {e}") + pass + + try: + create_std_txt_index() + except Exception as e: + log.log(f"Error: failure in creating std-index.txt. {e}") + pass + + try: + create_fyi_txt_index() + except Exception as e: + log.log(f"Error: failure in creating fyi-index.txt. {e}") + pass + + mark_rfcindex_as_processed(new_processed_time) diff --git a/ietf/sync/tests.py b/ietf/sync/tests.py index 21d6cb5cd5..e83b6a5e0a 100644 --- a/ietf/sync/tests.py +++ b/ietf/sync/tests.py @@ -1,5 +1,4 @@ -# Copyright The IETF Trust 2012-2020, All Rights Reserved -# -*- coding: utf-8 -*- +# Copyright The IETF Trust 2012-2026, All Rights Reserved import os @@ -13,6 +12,8 @@ from dataclasses import dataclass from django.conf import settings +from django.core.files.base import ContentFile +from django.core.files.storage import storages from django.urls import reverse as urlreverse from django.utils import timezone from django.test.utils import override_settings @@ -25,15 +26,34 @@ RfcFactory, DocumentAuthorFactory, DocEventFactory, - BcpFactory, + BcpFactory, WgRfcFactory, +) +from ietf.doc.models import ( + Document, + DocEvent, + DeletedEvent, + DocTagName, + RelatedDocument, + State, + StateDocEvent, ) -from ietf.doc.models import Document, DocEvent, DeletedEvent, DocTagName, RelatedDocument, State, StateDocEvent from ietf.doc.utils import add_state_change_event from ietf.group.factories import GroupFactory from ietf.person.factories import PersonFactory from ietf.person.models import Person from ietf.sync import iana, rfceditor, tasks +from ietf.sync.errata import ( + update_errata_from_rfceditor, + get_errata_last_updated, + get_errata_data, + errata_map_from_json, + update_errata_dirty_time, + mark_errata_as_processed, + update_errata_tags, +) +from ietf.sync.tasks import update_errata_from_rfceditor_task from ietf.utils.mail import outbox, empty_outbox +from ietf.utils.models import DirtyBits from ietf.utils.test_utils import login_testing_unauthorized from ietf.utils.test_utils import TestCase from ietf.utils.timezone import date_today, RPC_TZINFO @@ -882,6 +902,191 @@ def test_rfceditor_undo(self): self.assertTrue(StateDocEvent.objects.filter(desc="First", doc=draft)) +class ErrataTests(TestCase): + @override_settings(ERRATA_JSON_BLOB_NAME="myblob.json") + def test_get_errata_last_update(self): + red_bucket = storages["red_bucket"] # InMemoryStorage in test + red_bucket.save("myblob.json", ContentFile("file")) + self.assertEqual( + get_errata_last_updated(), red_bucket.get_modified_time("myblob.json") + ) + + @override_settings(ERRATA_JSON_BLOB_NAME="myblob.json") + def test_get_errata_data(self): + red_bucket = storages["red_bucket"] # InMemoryStorage in test + red_bucket.save("myblob.json", ContentFile('[{"value": 3}]')) + self.assertEqual( + get_errata_data(), + [{"value": 3}], + ) + + def test_errata_map_from_json(self): + input_data = [ + { + "doc-id": "not-an-rfc", + "errata_status_code": "Verified", + }, + { + "doc-id": "rfc01234", + "errata_status_code": "Reported", + }, + { + "doc-id": "RFC1001", + "errata_status_code": "Verified" + }, + { + "doc-id": "RfC1234", + "errata_status_code": "Verified", + }, + ] + expected_output = {1001: [input_data[2]], 1234: [input_data[1], input_data[3]]} + self.assertDictEqual(errata_map_from_json(input_data), expected_output) + + @mock.patch("ietf.sync.errata.update_errata_tags") + @mock.patch("ietf.sync.errata.get_errata_data") + def test_update_errata_from_rfceditor(self, mock_get_data, mock_update): + fake_data = object() + mock_get_data.return_value = fake_data + update_errata_from_rfceditor() + self.assertTrue(mock_get_data.called) + self.assertTrue(mock_update.called) + self.assertEqual(mock_update.call_args, mock.call(fake_data)) + + def test_update_errata_tags(self): + tag_has_errata = DocTagName.objects.get(slug="errata") + tag_has_verified_errata = DocTagName.objects.get(slug="verified-errata") + + rfcs = WgRfcFactory.create_batch(10) + rfcs[0].tags.set([tag_has_errata]) + rfcs[1].tags.set([tag_has_errata, tag_has_verified_errata]) + rfcs[2].tags.set([tag_has_errata]) + rfcs[3].tags.set([tag_has_errata, tag_has_verified_errata]) + rfcs[4].tags.set([tag_has_errata]) + rfcs[5].tags.set([tag_has_errata, tag_has_verified_errata]) + + # Only contains the fields we care about, not the full JSON + errata_data = [ + # rfcs[0] had errata and should keep it + {"doc-id": rfcs[0].name, "errata_status_code": "Held for Document Update"}, + {"doc-id": rfcs[0].name, "errata_status_code": "Rejected"}, + # rfcs[1] had errata+verified-errata and should keep both + {"doc-id": rfcs[1].name, "errata_status_code": "Verified"}, + # rfcs[2] had errata and should gain verified-errata + {"doc-id": rfcs[2].name, "errata_status_code": "Verified"}, + # rfcs[3] had errata+verified errata and should lose both + {"doc-id": rfcs[3].name, "errata_status_code": "Rejected"}, + # rfcs[4] had errata and should gain verified-errata + {"doc-id": rfcs[4].name, "errata_status_code": "Verified"}, + {"doc-id": rfcs[4].name, "errata_status_code": "Reported"}, + # rfcs[5] had errata+verified-errata and should lose verified-errata + {"doc-id": rfcs[5].name, "errata_status_code": "Reported"}, + # rfcs[6] had none and should gain errata + {"doc-id": rfcs[6].name, "errata_status_code": "Reported"}, + # rfcs[7] had none and should gain errata+verified-errata + {"doc-id": rfcs[7].name, "errata_status_code": "Verified"}, + # rfcs[8] had none and it should stay that way + {"doc-id": rfcs[8].name, "errata_status_code": "Rejected"}, + # rfcs[9] had none and it should stay that way (no entry at all) + ] + update_errata_tags(errata_data) + + self.assertCountEqual(rfcs[0].tags.all(), [tag_has_errata]) + self.assertIsNone(rfcs[0].docevent_set.first()) # no change + + self.assertCountEqual( + rfcs[1].tags.all(), [tag_has_errata, tag_has_verified_errata] + ) + self.assertIsNone(rfcs[1].docevent_set.first()) # no change + + self.assertCountEqual( + rfcs[2].tags.all(), [tag_has_errata, tag_has_verified_errata] + ) + self.assertEqual(rfcs[2].docevent_set.count(), 1) + self.assertIn(": added verified-errata tag", rfcs[2].docevent_set.first().desc) + + self.assertCountEqual(rfcs[3].tags.all(), []) + self.assertEqual(rfcs[3].docevent_set.count(), 1) + self.assertIn( + ": removed errata tag, removed verified-errata tag (all errata rejected)", + rfcs[3].docevent_set.first().desc, + ) + + self.assertCountEqual( + rfcs[4].tags.all(), [tag_has_errata, tag_has_verified_errata] + ) + self.assertEqual(rfcs[4].docevent_set.count(), 1) + self.assertIn(": added verified-errata tag", rfcs[4].docevent_set.first().desc) + + self.assertCountEqual(rfcs[5].tags.all(), [tag_has_errata]) + self.assertEqual(rfcs[5].docevent_set.count(), 1) + self.assertIn( + ": removed verified-errata tag", rfcs[5].docevent_set.first().desc + ) + + self.assertCountEqual(rfcs[6].tags.all(), [tag_has_errata]) + self.assertEqual(rfcs[6].docevent_set.count(), 1) + self.assertIn(": added errata tag", rfcs[6].docevent_set.first().desc) + + self.assertCountEqual( + rfcs[7].tags.all(), [tag_has_errata, tag_has_verified_errata] + ) + self.assertEqual(rfcs[7].docevent_set.count(), 1) + self.assertIn( + ": added errata tag, added verified-errata tag", + rfcs[7].docevent_set.first().desc, + ) + + self.assertCountEqual(rfcs[8].tags.all(), []) + self.assertIsNone(rfcs[8].docevent_set.first()) # no change + + self.assertCountEqual(rfcs[9].tags.all(), []) + self.assertIsNone(rfcs[9].docevent_set.first()) # no change + + @override_settings(ERRATA_JSON_BLOB_NAME="myblob.json") + @mock.patch("ietf.sync.errata.get_errata_last_updated") + def test_update_errata_dirty_time(self, mock_last_updated): + ERRATA_SLUG = DirtyBits.Slugs.ERRATA + + # No time available + mock_last_updated.side_effect = FileNotFoundError + self.assertIsNone(DirtyBits.objects.filter(slug=ERRATA_SLUG).first()) + self.assertIsNone(update_errata_dirty_time()) # no blob yet + self.assertIsNone(DirtyBits.objects.filter(slug=ERRATA_SLUG).first()) + + # Now set a time + first_timestamp = timezone.now() - datetime.timedelta(hours=3) + mock_last_updated.return_value = first_timestamp + mock_last_updated.side_effect = None + result = update_errata_dirty_time() + self.assertTrue(isinstance(result, DirtyBits)) + result.refresh_from_db() + self.assertEqual(result.slug, ERRATA_SLUG) + self.assertEqual(result.processed_time, None) + self.assertEqual(result.dirty_time, first_timestamp) + + # Update the time + second_timestamp = timezone.now() + mock_last_updated.return_value = second_timestamp + second_result = update_errata_dirty_time() + self.assertEqual(result.pk, second_result.pk) # should be the same record + result.refresh_from_db() + self.assertEqual(result.slug, ERRATA_SLUG) + self.assertEqual(result.processed_time, None) + self.assertEqual(result.dirty_time, second_timestamp) + + def test_mark_errata_as_processed(self): + ERRATA_SLUG = DirtyBits.Slugs.ERRATA + first_timestamp = timezone.now() + mark_errata_as_processed(first_timestamp) # no DirtyBits is not an error + self.assertIsNone(DirtyBits.objects.filter(slug=ERRATA_SLUG).first()) + dbits = DirtyBits.objects.create(slug=ERRATA_SLUG, dirty_time=first_timestamp) + second_timestamp = timezone.now() + mark_errata_as_processed(second_timestamp) + dbits.refresh_from_db() + self.assertEqual(dbits.dirty_time, first_timestamp) + self.assertEqual(dbits.processed_time, second_timestamp) + + class TaskTests(TestCase): @override_settings( RFC_EDITOR_INDEX_URL="https://rfc-editor.example.com/index/", @@ -1215,3 +1420,28 @@ def test_load_rfcs_into_blobdb_task( self.assertEqual(mock_kwargs, {}) + @mock.patch("ietf.sync.tasks.update_errata_from_rfceditor") + @mock.patch("ietf.sync.tasks.mark_rfcindex_as_dirty") + @mock.patch("ietf.sync.tasks.mark_errata_as_processed") + @mock.patch("ietf.sync.tasks.errata_are_dirty") + def test_update_errata_from_rfceditor_task( + self, + mock_errata_are_dirty, + mock_mark_errata_processed, + mock_mark_rfcindex_dirty, + mock_update, + ): + mock_errata_are_dirty.return_value = False + update_errata_from_rfceditor_task() + self.assertTrue(mock_errata_are_dirty.called) + self.assertFalse(mock_mark_errata_processed.called) + self.assertFalse(mock_mark_rfcindex_dirty.called) + self.assertFalse(mock_update.called) + + mock_errata_are_dirty.reset_mock() + mock_errata_are_dirty.return_value = True + update_errata_from_rfceditor_task() + self.assertTrue(mock_errata_are_dirty.called) + self.assertTrue(mock_mark_errata_processed.called) + self.assertTrue(mock_mark_rfcindex_dirty.called) + self.assertTrue(mock_update.called) diff --git a/ietf/templates/doc/document_rfc.html b/ietf/templates/doc/document_rfc.html index 7612ef8910..d4b309a964 100644 --- a/ietf/templates/doc/document_rfc.html +++ b/ietf/templates/doc/document_rfc.html @@ -124,6 +124,15 @@ Referenced by + {% if doc.rfc_number >= settings.FIRST_V3_RFC %} + + + + Get editor source + + {% endif %} RFC {{ rfc.rfc_number }} — Not-prepped XML +

+ The not-prepped XML + is the RFC XML v3 source for an RFC at the moment in the publication process + just before the prep tool was used to expand default + values, generate section numbers, resolve cross-references, and embed + boilerplate. +

+ It is useful for authors who want to begin a new draft based on + the RFC's text, such as when creating a bis-draft, and for tools that process + author-facing RFC XML. +

+

+ + + Download not-prepped XML for RFC {{ rfc.rfc_number }} + +

+{% endblock %} diff --git a/ietf/utils/admin.py b/ietf/utils/admin.py index e6324ad7cd..cb8841cdc6 100644 --- a/ietf/utils/admin.py +++ b/ietf/utils/admin.py @@ -1,71 +1,30 @@ -# Copyright The IETF Trust 2011-2020, All Rights Reserved -# -*- coding: utf-8 -*- +# Copyright The IETF Trust 2011-2026, All Rights Reserved from django.contrib import admin -from django.utils.encoding import force_str - -def name(obj): - if hasattr(obj, 'abbrev'): - return obj.abbrev() - elif hasattr(obj, 'name'): - if callable(obj.name): - name = obj.name() - else: - name = force_str(obj.name) - if name: - return name - return str(obj) - -def admin_link(field, label=None, ordering="", display=name, suffix=""): - if not label: - label = field.capitalize().replace("_", " ").strip() - if ordering == "": - ordering = field - def _link(self): - obj = self - for attr in field.split("__"): - obj = getattr(obj, attr) - if callable(obj): - obj = obj() - if hasattr(obj, "all"): - objects = obj.all() - elif callable(obj): - objects = obj() - if not hasattr(objects, "__iter__"): - objects = [ objects ] - elif hasattr(obj, "__iter__"): - objects = obj - else: - objects = [ obj ] - chunks = [] - for obj in objects: - app = obj._meta.app_label - model = obj.__class__.__name__.lower() - id = obj.pk - chunks += [ '%(display)s' % - {'app':app, "model": model, "id":id, "display": display(obj), "suffix":suffix, } ] - return ", ".join(chunks) - _link.allow_tags = True - _link.short_description = label - _link.admin_order_field = ordering - return _link +from .models import DumpInfo, DirtyBits class SaferStackedInline(admin.StackedInline): """StackedInline without delete by default""" + can_delete = False # no delete button show_change_link = True # show a link to the resource (where it can be deleted) class SaferTabularInline(admin.TabularInline): """TabularInline without delete by default""" + can_delete = False # no delete button show_change_link = True # show a link to the resource (where it can be deleted) -from .models import DumpInfo +@admin.register(DumpInfo) class DumpInfoAdmin(admin.ModelAdmin): - list_display = ['date', 'host', 'tz'] - list_filter = ['date'] -admin.site.register(DumpInfo, DumpInfoAdmin) + list_display = ["date", "host", "tz"] + list_filter = ["date"] + + +@admin.register(DirtyBits) +class DirtyBitsAdmin(admin.ModelAdmin): + list_display = ["slug", "dirty_time", "processed_time"] diff --git a/ietf/utils/migrations/0003_dirtybits.py b/ietf/utils/migrations/0003_dirtybits.py new file mode 100644 index 0000000000..11f6ed09f6 --- /dev/null +++ b/ietf/utils/migrations/0003_dirtybits.py @@ -0,0 +1,37 @@ +# Copyright The IETF Trust 2026, All Rights Reserved + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("utils", "0002_delete_versioninfo"), + ] + + operations = [ + migrations.CreateModel( + name="DirtyBits", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "slug", + models.CharField( + choices=[("rfcindex", "RFC Index")], max_length=40, unique=True + ), + ), + ("dirty_time", models.DateTimeField(blank=True, null=True)), + ("processed_time", models.DateTimeField(blank=True, null=True)), + ], + options={ + "verbose_name_plural": "dirty bits", + }, + ), + ] diff --git a/ietf/utils/migrations/0004_alter_dirtybits_slug.py b/ietf/utils/migrations/0004_alter_dirtybits_slug.py new file mode 100644 index 0000000000..e17ea6cadd --- /dev/null +++ b/ietf/utils/migrations/0004_alter_dirtybits_slug.py @@ -0,0 +1,21 @@ +# Copyright The IETF Trust 2026, All Rights Reserved + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("utils", "0003_dirtybits"), + ] + + operations = [ + migrations.AlterField( + model_name="dirtybits", + name="slug", + field=models.CharField( + choices=[("rfcindex", "RFC Index"), ("errata", "Errata Tags")], + max_length=40, + unique=True, + ), + ), + ] diff --git a/ietf/utils/models.py b/ietf/utils/models.py index 21af5766e9..64f7f253f2 100644 --- a/ietf/utils/models.py +++ b/ietf/utils/models.py @@ -1,14 +1,36 @@ -# Copyright The IETF Trust 2015-2020, All Rights Reserved +# Copyright The IETF Trust 2015-2026, All Rights Reserved import itertools from django.db import models + +class DirtyBits(models.Model): + """A weak semaphore mechanism for coordination with celery beat tasks + + Web workers will set the "dirty_time" value for a given dirtybit slug. + Celery workers will do work if "processed_time" < "dirty_time" and update + "processed_time". + """ + + class Slugs(models.TextChoices): + RFCINDEX = "rfcindex", "RFC Index" + ERRATA = "errata", "Errata Tags" + + # next line can become `...choices=Slugs)` when we get to Django 5.x + slug = models.CharField(max_length=40, blank=False, choices=Slugs.choices, unique=True) + dirty_time = models.DateTimeField(null=True, blank=True) + processed_time = models.DateTimeField(null=True, blank=True) + + class Meta: + verbose_name_plural = "dirty bits" + + class DumpInfo(models.Model): date = models.DateTimeField() host = models.CharField(max_length=128) tz = models.CharField(max_length=32, default='UTC') - + class ForeignKey(models.ForeignKey): "A local ForeignKey proxy which provides the on_delete value required under Django 2.0." def __init__(self, to, on_delete=models.CASCADE, **kwargs): diff --git a/ietf/utils/resources.py b/ietf/utils/resources.py index 1252cfef14..63206eb33a 100644 --- a/ietf/utils/resources.py +++ b/ietf/utils/resources.py @@ -1,6 +1,4 @@ -# Copyright The IETF Trust 2014-2019, All Rights Reserved -# -*- coding: utf-8 -*- -# Autogenerated by the mkresources management command 2014-11-13 05:39 +# Copyright The IETF Trust 2014-2026, All Rights Reserved from ietf.api import ModelResource @@ -12,7 +10,7 @@ from django.contrib.contenttypes.models import ContentType from ietf import api -from ietf.utils.models import DumpInfo +from ietf.utils.models import DirtyBits, DumpInfo class UserResource(ModelResource): @@ -43,3 +41,9 @@ class Meta: "host": ALL, } api.utils.register(DumpInfoResource()) + + +class DirtyBitsResource(ModelResource): + class Meta: + queryset = DirtyBits.objects.none() +api.utils.register(DirtyBitsResource()) diff --git a/ietf/utils/searchindex.py b/ietf/utils/searchindex.py index e4427b88b5..87951abb60 100644 --- a/ietf/utils/searchindex.py +++ b/ietf/utils/searchindex.py @@ -2,12 +2,15 @@ """Search indexing utilities""" import re +from itertools import batched from math import floor +from typing import Iterable import httpx # just for exceptions import typesense import typesense.exceptions from django.conf import settings +from typesense.types.document import DocumentSchema from ietf.doc.models import Document, StoredObject from ietf.doc.storage_utils import retrieve_str @@ -42,6 +45,24 @@ def enabled(): return _settings["TYPESENSE_API_URL"] != "" +def get_typesense_client() -> typesense.Client: + _settings = get_settings() + client = typesense.Client( + { + "api_key": _settings["TYPESENSE_API_KEY"], + "nodes": [_settings["TYPESENSE_API_URL"]], + } + ) + return client + + +def get_collection_name() -> str: + _settings = get_settings() + collection_name = _settings["TYPESENSE_COLLECTION_NAME"] + assert isinstance(collection_name, str) + return collection_name + + def _sanitize_text(content): """Sanitize content or abstract text for search""" # REs (with approximate names) @@ -62,9 +83,10 @@ def _sanitize_text(content): return content.strip() -def update_or_create_rfc_entry(rfc: Document): +def typesense_doc_from_rfc(rfc: Document) -> DocumentSchema: assert rfc.type_id == "rfc" assert rfc.rfc_number is not None + assert rfc.pages is not None keywords: list[str] = rfc.keywords # help type checking @@ -75,8 +97,8 @@ def update_or_create_rfc_entry(rfc: Document): f"Indexing as {subseries[0].name}" ) subseries = subseries[0] if len(subseries) > 0 else None - obsoleted_by = rfc.relations_that("obs") - updated_by = rfc.relations_that("updates") + obsoleted_by = rfc.related_that("obs") + updated_by = rfc.related_that("updates") stored_txt = ( StoredObject.objects.exclude_deleted() @@ -91,13 +113,14 @@ def update_or_create_rfc_entry(rfc: Document): except Exception as err: log(f"Unable to retrieve {stored_txt} from storage: {err}") - ts_id = f"doc-{rfc.pk}" ts_document = { + "id": f"doc-{rfc.pk}", "rfcNumber": rfc.rfc_number, "rfc": str(rfc.rfc_number), "filename": rfc.name, "title": rfc.title, "abstract": _sanitize_text(rfc.abstract), + "pages": rfc.pages, "keywords": keywords, "type": "rfc", "state": [state.name for state in rfc.states.all()], @@ -143,13 +166,207 @@ def update_or_create_rfc_entry(rfc: Document): ts_document["adName"] = rfc.ad.name if content != "": ts_document["content"] = _sanitize_text(content) - _settings = get_settings() - client = typesense.Client( + return ts_document + + +def update_or_create_rfc_entry(rfc: Document): + """Update/create index entries for one RFC""" + ts_document = typesense_doc_from_rfc(rfc) + client = get_typesense_client() + client.collections[get_collection_name()].documents.upsert(ts_document) + + +def update_or_create_rfc_entries( + rfcs: Iterable[Document], batchsize: int | None = None +): + """Update/create index entries for RFCs in bulk + + If batchsize is set, computes index data in batches of batchsize and adds to the + index. Will make a total of (len(rfcs) // batchsize) + 1 API calls. + + N.b. that typesense has a server-side batch size that defaults to 40, which should + "almost never be changed from the default." This does not change that. Further, + the python client library's import_ method has a batch_size parameter that does + client-side batching. We don't use that, either. + """ + success_count = 0 + fail_count = 0 + client = get_typesense_client() + batches = [rfcs] if batchsize is None else batched(rfcs, batchsize) + for batch in batches: + tdoc_batch = [typesense_doc_from_rfc(rfc) for rfc in batch] + results = client.collections[get_collection_name()].documents.import_( + tdoc_batch, {"action": "upsert"} + ) + for tdoc, result in zip(tdoc_batch, results): + if result["success"]: + success_count += 1 + else: + fail_count += 1 + log(f"Failed to index RFC {tdoc['rfcNumber']}: {result['error']}") + log(f"Added {success_count} RFCs to the index, failed to add {fail_count}") + + +DOCS_SCHEMA = { + "enable_nested_fields": True, + "default_sorting_field": "ranking", + "fields": [ + # RFC number in integer form, for sorting asc/desc in search results + # Omit field for drafts { - "api_key": _settings["TYPESENSE_API_KEY"], - "nodes": [_settings["TYPESENSE_API_URL"]], - } - ) - client.collections[_settings["TYPESENSE_COLLECTION_NAME"]].documents.upsert( - {"id": ts_id} | ts_document - ) + "name": "rfcNumber", + "type": "int32", + "facet": False, + "optional": True, + "sort": True, + }, + # RFC number in string form, for direct matching with ranking + # Omit field for drafts + {"name": "rfc", "type": "string", "facet": False, "optional": True}, + # For drafts that correspond to an RFC, insert the RFC number + # Omit field for rfcs or if not relevant + {"name": "ref", "type": "string", "facet": False, "optional": True}, + # Filename of the document (without the extension, e.g. "rfc1234" + # or "draft-ietf-abc-def-02") + {"name": "filename", "type": "string", "facet": False, "infix": True}, + # Title of the draft / rfc + {"name": "title", "type": "string", "facet": False}, + # Abstract of the draft / rfc + {"name": "abstract", "type": "string", "facet": False}, + # Number of pages + {"name": "pages", "type": "int32", "facet": False}, + # A list of search keywords if relevant, set to empty array otherwise + {"name": "keywords", "type": "string[]", "facet": True}, + # Type of the document + # Accepted values: "draft" or "rfc" + {"name": "type", "type": "string", "facet": True}, + # State(s) of the document (e.g. "Published", "Adopted by a WG", etc.) + # Use the full name, not the slug + {"name": "state", "type": "string[]", "facet": True, "optional": True}, + # Status (Standard Level Name) + # Object with properties "slug" and "name" + # e.g.: { slug: "std", "name": "Internet Standard" } + {"name": "status", "type": "object", "facet": True, "optional": True}, + # The subseries it is part of. (e.g. "BCP") + # Omit otherwise. + { + "name": "subseries.acronym", + "type": "string", + "facet": True, + "optional": True, + }, + # The subseries number it is part of. (e.g. 123) + # Omit otherwise. + { + "name": "subseries.number", + "type": "int32", + "facet": True, + "sort": True, + "optional": True, + }, + # The total of RFCs in the subseries + # Omit if not part of a subseries + { + "name": "subseries.total", + "type": "int32", + "facet": False, + "sort": False, + "optional": True, + }, + # Date of the document, in unix epoch seconds (can be negative for < 1970) + {"name": "date", "type": "int64", "facet": False}, + # Expiration date of the document, in unix epoch seconds (can be negative + # for < 1970). Omit field for RFCs + {"name": "expires", "type": "int64", "facet": False, "optional": True}, + # Publication date of the RFC, in unix epoch seconds (can be negative + # for < 1970). Omit field for drafts + { + "name": "publicationDate", + "type": "int64", + "facet": True, + "optional": True, + }, + # Working Group + # Object with properties "acronym", "name" and "full" + # e.g.: + # { + # "acronym": "ntp", + # "name": "Network Time Protocols", + # "full": "ntp - Network Time Protocols", + # } + {"name": "group", "type": "object", "facet": True, "optional": True}, + # Area + # Object with properties "acronym", "name" and "full" + # e.g.: + # { + # "acronym": "mpls", + # "name": "Multiprotocol Label Switching", + # "full": "mpls - Multiprotocol Label Switching", + # } + {"name": "area", "type": "object", "facet": True, "optional": True}, + # Stream + # Object with properties "slug" and "name" + # e.g.: { slug: "ietf", "name": "IETF" } + {"name": "stream", "type": "object", "facet": True, "optional": True}, + # List of authors + # Array of objects with properties "name" and "affiliation" + # e.g.: + # [ + # {"name": "John Doe", "affiliation": "ACME Inc."}, + # {"name": "Ada Lovelace", "affiliation": "Babbage Corps."}, + # ] + {"name": "authors", "type": "object[]", "facet": True, "optional": True}, + # Area Director Name (e.g. "Leonardo DaVinci") + {"name": "adName", "type": "string", "facet": True, "optional": True}, + # Whether the document should be hidden by default in search results or not. + {"name": "flags.hiddenDefault", "type": "bool", "facet": True}, + # Whether the document is obsoleted by another document or not. + {"name": "flags.obsoleted", "type": "bool", "facet": True}, + # Whether the document is updated by another document or not. + {"name": "flags.updated", "type": "bool", "facet": True}, + # List of documents that obsolete this document. + # Array of strings. Use RFC number for RFCs. (e.g. ["123", "456"]) + # Omit if none. Must be provided if "flags.obsoleted" is set to True. + { + "name": "obsoletedBy", + "type": "string[]", + "facet": False, + "optional": True, + }, + # List of documents that update this document. + # Array of strings. Use RFC number for RFCs. (e.g. ["123", "456"]) + # Omit if none. Must be provided if "flags.updated" is set to True. + {"name": "updatedBy", "type": "string[]", "facet": False, "optional": True}, + # Sanitized content of the document. + # Make sure to remove newlines, double whitespaces, symbols and tags. + { + "name": "content", + "type": "string", + "facet": False, + "optional": True, + "store": False, + }, + # Ranking value to use when no explicit sorting is used during search + # Set to the RFC number for RFCs and the revision number for drafts + # This ensures newer RFCs get listed first in the default search results + # (without a query) + {"name": "ranking", "type": "int32", "facet": False}, + ], +} + + +def create_collection(): + collection_name = get_collection_name() + log(f"Creating '{collection_name}' collection") + client = get_typesense_client() + client.collections.create({"name": get_collection_name()} | DOCS_SCHEMA) + + +def delete_collection(): + collection_name = get_collection_name() + log(f"Deleting '{collection_name}' collection") + client = get_typesense_client() + try: + client.collections[collection_name].delete() + except typesense.exceptions.ObjectNotFound: + pass diff --git a/ietf/utils/tests.py b/ietf/utils/tests.py index 3288309095..99c33f34b3 100644 --- a/ietf/utils/tests.py +++ b/ietf/utils/tests.py @@ -60,7 +60,6 @@ set_url_coverage, ) from ietf.utils.test_utils import TestCase, unicontent -from ietf.utils.text import parse_unicode from ietf.utils.timezone import timezone_not_near_midnight from ietf.utils.xmldraft import XMLDraft, InvalidMetadataError, capture_xml2rfc_output @@ -864,24 +863,6 @@ def test_assertion(self): assertion('False') settings.SERVER_MODE = 'test' -class TestRFC2047Strings(TestCase): - def test_parse_unicode(self): - names = ( - ('=?utf-8?b?4Yuz4YuK4Ym1IOGJoOGJgOGIiA==?=', 'ዳዊት በቀለ'), - ('=?utf-8?b?5Li9IOmDnA==?=', '丽 郜'), - ('=?utf-8?b?4KSV4KSu4KWN4KSs4KWL4KScIOCkoeCkvuCksA==?=', 'कम्बोज डार'), - ('=?utf-8?b?zpfPgc6szrrOu861zrnOsSDOm865z4zOvc+Ezrc=?=', 'Ηράκλεια Λιόντη'), - ('=?utf-8?b?15nXqdeo15DXnCDXqNeV15bXoNek15zXkw==?=', 'ישראל רוזנפלד'), - ('=?utf-8?b?5Li95Y2OIOeahw==?=', '丽华 皇'), - ('=?utf-8?b?77ul77qu766V77qzIO+tlu+7ru+vvu+6ju+7pw==?=', 'ﻥﺮﮕﺳ ﭖﻮﯾﺎﻧ'), - ('=?utf-8?b?77uh77uu77qz77uu76++IO+6su+7tO+7p++6jSDvurDvu6Pvuo7vu6jvr74=?=', 'ﻡﻮﺳﻮﯾ ﺲﻴﻧﺍ ﺰﻣﺎﻨﯾ'), - ('=?utf-8?b?ScOxaWdvIFNhbsOnIEliw6HDsWV6IGRlIGxhIFBlw7Fh?=', 'Iñigo Sanç Ibáñez de la Peña'), - ('Mart van Oostendorp', 'Mart van Oostendorp'), - ('', ''), - ) - for encoded_str, unicode in names: - self.assertEqual(unicode, parse_unicode(encoded_str)) - class TestAndroidSiteManifest(TestCase): def test_manifest(self): r = self.client.get(urlreverse('site.webmanifest')) diff --git a/ietf/utils/tests_searchindex.py b/ietf/utils/tests_searchindex.py index 8740716c85..e9fbf52020 100644 --- a/ietf/utils/tests_searchindex.py +++ b/ietf/utils/tests_searchindex.py @@ -1,6 +1,7 @@ # Copyright The IETF Trust 2026, All Rights Reserved from unittest import mock +import typesense.exceptions from django.conf import settings from django.test.utils import override_settings @@ -51,42 +52,30 @@ def test_sanitize_text(self): "TYPESENSE_COLLECTION_NAME": "frogs", } ) - @mock.patch("ietf.utils.searchindex.typesense.Client") - def test_update_or_create_rfc_entry(self, mock_ts_client_constructor): + def test_typesense_doc_from_rfc(self): not_rfc = WgDraftFactory() assert isinstance(not_rfc, Document) with self.assertRaises(AssertionError): - searchindex.update_or_create_rfc_entry(not_rfc) - self.assertFalse(mock_ts_client_constructor.called) + searchindex.typesense_doc_from_rfc(not_rfc) invalid_rfc = WgRfcFactory(name="rfc1000000", rfc_number=None) assert isinstance(invalid_rfc, Document) with self.assertRaises(AssertionError): - searchindex.update_or_create_rfc_entry(invalid_rfc) - self.assertFalse(mock_ts_client_constructor.called) + searchindex.typesense_doc_from_rfc(invalid_rfc) rfc = PublishedRfcDocEventFactory().doc assert isinstance(rfc, Document) - searchindex.update_or_create_rfc_entry(rfc) - self.assertTrue(mock_ts_client_constructor.called) - # walk the tree down to the method we expected to be called... - mock_upsert = mock_ts_client_constructor.return_value.collections[ - "frogs" - ].documents.upsert # matches value in override_settings above - self.assertTrue(mock_upsert.called) - upserted_dict = mock_upsert.call_args[0][0] + result = searchindex.typesense_doc_from_rfc(rfc) # Check a few values, not exhaustive - self.assertEqual(upserted_dict["id"], f"doc-{rfc.pk}") - self.assertEqual(upserted_dict["rfcNumber"], rfc.rfc_number) - self.assertEqual( - upserted_dict["abstract"], searchindex._sanitize_text(rfc.abstract) - ) - self.assertNotIn("adName", upserted_dict) - self.assertNotIn("content", upserted_dict) # no blob - self.assertNotIn("subseries", upserted_dict) + self.assertEqual(result["id"], f"doc-{rfc.pk}") + self.assertEqual(result["rfcNumber"], rfc.rfc_number) + self.assertEqual(result["abstract"], searchindex._sanitize_text(rfc.abstract)) + self.assertEqual(result["pages"], rfc.pages) + self.assertNotIn("adName", result) + self.assertNotIn("content", result) # no blob + self.assertNotIn("subseries", result) # repeat, this time with contents, an AD, and subseries docs - mock_upsert.reset_mock() store_str( kind="rfc", name=f"txt/{rfc.name}.txt", @@ -99,17 +88,15 @@ def test_update_or_create_rfc_entry(self, mock_ts_client_constructor): # (the typesense schema does not support this for real at the moment) BcpFactory(contains=[rfc], name="bcp1234") StdFactory(contains=[rfc], name="std1234") - searchindex.update_or_create_rfc_entry(rfc) - self.assertTrue(mock_upsert.called) - upserted_dict = mock_upsert.call_args[0][0] + result = searchindex.typesense_doc_from_rfc(rfc) # Check a few values, not exhaustive self.assertEqual( - upserted_dict["content"], + result["content"], searchindex._sanitize_text("The contents of this RFC"), ) - self.assertEqual(upserted_dict["adName"], "Alfred D. Rector") - self.assertIn("subseries", upserted_dict) - ss_dict = upserted_dict["subseries"] + self.assertEqual(result["adName"], "Alfred D. Rector") + self.assertIn("subseries", result) + ss_dict = result["subseries"] # We should get one of the two subseries docs, but neither is more correct # than the other... self.assertTrue( @@ -119,10 +106,108 @@ def test_update_or_create_rfc_entry(self, mock_ts_client_constructor): ) ) - # Finally, delete the contents blob and make sure things don't blow up - mock_upsert.reset_mock() + # Finally, delete the contents blob and make sure things don't blow up Blob.objects.get(bucket="rfc", name=f"txt/{rfc.name}.txt").delete() + result = searchindex.typesense_doc_from_rfc(rfc) + self.assertNotIn("content", result) + + @override_settings( + SEARCHINDEX_CONFIG={ + "TYPESENSE_API_URL": "http://ts.example.com", + "TYPESENSE_API_KEY": "test-api-key", + "TYPESENSE_COLLECTION_NAME": "frogs", + } + ) + @mock.patch("ietf.utils.searchindex.typesense_doc_from_rfc") + @mock.patch("ietf.utils.searchindex.typesense.Client") + def test_update_or_create_rfc_entry( + self, mock_ts_client_constructor, mock_tdoc_from_rfc + ): + fake_tdoc = object() + mock_tdoc_from_rfc.return_value = fake_tdoc + rfc = WgRfcFactory() + assert isinstance(rfc, Document) searchindex.update_or_create_rfc_entry(rfc) + self.assertTrue(mock_ts_client_constructor.called) + # walk the tree down to the method we expected to be called... + mock_upsert = mock_ts_client_constructor.return_value.collections[ + "frogs" # matches value in override_settings above + ].documents.upsert self.assertTrue(mock_upsert.called) - upserted_dict = mock_upsert.call_args[0][0] - self.assertNotIn("content", upserted_dict) + self.assertEqual(mock_upsert.call_args, mock.call(fake_tdoc)) + + @override_settings( + SEARCHINDEX_CONFIG={ + "TYPESENSE_API_URL": "http://ts.example.com", + "TYPESENSE_API_KEY": "test-api-key", + "TYPESENSE_COLLECTION_NAME": "frogs", + } + ) + @mock.patch("ietf.utils.searchindex.typesense_doc_from_rfc") + @mock.patch("ietf.utils.searchindex.typesense.Client") + def test_update_or_create_rfc_entries( + self, mock_ts_client_constructor, mock_tdoc_from_rfc + ): + fake_tdoc = object() + mock_tdoc_from_rfc.return_value = fake_tdoc + rfc = WgRfcFactory() + assert isinstance(rfc, Document) + searchindex.update_or_create_rfc_entries([rfc] * 50) # list of docs... + self.assertEqual(mock_ts_client_constructor.call_count, 1) + # walk the tree down to the method we expected to be called... + mock_import_ = mock_ts_client_constructor.return_value.collections[ + "frogs" # matches value in override_settings above + ].documents.import_ + self.assertEqual(mock_import_.call_count, 1) + self.assertEqual( + mock_import_.call_args, mock.call([fake_tdoc] * 50, {"action": "upsert"}) + ) + + mock_import_.reset_mock() + searchindex.update_or_create_rfc_entries([rfc] * 50, batchsize=20) + self.assertEqual(mock_ts_client_constructor.call_count, 2) # one more + # walk the tree down to the method we expected to be called... + mock_import_ = mock_ts_client_constructor.return_value.collections[ + "frogs" # matches value in override_settings above + ].documents.import_ + self.assertEqual(mock_import_.call_count, 3) + self.assertEqual( + mock_import_.call_args_list, + [ + mock.call([fake_tdoc] * 20, {"action": "upsert"}), + mock.call([fake_tdoc] * 20, {"action": "upsert"}), + mock.call([fake_tdoc] * 10, {"action": "upsert"}), + ], + ) + + @override_settings( + SEARCHINDEX_CONFIG={ + "TYPESENSE_API_URL": "http://ts.example.com", + "TYPESENSE_API_KEY": "test-api-key", + "TYPESENSE_COLLECTION_NAME": "frogs", + } + ) + @mock.patch("ietf.utils.searchindex.typesense.Client") + def test_create_collection(self, mock_ts_client_constructor): + searchindex.create_collection() + self.assertEqual(mock_ts_client_constructor.call_count, 1) + mock_collections = mock_ts_client_constructor.return_value.collections + self.assertTrue(mock_collections.create.called) + self.assertEqual(mock_collections.create.call_args[0][0]["name"], "frogs") + + @override_settings( + SEARCHINDEX_CONFIG={ + "TYPESENSE_API_URL": "http://ts.example.com", + "TYPESENSE_API_KEY": "test-api-key", + "TYPESENSE_COLLECTION_NAME": "frogs", + } + ) + @mock.patch("ietf.utils.searchindex.typesense.Client") + def test_delete_collection(self, mock_ts_client_constructor): + searchindex.delete_collection() + self.assertEqual(mock_ts_client_constructor.call_count, 1) + mock_collections = mock_ts_client_constructor.return_value.collections + self.assertTrue(mock_collections["frogs"].delete.called) + + mock_collections["frogs"].side_effect = typesense.exceptions.ObjectNotFound + searchindex.delete_collection() # should ignore the exception diff --git a/ietf/utils/tests_text.py b/ietf/utils/tests_text.py new file mode 100644 index 0000000000..51aa2eff13 --- /dev/null +++ b/ietf/utils/tests_text.py @@ -0,0 +1,71 @@ +# Copyright The IETF Trust 2021-2026, All Rights Reserved +from ietf.utils.test_utils import TestCase +from ietf.utils.text import parse_unicode, decode_document_content + + +class TestDecoders(TestCase): + def test_parse_unicode(self): + names = ( + ("=?utf-8?b?4Yuz4YuK4Ym1IOGJoOGJgOGIiA==?=", "ዳዊት በቀለ"), + ("=?utf-8?b?5Li9IOmDnA==?=", "丽 郜"), + ("=?utf-8?b?4KSV4KSu4KWN4KSs4KWL4KScIOCkoeCkvuCksA==?=", "कम्बोज डार"), + ("=?utf-8?b?zpfPgc6szrrOu861zrnOsSDOm865z4zOvc+Ezrc=?=", "Ηράκλεια Λιόντη"), + ("=?utf-8?b?15nXqdeo15DXnCDXqNeV15bXoNek15zXkw==?=", "ישראל רוזנפלד"), + ("=?utf-8?b?5Li95Y2OIOeahw==?=", "丽华 皇"), + ("=?utf-8?b?77ul77qu766V77qzIO+tlu+7ru+vvu+6ju+7pw==?=", "ﻥﺮﮕﺳ ﭖﻮﯾﺎﻧ"), + ( + "=?utf-8?b?77uh77uu77qz77uu76++IO+6su+7tO+7p++6jSDvurDvu6Pvuo7vu6jvr74=?=", + "ﻡﻮﺳﻮﯾ ﺲﻴﻧﺍ ﺰﻣﺎﻨﯾ", + ), + ( + "=?utf-8?b?ScOxaWdvIFNhbsOnIEliw6HDsWV6IGRlIGxhIFBlw7Fh?=", + "Iñigo Sanç Ibáñez de la Peña", + ), + ("Mart van Oostendorp", "Mart van Oostendorp"), + ("", ""), + ) + for encoded_str, unicode in names: + self.assertEqual(unicode, parse_unicode(encoded_str)) + + def test_decode_document_content(self): + utf8_bytes = "𒀭𒊩𒌆𒄈𒋢".encode("utf-8") # ends with 4-byte character + latin1_bytes = "àéîøü".encode("latin-1") + other_bytes = "àéîøü".encode("macintosh") # different from its latin-1 encoding + assert other_bytes.decode("macintosh") != other_bytes.decode("latin-1"),\ + "test broken: other_bytes must decode differently as latin-1" + + # simplest case + self.assertEqual( + decode_document_content(utf8_bytes), + utf8_bytes.decode(), + ) + # losing 1-4 bytes from the end leave the last character incomplete; the + # decoder should decode all but that last character + self.assertEqual( + decode_document_content(utf8_bytes[:-1]), + utf8_bytes.decode()[:-1], + ) + self.assertEqual( + decode_document_content(utf8_bytes[:-2]), + utf8_bytes.decode()[:-1], + ) + self.assertEqual( + decode_document_content(utf8_bytes[:-3]), + utf8_bytes.decode()[:-1], + ) + self.assertEqual( + decode_document_content(utf8_bytes[:-4]), + utf8_bytes.decode()[:-1], + ) + + # latin-1 is also simple + self.assertEqual( + decode_document_content(latin1_bytes), + latin1_bytes.decode("latin-1"), + ) + + # other character sets are just treated as latin1 (bug? feature? you decide) + self.assertEqual( + decode_document_content(other_bytes), + other_bytes.decode("latin-1"), + ) diff --git a/ietf/utils/text.py b/ietf/utils/text.py index 590ec3fd30..2763056e1a 100644 --- a/ietf/utils/text.py +++ b/ietf/utils/text.py @@ -263,3 +263,21 @@ def parse_unicode(text): else: text = decoded_string return text + + +def decode_document_content(content: bytes) -> str: + """Decode document contents as utf-8 or latin1 + + Method was developed in DocumentInfo.text() where it gave acceptable results + for existing documents / RFCs. + """ + try: + return content.decode("utf-8") + except UnicodeDecodeError: + pass + for back in range(1, 4): + try: + return content[:-back].decode("utf-8") + except UnicodeDecodeError: + pass + return content.decode("latin-1") # everything is legal in latin-1 diff --git a/ietf/utils/xmldraft.py b/ietf/utils/xmldraft.py index f555a0a16a..325b8499a9 100644 --- a/ietf/utils/xmldraft.py +++ b/ietf/utils/xmldraft.py @@ -102,6 +102,17 @@ def _document_name(self, ref): number = int(maybe_number) return f"{label}{number}" + target = ref.get("target") + if isinstance(target, str): + target = target.lower() + if target.startswith("https://datatracker.ietf.org/doc/"): + # len("https://datatracker.ietf.org/doc/")==33 + m = re.match(r"^(draft-[a-z0-9-]*[a-z0-9])([/-]\d{2})?/?$",target[33:]) + if m: + name = m.group(1) + return name + + # if we couldn't find a match so far, try the seriesInfo series_query = " or ".join(f"@name='{x.upper()}'" for x in series) for info in ref.xpath( diff --git a/k8s/auth.yaml b/k8s/auth.yaml index 392e306b54..2bdb064447 100644 --- a/k8s/auth.yaml +++ b/k8s/auth.yaml @@ -15,16 +15,6 @@ spec: labels: app: auth spec: - affinity: - podAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - datatracker - topologyKey: "kubernetes.io/hostname" securityContext: runAsNonRoot: true containers: diff --git a/k8s/beat.yaml b/k8s/beat.yaml index cc98beecf6..b4291c7e31 100644 --- a/k8s/beat.yaml +++ b/k8s/beat.yaml @@ -17,16 +17,6 @@ spec: labels: app: beat spec: - affinity: - podAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - datatracker - topologyKey: "kubernetes.io/hostname" securityContext: runAsNonRoot: true containers: @@ -69,4 +59,4 @@ spec: name: files-cfgmap dnsPolicy: ClusterFirst restartPolicy: Always - terminationGracePeriodSeconds: 600 + terminationGracePeriodSeconds: 10 diff --git a/k8s/celery.yaml b/k8s/celery.yaml index a2799f2a6d..2f4c0fd439 100644 --- a/k8s/celery.yaml +++ b/k8s/celery.yaml @@ -17,16 +17,6 @@ spec: labels: app: celery spec: - affinity: - podAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - datatracker - topologyKey: "kubernetes.io/hostname" securityContext: runAsNonRoot: true containers: diff --git a/k8s/memcached.yaml b/k8s/memcached.yaml index 8f73f3d0d5..5a4c9f0aed 100644 --- a/k8s/memcached.yaml +++ b/k8s/memcached.yaml @@ -13,16 +13,6 @@ spec: labels: app: memcached spec: - affinity: - podAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - datatracker - topologyKey: "kubernetes.io/hostname" securityContext: runAsNonRoot: true containers: diff --git a/k8s/rabbitmq.yaml b/k8s/rabbitmq.yaml index 780a399239..346b54c93e 100644 --- a/k8s/rabbitmq.yaml +++ b/k8s/rabbitmq.yaml @@ -13,16 +13,6 @@ spec: labels: app: rabbitmq spec: - affinity: - podAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - datatracker - topologyKey: "kubernetes.io/hostname" securityContext: runAsNonRoot: true containers: diff --git a/k8s/replicator.yaml b/k8s/replicator.yaml index 9c462bd96b..a28d9e8a16 100644 --- a/k8s/replicator.yaml +++ b/k8s/replicator.yaml @@ -17,16 +17,6 @@ spec: labels: app: replicator spec: - affinity: - podAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - datatracker - topologyKey: "kubernetes.io/hostname" securityContext: runAsNonRoot: true containers: diff --git a/k8s/settings_local.py b/k8s/settings_local.py index 3a7e524f5a..251f11234f 100644 --- a/k8s/settings_local.py +++ b/k8s/settings_local.py @@ -511,3 +511,8 @@ def _multiline_to_list(s): "DATATRACKER_ERRATA_METADATA_NOTIFICATION_URL must be set if " "DATATRACKER_ERRATA_METADATA_NOTIFICATION_API_KEY is provided" ) + +# name (with path) of errata.json in the red bucket +ERRATA_JSON_BLOB_NAME = os.environ.get( + "DATATRACKER_ERRATA_JSON_BLOB_NAME", "other/errata.json" +)