diff --git a/ietf/api/__init__.py b/ietf/api/__init__.py
index d4562f97dd..00733717c4 100644
--- a/ietf/api/__init__.py
+++ b/ietf/api/__init__.py
@@ -177,8 +177,15 @@ def dehydrate(self, bundle, for_list=True):
         return dehydrated
 
 
+
+# XML 1.0 forbids all control characters except tab (#x9), LF (#xA), and CR (#xD).
+# Replace each with its Unicode control picture (U+2400 + codepoint) so the
+# substitution is lossless and the result is valid XML.
+_XML_INVALID_CTRL_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
+
+
 class Serializer(tastypie.serializers.Serializer):
-    OPTION_ESCAPE_NULLS = "datatracker-escape-nulls"
+    OPTION_ESCAPE_XML_INVALID = "datatracker-escape-xml-invalid"
 
     def format_datetime(self, data):
         return data.astimezone(datetime.UTC).replace(tzinfo=None).isoformat(timespec="seconds") + "Z"
@@ -186,18 +193,17 @@ def format_datetime(self, data):
     def to_simple(self, data, options):
         options = options or {}
         simple_data = super().to_simple(data, options)
-        if (
-            options.get(self.OPTION_ESCAPE_NULLS, False)
-            and isinstance(simple_data, str)
-        ):
-            # replace nulls with unicode "symbol for null character", \u2400
-            simple_data = simple_data.replace("\x00", "\u2400")
+        if options.get(self.OPTION_ESCAPE_XML_INVALID, False) and isinstance(simple_data, str):
+            # Replace control chars invalid in XML 1.0 with their Unicode
+            # control pictures (U+2400-U+241F) so lxml won't reject the string.
+            simple_data = _XML_INVALID_CTRL_RE.sub(
+                lambda m: chr(ord(m.group()) + 0x2400), simple_data
+            )
         return simple_data
 
     def to_etree(self, data, options=None, name=None, depth=0):
-        # lxml does not escape nulls on its own, so ask to_simple() to do it.
-        # This is mostly (only?) an issue when generating errors responses for
-        # fuzzers.
+        # lxml rejects control characters that are invalid in XML 1.0.
+        # Ask to_simple() to escape them before they reach lxml.
         options = options or {}
-        options[self.OPTION_ESCAPE_NULLS] = True
+        options[self.OPTION_ESCAPE_XML_INVALID] = True
         return super().to_etree(data, options, name, depth)
diff --git a/ietf/api/tests.py b/ietf/api/tests.py
index 2a44791a5c..887969cec1 100644
--- a/ietf/api/tests.py
+++ b/ietf/api/tests.py
@@ -1542,20 +1542,29 @@ def test_all_model_resources_exist(self):
                     self.assertIn(model._meta.model_name, list(app_resources.keys()),
                         "There doesn't seem to be any API resource for model %s.models.%s"%(app.__name__,model.__name__,))
 
-    def test_serializer_to_etree_handles_nulls(self):
-        """Serializer to_etree() should handle a null character"""
+    def test_serializer_to_etree_handles_xml_invalid_control_chars(self):
+        """Serializer to_etree() must not raise ValueError for any XML-invalid control character."""
         serializer = Serializer()
+        # Ordinary strings and strings with valid whitespace must pass through unchanged.
         try:
-            serializer.to_etree("string with no nulls in it")
+            serializer.to_etree("string with no special chars")
+            serializer.to_etree("tab\there lf\nhere cr\rhere")
         except ValueError:
             self.fail("serializer.to_etree raised ValueError on an ordinary string")
-        try:
-            serializer.to_etree("string with a \x00 in it")
-        except ValueError:
-            self.fail(
-                "serializer.to_etree raised ValueError on a string "
-                "containing a null character"
-            )
+        # Every control character that XML 1.0 forbids must be escaped rather than
+        # causing a ValueError. This is the class of characters that triggered the
+        # production exception (lxml.etree._utf8 rejects them all).
+        invalid_chars = [chr(c) for c in list(range(0x00, 0x09)) + [0x0b, 0x0c] + list(range(0x0e, 0x20))]
+        for ch in invalid_chars:
+            # Interpolate the raw character, not its repr(): "{ch!r}" would embed
+            # escaped text such as '\x00' and never hand lxml a control character.
+            try:
+                serializer.to_etree(f"string with {ch} in it")
+            except ValueError:
+                self.fail(
+                    "serializer.to_etree raised ValueError on a string "
+                    f"containing control character U+{ord(ch):04X}"
+                )
 
 
 class RfcdiffSupportTests(TestCase):