Improve robustness of pdfization. Tune the test crawler. Commit ready for merge.

rjsparks · rjsparks · commit b1585124d6ea · 2022-01-06T20:17:55.000Z
- Legacy-Id: 19813
diff --git a/bin/test-crawl b/bin/test-crawl
@@ -232,6 +232,8 @@ def skip_url(url):
             # Skip most html conversions, not worth the time
             "^/doc/html/draft-[0-9ac-z]",
             "^/doc/html/draft-b[0-9b-z]",
+            "^/doc/pdf/draft-[0-9ac-z]",
+            "^/doc/pdf/draft-b[0-9b-z]",
             "^/doc/html/charter-.*",
             "^/doc/html/status-.*",
             "^/doc/html/rfc.*",
diff --git a/ietf/doc/models.py b/ietf/doc/models.py
@@ -577,7 +577,11 @@ def pdfized(self):
             pdf = None
         if not pdf:
             html = rfc2html.markup(text, path=settings.PDFIZER_URL_PREFIX)
-            pdf = wpHTML(string=html).write_pdf(stylesheets=[io.BytesIO(b'html { font-size: 94%;}')])
+            try:
+                pdf = wpHTML(string=html.replace('\xad','')).write_pdf(stylesheets=[io.BytesIO(b'html { font-size: 94%;}')])
+            except AssertionError:
+                log.log(f'weasyprint failed with an assert on {self.name}')
+                pdf = None
             if pdf:
                 cache.set(cache_key, pdf, settings.PDFIZER_CACHE_TIME)
         return pdf
diff --git a/ietf/doc/utils.py b/ietf/doc/utils.py
@@ -1051,8 +1051,9 @@ def build_file_urls(doc):
             label = "plain text" if t == "txt" else t
             file_urls.append((label, base + doc.name + "-" + doc.rev + "." + t))
 
-        file_urls.append(("htmlized", urlreverse('ietf.doc.views_doc.document_html', kwargs=dict(name=doc.name, rev=doc.rev))))
-        file_urls.append(("pdfized", urlreverse('ietf.doc.views_doc.document_pdfized', kwargs=dict(name=doc.name, rev=doc.rev))))
+        if doc.text():
+            file_urls.append(("htmlized", urlreverse('ietf.doc.views_doc.document_html', kwargs=dict(name=doc.name, rev=doc.rev))))
+            file_urls.append(("pdfized", urlreverse('ietf.doc.views_doc.document_pdfized', kwargs=dict(name=doc.name, rev=doc.rev))))
         file_urls.append(("bibtex", urlreverse('ietf.doc.views_doc.document_bibtex',kwargs=dict(name=doc.name,rev=doc.rev))))
 
     return file_urls, found_types