Skip to content

Commit adbf8ac

Browse files
committed
Serve PDF renderings of htmlized (pdfized) documents at /doc/pdf, replacing tools.ietf.org/pdf/. Commit ready for merge.
- Legacy-Id: 19753
1 parent 024cfc3 commit adbf8ac

7 files changed

Lines changed: 122 additions & 2 deletions

File tree

ietf/doc/factories.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,12 @@ def states(obj, create, extracted, **kwargs):
147147
else:
148148
obj.set_state(State.objects.get(type_id='draft',slug='rfc'))
149149

150+
@factory.post_generation
151+
def reset_canonical_name(obj, create, extracted, **kwargs):
    """Remove the `_canonical_name` attribute from the generated object, if set.

    The `create`, `extracted` and keyword arguments are accepted but not used.
    Always returns None.
    """
    if not hasattr(obj, '_canonical_name'):
        return None
    delattr(obj, '_canonical_name')
    return None
155+
150156
class WgDraftFactory(BaseDocumentFactory):
151157

152158
type_id = 'draft'
@@ -186,6 +192,11 @@ def states(obj, create, extracted, **kwargs):
186192
obj.set_state(State.objects.get(type_id='draft',slug='rfc'))
187193
obj.set_state(State.objects.get(type_id='draft-iesg', slug='pub'))
188194

195+
@factory.post_generation
196+
def reset_canonical_name(obj, create, extracted, **kwargs):
    """Remove the `_canonical_name` attribute from the generated object, if set.

    The `create`, `extracted` and keyword arguments are accepted but not used.
    Always returns None.
    """
    if not hasattr(obj, '_canonical_name'):
        return None
    delattr(obj, '_canonical_name')
    return None
189200

190201
class RgDraftFactory(BaseDocumentFactory):
191202

@@ -230,6 +241,12 @@ def states(obj, create, extracted, **kwargs):
230241
obj.set_state(State.objects.get(type_id='draft-stream-irtf', slug='pub'))
231242
obj.set_state(State.objects.get(type_id='draft-iesg',slug='idexists'))
232243

244+
@factory.post_generation
245+
def reset_canonical_name(obj, create, extracted, **kwargs):
    """Remove the `_canonical_name` attribute from the generated object, if set.

    The `create`, `extracted` and keyword arguments are accepted but not used.
    Always returns None.
    """
    if not hasattr(obj, '_canonical_name'):
        return None
    delattr(obj, '_canonical_name')
    return None
249+
233250

234251
class CharterFactory(BaseDocumentFactory):
235252

ietf/doc/models.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import time
1111

1212
from typing import Optional, TYPE_CHECKING
13+
from weasyprint import HTML as wpHTML
1314

1415
from django.db import models
1516
from django.core import checks
@@ -565,6 +566,22 @@ def htmlized(self):
565566
cache.set(cache_key, html, settings.HTMLIZER_CACHE_TIME)
566567
return html
567568

569+
def pdfized(self):
    """Return this document rendered as PDF, using the 'pdfized' cache.

    The cache key is the document's base name up to its first '.'. On a
    cache miss the document text is marked up with rfc2html (links prefixed
    with settings.PDFIZER_URL_PREFIX) and rendered by WeasyPrint with a
    slightly reduced font size. A non-empty result is stored in the cache
    for settings.PDFIZER_CACHE_TIME.

    Returns the PDF, or None when there is no text to render.
    """
    cache = caches['pdfized']
    cache_key = self.get_base_name().split('.')[0]
    try:
        pdf = cache.get(cache_key)
    except EOFError:
        # An EOFError raised by the cache backend is treated as a miss.
        pdf = None
    if pdf:
        return pdf

    # Only fetch the document text on a cache miss; a cache hit does not
    # need it.
    text = self.text()
    if text is None:
        # NOTE(review): assumes text() signals unavailable content with
        # None -- confirm against its definition. Callers already treat a
        # falsy return as "no PDF".
        return None

    html = rfc2html.markup(text, path=settings.PDFIZER_URL_PREFIX)
    stylesheet = io.BytesIO(b'html { font-size: 94%;}')
    pdf = wpHTML(string=html).write_pdf(stylesheets=[stylesheet])
    if pdf:
        cache.set(cache_key, pdf, settings.PDFIZER_CACHE_TIME)
    return pdf
584+
568585
def references(self):
569586
return self.relations_that_doc(('refnorm','refinfo','refunk','refold'))
570587

ietf/doc/tests.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2733,4 +2733,38 @@ def test_non_draft(self):
27332733
charter = CharterFactory()
27342734
self.should_404(dict(name=charter.name))
27352735

2736+
class PdfizedTests(TestCase):
    """Exercise the document_pdfized view through the test client."""

    def __init__(self, *args, **kwargs):
        self.view = "ietf.doc.views_doc.document_pdfized"
        # Zero-argument super(): super(self.__class__, self) would recurse
        # forever if this class were ever subclassed.
        super().__init__(*args, **kwargs)

    def should_succeed(self, argdict):
        """Assert that the view, reversed with argdict, returns a PDF response."""
        url = urlreverse(self.view, kwargs=argdict)
        r = self.client.get(url)
        self.assertEqual(r.status_code, 200)
        self.assertEqual(r.get('Content-Type'), 'application/pdf;charset=utf-8')

    def should_404(self, argdict):
        """Assert that the view, reversed with argdict, returns a 404."""
        url = urlreverse(self.view, kwargs=argdict)
        r = self.client.get(url)
        self.assertEqual(r.status_code, 404)

    def test_pdfized(self):
        """An RFC and its draft revisions 00 and 01 are served; revision 02 is not."""
        rfc = WgRfcFactory(create_revisions=range(0, 2))

        # Write text files for the RFC and for each draft revision so the
        # view finds something to render.
        rfc_dir = Path(settings.RFC_PATH)
        (rfc_dir / f'{rfc.canonical_name()}.txt').write_text('text content')
        draft_dir = Path(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR)
        for r in range(0, 2):
            (draft_dir / f'{rfc.name}-{r:02d}.txt').write_text('text content')

        self.should_succeed(dict(name=rfc.canonical_name()))
        self.should_succeed(dict(name=rfc.name))
        for r in range(0, 2):
            self.should_succeed(dict(name=rfc.name, rev=f'{r:02d}'))
            # Any lowercase extension is accepted by the URL pattern.
            for ext in ('pdf', 'txt', 'html', 'anythingatall'):
                self.should_succeed(dict(name=rfc.name, rev=f'{r:02d}', ext=ext))
        # No file was written for revision 02.
        self.should_404(dict(name=rfc.name, rev='02'))

ietf/doc/urls.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
url(r'^html/%(name)s(?:-%(rev)s)?(\.txt|\.html)?/?$' % settings.URL_REGEXPS, views_doc.document_html),
7373

7474
url(r'^id/%(name)s(?:-%(rev)s)?(?:\.(?P<ext>(txt|html|xml)))?/?$' % settings.URL_REGEXPS, views_doc.document_raw_id),
75+
url(r'^pdf/%(name)s(?:-%(rev)s)?(?:\.(?P<ext>[a-z]+))?/?$' % settings.URL_REGEXPS, views_doc.document_pdfized),
7576

7677
# End of block that should be an idealized docs.ietf.org service instead
7778

ietf/doc/views_doc.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -769,15 +769,18 @@ def document_html(request, name, rev=None):
769769
return redirect('ietf.doc.views_doc.document_html', name=found.matched_name)
770770

771771
doc = found.documents.get()
772-
if not os.path.exists(doc.get_file_name()):
773-
raise Http404("File not found: %s" % doc.get_file_name())
772+
774773

775774
if found.matched_rev or found.matched_name.startswith('rfc'):
776775
rev = found.matched_rev
777776
else:
778777
rev = doc.rev
779778
if rev:
780779
doc = doc.history_set.filter(rev=rev).first() or doc.fake_history_obj(rev)
780+
781+
if not os.path.exists(doc.get_file_name()):
782+
raise Http404("File not found: %s" % doc.get_file_name())
783+
781784
if doc.type_id in ['draft',]:
782785
doc.supermeta = build_doc_supermeta_block(doc)
783786
doc.meta = build_doc_meta_block(doc, settings.HTMLIZER_URL_PREFIX)
@@ -803,6 +806,36 @@ def document_html(request, name, rev=None):
803806

804807
return render(request, "doc/document_html.html", {"doc":doc, "doccolor":doccolor })
805808

809+
def document_pdfized(request, name, rev=None, ext=None):
    """Serve the PDF rendering of the single document matching name and rev.

    Raises Http404 when no document or more than one document matches, when
    the document's file does not exist, or when no PDF is produced. A
    request for an RFC under a name other than the matched name is
    redirected to this view under the matched name. The `ext` argument is
    accepted but not used.
    """
    found = fuzzy_find_documents(name, rev)
    match_count = found.documents.count()
    if match_count == 0:
        raise Http404("Document not found: %s" % name)
    if match_count > 1:
        raise Http404("Multiple documents matched: %s" % name)

    is_rfc = found.matched_name.startswith('rfc')
    if is_rfc and name != found.matched_name:
        return redirect('ietf.doc.views_doc.document_pdfized', name=found.matched_name)

    doc = found.documents.get()

    # Use the matched revision when there is one (or for RFCs); otherwise
    # fall back to the document's current revision.
    wanted_rev = found.matched_rev if (found.matched_rev or is_rfc) else doc.rev
    if wanted_rev:
        history = doc.history_set.filter(rev=wanted_rev).first()
        doc = history or doc.fake_history_obj(wanted_rev)

    if not os.path.exists(doc.get_file_name()):
        raise Http404("File not found: %s" % doc.get_file_name())

    pdf = doc.pdfized()
    if not pdf:
        raise Http404
    return HttpResponse(pdf, content_type='application/pdf;charset=utf-8')
838+
806839
def check_doc_email_aliases():
807840
pattern = re.compile(r'^expand-(.*?)(\..*?)?@.*? +(.*)$')
808841
good_count = 0

ietf/settings.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -743,6 +743,13 @@ def skip_unreadable_post(record):
743743
'MAX_ENTRIES': 100000, # 100,000
744744
},
745745
},
746+
'pdfized': {
747+
'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache',
748+
'LOCATION': '/a/cache/datatracker/pdfized',
749+
'OPTIONS': {
750+
'MAX_ENTRIES': 100000, # 100,000
751+
},
752+
},
746753
'slowpages': {
747754
'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache',
748755
'LOCATION': '/a/cache/datatracker/slowpages',
@@ -755,6 +762,8 @@ def skip_unreadable_post(record):
755762
HTMLIZER_VERSION = 1
756763
HTMLIZER_URL_PREFIX = "/doc/html"
757764
HTMLIZER_CACHE_TIME = 60*60*24*14 # 14 days
765+
PDFIZER_CACHE_TIME = HTMLIZER_CACHE_TIME
766+
PDFIZER_URL_PREFIX = IDTRACKER_BASE_URL+"/doc/pdf"
758767

759768
# Email settings
760769
IPR_EMAIL_FROM = 'ietf-ipr@ietf.org'
@@ -1267,6 +1276,14 @@ def skip_unreadable_post(record):
12671276
'MAX_ENTRIES': 1000,
12681277
},
12691278
},
1279+
'pdfized': {
1280+
'BACKEND': 'django.core.cache.backends.dummy.DummyCache',
1281+
#'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache',
1282+
'LOCATION': '/var/cache/datatracker/pdfized',
1283+
'OPTIONS': {
1284+
'MAX_ENTRIES': 1000,
1285+
},
1286+
},
12701287
'slowpages': {
12711288
'BACKEND': 'django.core.cache.backends.dummy.DummyCache',
12721289
#'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache',

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ tqdm>=3.7.0
7070
#Trac>=1.0.10,<1.2
7171
Unidecode>=0.4.18,<1.2.0
7272
#wsgiref>=0.1.2
73+
weasyprint>=53.4
7374
xml2rfc>=2.35.0
7475
xym>=0.4.4,!=0.4.7,<1.0
7576
#zxcvbn-python>=4.4.14 # Not needed until we do back-end password entropy validation

0 commit comments

Comments
 (0)