Started refactoring how text is read from document files (drafts, charters, etc.), in order to standardise on a single way of doing this and to have it return unicode rather than undecoded bytes. This is the first of two steps; its purpose is to gauge the possible issues and to report on discrepancies.

levkowetz · levkowetz · commit 967ece7e7d19 · 2017-12-08T21:51:11.000Z
- Legacy-Id: 14406
diff --git a/ietf/doc/mails.py b/ietf/doc/mails.py
@@ -8,13 +8,16 @@
 from django.conf import settings
 from django.urls import reverse as urlreverse
 
+import debug                            # pyflakes:ignore
+
 from ietf.utils.mail import send_mail, send_mail_text
 from ietf.ipr.utils import iprs_from_docs, related_docs
 from ietf.doc.models import WriteupDocEvent, LastCallDocEvent, DocAlias, ConsensusDocEvent
 from ietf.doc.utils import needed_ballot_positions, get_document_content
 from ietf.group.models import Role
 from ietf.doc.models import Document
 from ietf.mailtrigger.utils import gather_address_lists
+from ietf.utils import log
 
 def email_state_changed(request, doc, text, mailtrigger_id=None):
     (to,cc) = gather_address_lists(mailtrigger_id or 'doc_state_edited',doc=doc)
@@ -515,7 +518,13 @@ def email_charter_internal_review(request, charter):
                         os.path.join(settings.CHARTER_PATH,filename),
                         split=False,
                         markup=False,
-                   )
+                   ).decode('utf-8')
+    utext = charter.text_or_error()     # pyflakes:ignore
+    if charter_text and charter_text != utext and not 'Error; cannot read' in charter_text:
+        debug.show('charter_text[:64]')
+        debug.show('utext[:64]')
+        log.assertion('charter_text == utext')
+
     send_mail(request, addrs.to, settings.DEFAULT_FROM_EMAIL,
               'Internal %s Review: %s (%s)'%(charter.group.type.name,charter.group.name,charter.group.acronym),
               'doc/mail/charter_internal_review.txt',
diff --git a/ietf/doc/models.py b/ietf/doc/models.py
@@ -449,6 +449,9 @@ def text(self):
         #
         return text
 
+    def text_or_error(self):
+        return self.text() or "Error; cannot read (%s)"%self.get_file_name()
+
     def htmlized(self):
         name = self.get_base_name()
         text = self.text()
diff --git a/ietf/doc/templatetags/ietf_filters.py b/ietf/doc/templatetags/ietf_filters.py
@@ -18,7 +18,7 @@
 from ietf.doc.models import ConsensusDocEvent
 from ietf.doc.utils import get_document_content
 from ietf.utils.text import wordwrap, fill, wrap_text_if_unwrapped
-
+from ietf.utils import log
 
 register = template.Library()
 
@@ -509,7 +509,13 @@ def document_content(doc):
     if doc is None:
         return None
     path = os.path.join(doc.get_file_path(),doc.filename_with_rev())
-    return get_document_content(doc.name,path,markup=False)
+    content = get_document_content(doc.name,path,markup=False)
+    utext = doc.text_or_error()         # pyflakes:ignore
+    if content and content != utext and not 'Error; cannot read' in content:
+        debug.show('content[:64]')
+        debug.show('utext[:64]')
+        log.assertion('content == utext')
+    return content
 
 @register.filter
 def format_timedelta(timedelta):
diff --git a/ietf/doc/utils.py b/ietf/doc/utils.py
@@ -22,7 +22,7 @@
 from ietf.name.models import DocReminderTypeName, DocRelationshipName
 from ietf.group.models import Role
 from ietf.ietfauth.utils import has_role
-from ietf.utils import draft, markup_txt
+from ietf.utils import draft
 from ietf.utils.mail import send_mail
 from ietf.mailtrigger.utils import gather_address_lists
 
@@ -299,17 +299,19 @@ def get_unicode_document_content(key, filename, codec='utf-8', errors='ignore'):
     return raw_content
 
 def get_document_content(key, filename, split=True, markup=True):
+    #log.unreachable("2017-12-05")
     try:
         with open(filename, 'rb') as f:
             raw_content = f.read()
     except IOError:
         error = "Error; cannot read ("+key+")"
         return error
 
-    if markup:
-        return markup_txt.markup(raw_content, split)
-    else:
-        return raw_content
+#     if markup:
+#         return markup_txt.markup(raw_content, split)
+#     else:
+#         return raw_content
+    return raw_content
 
 def tags_suffix(tags):
     return (u"::" + u"::".join(t.name for t in tags)) if tags else u""
diff --git a/ietf/doc/views_conflict_review.py b/ietf/doc/views_conflict_review.py
@@ -254,7 +254,12 @@ def edit_ad(request, name):
 def default_approval_text(review):
 
     filename = "%s-%s.txt" % (review.canonical_name(), review.rev)
-    current_text = get_document_content(filename, os.path.join(settings.CONFLICT_REVIEW_PATH, filename), split=False, markup=False)
+    current_text = get_document_content(filename, os.path.join(settings.CONFLICT_REVIEW_PATH, filename), split=False, markup=False).decode('utf-8')
+    utext = review.text_or_error()      # pyflakes:ignore
+    if current_text and current_text != utext and not 'Error; cannot read' in current_text:
+        debug.show('current_text[:64]')
+        debug.show('utext[:64]')
+        log.assertion('current_text == utext')
 
     conflictdoc = review.relateddocument_set.get(relationship__slug='conflrev').target.document
     if conflictdoc.stream_id=='ise':
diff --git a/ietf/doc/views_doc.py b/ietf/doc/views_doc.py
@@ -66,6 +66,8 @@
 from ietf.review.models import ReviewRequest
 from ietf.review.utils import can_request_review_of_doc, review_requests_to_list_for_docs
 from ietf.review.utils import no_review_from_teams_on_doc
+from ietf.utils import markup_txt, log
+from ietf.utils.text import maybe_split
 
 
 def render_document_top(request, doc, tab, name):
@@ -186,7 +188,13 @@ def document_main(request, name, rev=None):
             filename = name + ".txt"
 
             content = get_document_content(filename, os.path.join(settings.RFC_PATH, filename),
-                                           split_content, markup=True)
+                                           split_content, markup=True).decode('utf-8')
+            utext = doc.text_or_error() # pyflakes:ignore
+            if content and content != utext and not 'Error; cannot read' in content:
+                debug.show('content[:64]')
+                debug.show('utext[:64]')
+                log.assertion('content == utext')
+            content = markup_txt.markup(maybe_split(content, split=split_content))
 
             # file types
             base_path = os.path.join(settings.RFC_PATH, name + ".")
@@ -216,7 +224,13 @@ def document_main(request, name, rev=None):
             filename = "%s-%s.txt" % (draft_name, doc.rev)
 
             content = get_document_content(filename, os.path.join(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR, filename),
-                                           split_content, markup=True)
+                                           split_content, markup=True).decode('utf-8')
+            utext = doc.text_or_error() # pyflakes:ignore
+            if content and content != utext and not 'Error; cannot read' in content:
+                debug.show('content[:64]')
+                debug.show('utext[:64]')
+                log.assertion('content == utext')
+            content = markup_txt.markup(maybe_split(content, split=split_content)) 
 
             # file types
             base_path = os.path.join(settings.INTERNET_DRAFT_PATH, doc.name + "-" + doc.rev + ".")
@@ -439,7 +453,13 @@ def document_main(request, name, rev=None):
     if doc.type_id == "charter":
         filename = "%s-%s.txt" % (doc.canonical_name(), doc.rev)
 
-        content = get_document_content(filename, os.path.join(settings.CHARTER_PATH, filename), split=False, markup=True)
+        content = get_document_content(filename, os.path.join(settings.CHARTER_PATH, filename), split=False, markup=True).decode('utf-8')
+        utext = doc.text_or_error()     # pyflakes:ignore
+        if content and content != utext and not 'Error; cannot read' in content:
+            debug.show('content[:64]')
+            debug.show('utext[:64]')
+            log.assertion('content == utext')
+        content = markup_txt.markup(content)
 
         ballot_summary = None
         if doc.get_state_slug() in ("intrev", "iesgrev"):
@@ -480,9 +500,15 @@ def document_main(request, name, rev=None):
 
         if doc.rev == "00" and not os.path.isfile(pathname):
             # This could move to a template
-            content = "A conflict review response has not yet been proposed."
+            content = u"A conflict review response has not yet been proposed."
         else:     
-            content = get_document_content(filename, pathname, split=False, markup=True)
+            content = get_document_content(filename, pathname, split=False, markup=True).decode('utf-8')
+            utext = doc.text_or_error() # pyflakes:ignore
+            if content and content != utext and not 'Error; cannot read' in content:
+                debug.show('content[:64]')
+                debug.show('utext[:64]')
+                log.assertion('content == utext')
+            content = markup_txt.markup(content)
 
         ballot_summary = None
         if doc.get_state_slug() in ("iesgeval") and doc.active_ballot():
@@ -507,9 +533,14 @@ def document_main(request, name, rev=None):
 
         if doc.rev == "00" and not os.path.isfile(pathname):
             # This could move to a template
-            content = "Status change text has not yet been proposed."
+            content = u"Status change text has not yet been proposed."
         else:     
-            content = get_document_content(filename, pathname, split=False)
+            content = get_document_content(filename, pathname, split=False).decode('utf-8')
+            utext = doc.text_or_error() # pyflakes:ignore
+            if content and content != utext and not 'Error; cannot read' in content:
+                debug.show('content[:64]')
+                debug.show('utext[:64]')
+                log.assertion('content == utext')
 
         ballot_summary = None
         if doc.get_state_slug() in ("iesgeval"):
@@ -562,7 +593,12 @@ def document_main(request, name, rev=None):
                 url = urlbase + extension 
 
             if extension == ".txt":
-                content = get_document_content(basename, pathname + extension, split=False)
+                content = get_document_content(basename, pathname + extension, split=False).decode('utf-8')
+                utext = doc.text_or_error()      # pyflakes:ignore
+                if content != utext:
+                    debug.show('content[:64]')
+                    debug.show('utext[:64]')
+                log.assertion('content == utext')
                 t = "plain text"
 
             other_types.append((t, url))
diff --git a/ietf/doc/views_status_change.py b/ietf/doc/views_status_change.py
@@ -282,7 +282,12 @@ def newstatus(relateddoc):
 def default_approval_text(status_change,relateddoc):
 
     filename = "%s-%s.txt" % (status_change.canonical_name(), status_change.rev)
-    current_text = get_document_content(filename, os.path.join(settings.STATUS_CHANGE_PATH, filename), split=False, markup=False)
+    current_text = get_document_content(filename, os.path.join(settings.STATUS_CHANGE_PATH, filename), split=False, markup=False).decode('utf-8')
+    utext = status_change.text_or_error() # pyflakes:ignore
+    if current_text and current_text != utext and not 'Error; cannot read' in current_text:
+        debug.show('current_text[:64]')
+        debug.show('utext[:64]')
+        log.assertion('current_text == utext')
 
     if relateddoc.target.document.std_level.slug in ('std','ps','ds','bcp',):
         action = "Protocol Action"
diff --git a/ietf/meeting/forms.py b/ietf/meeting/forms.py
@@ -18,6 +18,7 @@
 from ietf.message.models import Message
 from ietf.person.models import Person
 from ietf.utils.fields import DatepickerDateField, DurationField
+from ietf.utils import log
 
 # need to insert empty option for use in ChoiceField
 # countries.insert(0, ('', '-'*9 ))
@@ -220,7 +221,14 @@ def __init__(self, *args, **kwargs):
             if self.instance.agenda():
                 doc = self.instance.agenda()
                 path = os.path.join(doc.get_file_path(), doc.filename_with_rev())
-                self.initial['agenda'] = get_document_content(os.path.basename(path), path, markup=False)
+                content = get_document_content(os.path.basename(path), path, markup=False).decode('utf-8')
+                utext = doc.text_or_error() # pyflakes:ignore
+                if content and content != utext and not 'Error; cannot read' in content:
+                    debug.show('content[:64]')
+                    debug.show('utext[:64]')
+                    log.assertion('content == utext')
+                self.initial['agenda'] = content
+                
 
     def clean_date(self):
         '''Date field validator.  We can't use required on the input because
diff --git a/ietf/secr/telechat/views.py b/ietf/secr/telechat/views.py
@@ -6,6 +6,8 @@
 from django.shortcuts import render, get_object_or_404, redirect
 from django.utils.functional import curry
 
+import debug                            # pyflakes:ignore
+
 from ietf.doc.models import DocEvent, Document, BallotDocEvent, BallotPositionDocEvent, BallotType, WriteupDocEvent
 from ietf.doc.utils import get_document_content, add_state_change_event
 from ietf.person.models import Person
@@ -15,7 +17,7 @@
 from ietf.iesg.agenda import agenda_data, get_doc_section
 from ietf.ietfauth.utils import role_required
 from ietf.secr.telechat.forms import BallotForm, ChangeStateForm, DateSelectForm, TELECHAT_TAGS
-
+from ietf.utils import log
 
 
 '''
@@ -70,7 +72,12 @@ def get_doc_writeup(doc):
             writeup = latest.text
     elif doc.type_id == 'conflrev':
         path = os.path.join(doc.get_file_path(),doc.filename_with_rev())
-        writeup = get_document_content(doc.name,path,split=False,markup=False)
+        writeup = get_document_content(doc.name,path,split=False,markup=False).decode('utf-8')
+        utext = doc.text_or_error()     # pyflakes:ignore
+        if writeup and writeup != utext and not 'Error; cannot read' in writeup:
+            debug.show('writeup[:64]')
+            debug.show('utext[:64]')
+            log.assertion('writeup == utext')
     return writeup
 
 def get_last_telechat_date():
diff --git a/ietf/utils/markup_txt.py b/ietf/utils/markup_txt.py
@@ -30,26 +30,37 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from django.utils.html import escape
-import string
 import re
+import six
+import string
 
+from django.utils.html import escape
+
+from ietf.utils import log
 from ietf.utils.text import wordwrap
 
-def markup(content, split=True, width=None):
+def markup_ascii(content, width=None):
+    log.unreachable('2017-12-08')
+    if six.PY2:
+        assert isinstance(content, basestring)
+        # at this point, "content" is normal string
+        # fix most common non-ASCII characters
+        t1 = string.maketrans("\x91\x92\x93\x94\x95\x96\x97\xc6\xe8\xe9", "\'\'\"\"o--\'ee")
+        # map everything except printable ASCII, TAB, LF, FF to "?"
+        t2 = string.maketrans('','')
+        t3 = "?"*9 + "\t\n?\f" + "?"*19 + t2[32:127] + "?"*129
+        t4 = t1.translate(t3)
+        content = content.translate(t4)
+    else:
+        log.assertion('six.PY2')
+    return markup(content.decode('ascii'), width)
+
+def markup(content, width=None):
+    log.assertion('isinstance(content, six.text_type)')
     # normalize line endings to LF only
     content = content.replace("\r\n", "\n")
     content = content.replace("\r", "\n")
 
-    # at this point, "content" is normal string
-    # fix most common non-ASCII characters
-    t1 = string.maketrans("\x91\x92\x93\x94\x95\x96\x97\xc6\xe8\xe9", "\'\'\"\"o--\'ee")
-    # map everything except printable ASCII, TAB, LF, FF to "?"
-    t2 = string.maketrans('','')
-    t3 = "?"*9 + "\t\n?\f" + "?"*19 + t2[32:127] + "?"*129
-    t4 = t1.translate(t3)
-    content = content.translate(t4)
-
     # remove leading white space
     content = content.lstrip()
     # remove runs of blank lines
@@ -69,36 +80,4 @@ def markup(content, split=True, width=None):
 
     content = re.sub("\n\n([0-9]+\\.|[A-Z]\\.[0-9]|Appendix|Status of|Abstract|Table of|Full Copyright|Copyright|Intellectual Property|Acknowled|Author|Index)(.*)(?=\n\n)", """\n\n<span class="m_h">\g<1>\g<2></span>""", content)
 
-    if split:
-        n = content.find("\n", 5000)
-        content1 = "<pre>"+content[:n+1]+"</pre>\n"
-        return content1
-        #content2 = "<pre>"+content[n+1:]+"</pre>\n"
-        #return (content1, content2)
-    else:
-        return "<pre>" + content + "</pre>\n"
-
-def markup_unicode(content, split=True, width=None, container_classes=None):
-    # normalize line endings to LF only
-    content = content.replace("\r\n", "\n")
-    content = content.replace("\r", "\n")
-
-    # remove leading white space
-    content = content.lstrip()
-    # remove runs of blank lines
-    content = re.sub("\n\n\n+", "\n\n", content)
-
-    # maybe wordwrap.  This must be done before the escaping below.
-    if width:
-        content = wordwrap(content, width)
-
-    # expand tabs + escape 
-    content_to_show = escape(content.expandtabs())
-
-    if split:
-        n = content.find("\n", 5000)
-        content_to_show = content_to_show[:n+1]
-    
-    pre = '<pre class="%s" >' % container_classes if container_classes else '<pre>'
-
-    return pre+content_to_show+'</pre>\n'
+    return "<pre>" + content + "</pre>\n"
diff --git a/ietf/utils/text.py b/ietf/utils/text.py
@@ -124,3 +124,12 @@ def isascii(text):
         return True
     except UnicodeEncodeError:
         return False
+
+def maybe_split(text, split=True, pos=5000):
+    if split:
+        n = text.find("\n", pos)
+        text = text[:n+1]
+    return text
+
+        
+    

Original file line number	Diff line number	Diff line change
`@@ -449,6 +449,9 @@ def text(self):`
`449`	`449`	`#`
`450`	`450`	`return text`
`451`	`451`
	`452`	`+ def text_or_error(self):`
	`453`	`+ return self.text() or "Error; cannot read (%s)"%self.get_file_name()`
	`454`	`+`
`452`	`455`	`def htmlized(self):`
`453`	`456`	`name = self.get_base_name()`
`454`	`457`	`text = self.text()`