feat: support justhtml parsing library to convert email to plain text

rouilj · rouilj · commit 6a05a1d13d5c · 2025-12-14T22:40:46.000-05:00
justhtml is a pure Python, fast, HTML5 compliant parser. It is now an
option for converting html only emails to plain text. Its output
format differs slightly from dehtml or beautifulsoup. Mostly by
removing extra blank lines.

dehtml.py:
  Using the stream parser of justhtml. Unable to get the full
  document parser to successfully strip script and style blocks.

  If I can fix this and use the standard parser, I can in theory
  generate markdown from the DOM tree generated by justhtml.

  Updated test case to include inline elements that should not cause a
  line break when they are encountered. Running dehtml as: `python
  roundup/dehtml.py foo.html` will load foo.html and parse it using
  all available parsers.

configuration.py: justhtml is available as an option.

docs: updated CHANGES.txt and doc/tracker_config.txt; added beautifulsoup
and justhtml to the optional software section of doc/installation.txt.

test_mailgw.py, .github/workflows/ci-test.yml: updated tests and install
justhtml as part of CI.
diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml
@@ -240,7 +240,7 @@ jobs:
           # pygments for markdown2 to highlight code blocks
           pip install markdown2 pygments
           # docutils for ReStructuredText
-          pip install beautifulsoup4 brotli docutils jinja2 \
+          pip install beautifulsoup4 justhtml brotli docutils jinja2 \
             mistune==0.8.4 pyjwt pytz whoosh
           # gpg on PyPi is currently broken with newer OS platform
           #   ubuntu 24.04
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -64,6 +64,10 @@ Features:
   config.ini. (John Rouillard)
 - issue2551152 - added basic PGP setup/use info to admin_guide. (John
   Rouillard)
+- add support for the 'justhtml' html 5 parser library. It is written
+  in pure Python. Used to convert html emails into plain text. Faster
+  than beautifulsoup4 and it passes the html 5 standard browser test
+  suite. Beautifulsoup is still supported. (John Rouillard)
 
 2025-07-13 2.5.0
 
diff --git a/doc/installation.txt b/doc/installation.txt
@@ -311,6 +311,14 @@ polib
   roundup-gettext, you must install polib_. See the `developer's
   guide`_ for details on translating your tracker.
 
+beautifulsoup, justhtml
+  When HTML only email is received, Roundup can convert it into
+  plain text using the native dehtml parser. To convert HTML
+  email into plain text, beautifulsoup4_ or justhtml_ can also be
+  used. You can choose the converter in the tracker's
+  config. Note that justhtml is pure Python, fast and conforms to
+  HTML 5 standards.
+  
 pywin32 - Windows Service
   You can run Roundup as a Windows service if pywin32_ is installed.
   Otherwise it must be started manually.
@@ -2423,13 +2431,15 @@ the test.
 .. _`adding MySQL users`:
     https://dev.mysql.com/doc/refman/8.0/en/creating-accounts.html
 .. _apache: https://httpd.apache.org/
+.. _beautifulsoup4: https://pypi.org/project/beautifulsoup4/
 .. _brotli: https://pypi.org/project/Brotli/
 .. _`developer's guide`: developers.html
 .. _defusedxml: https://pypi.org/project/defusedxml/
 .. _docutils: https://pypi.org/project/docutils/
 .. _flup: https://pypi.org/project/flup/
 .. _gpg: https://www.gnupg.org/software/gpgme/index.html
 .. _jinja2: https://palletsprojects.com/projects/jinja/
+.. _justhtml: https://pypi.org/project/justhtml/
 .. _markdown: https://python-markdown.github.io/
 .. _markdown2: https://github.com/trentm/python-markdown2
 .. _mistune: https://pypi.org/project/mistune/
diff --git a/doc/tracker_config.txt b/doc/tracker_config.txt
@@ -1112,12 +1112,12 @@
 
   # If an email has only text/html parts, use this module
   # to convert the html to text. Choose from beautifulsoup 4,
-  # dehtml - (internal code), or none to disable conversion.
-  # If 'none' is selected, email without a text/plain part
-  # will be returned to the user with a message. If
+  # justhtml, dehtml - (internal code), or none to disable
+  # conversion. If 'none' is selected, email without a text/plain
+  # part will be returned to the user with a message. If
   # beautifulsoup is selected but not installed dehtml will
   # be used instead.
-  # Allowed values: beautifulsoup, dehtml, none
+  # Allowed values: beautifulsoup, justhtml, dehtml, none
   # Default: none
   convert_htmltotext = none
 
diff --git a/roundup/configuration.py b/roundup/configuration.py
@@ -384,17 +384,17 @@ class HtmlToTextOption(Option):
 
     """What module should be used to convert emails with only text/html
     parts into text for display in roundup. Choose from beautifulsoup
-    4, dehtml - the internal code or none to disable html to text
-    conversion. If beautifulsoup chosen but not available, dehtml will
-    be used.
+    4, justhtml, dehtml - the internal code or none to disable html to
+    text conversion. If beautifulsoup or justhtml is chosen but not
+    available, dehtml will be used.
 
     """
 
-    class_description = "Allowed values: beautifulsoup, dehtml, none"
+    class_description = "Allowed values: beautifulsoup, justhtml, dehtml, none"
 
     def str2value(self, value):
         _val = value.lower()
-        if _val in ("beautifulsoup", "dehtml", "none"):
+        if _val in ("beautifulsoup", "justhtml", "dehtml", "none"):
             return _val
         else:
             raise OptionValueError(self, value, self.class_description)
@@ -1811,11 +1811,11 @@ def str2value(self, value):
         (HtmlToTextOption, "convert_htmltotext", "none",
             "If an email has only text/html parts, use this module\n"
             "to convert the html to text. Choose from beautifulsoup 4,\n"
-            "dehtml - (internal code), or none to disable conversion.\n"
-            "If 'none' is selected, email without a text/plain part\n"
-            "will be returned to the user with a message. If\n"
-            "beautifulsoup is selected but not installed dehtml will\n"
-            "be used instead."),
+            "justhtml, dehtml - (internal code), or none to disable\n"
+            "conversion. If 'none' is selected, email without a text/plain\n"
+            "part will be returned to the user with a message. If\n"
+            "beautifulsoup or justhtml is selected but not installed\n"
+            "dehtml will be used instead."),
         (BooleanOption, "keep_real_from", "no",
             "When handling emails ignore the Resent-From:-header\n"
             "and use the original senders From:-header instead.\n"
diff --git a/roundup/dehtml.py b/roundup/dehtml.py
@@ -5,6 +5,10 @@
 
 from roundup.anypy.strings import u2s, uchr
 
+# ruff PLC0415 ignore imports not at top of file
+# ruff RET505 ignore else  after return
+# ruff: noqa: PLC0415 RET505
+
 _pyver = sys.version_info[0]
 
 
@@ -28,6 +32,108 @@ def html2text(html):
 
                     return u2s(soup.get_text("\n", strip=True))
 
+                self.html2text = html2text
+            elif converter == "justhtml":
+                from justhtml import stream
+
+                def html2text(html):
+                    # The below does not work.
+                    # Using stream parser since I couldn't seem to strip
+                    # 'script' and 'style' blocks. But stream doesn't
+                    # have error reporting or stripping of text nodes
+                    # and dropping empty nodes. Also I would like to try
+                    # its GFM markdown output too even though it keeps
+                    # tables as html and doesn't completely convert as
+                    # this would work well for those supporting markdown.
+                    #
+                    #  ctx used for testing since I have a truncated
+                    #  test doc. It eliminates error from missing DOCTYPE
+                    #  and head.
+                    #
+                    #from justhtml import JustHTML
+                    # from justhtml.context import FragmentContext
+                    #
+                    #ctx = FragmentContext('html')
+                    #justhtml = JustHTML(html,collect_errors=True,
+                    #                    fragment_context=ctx)
+                    # I still have the text output inside style/script tags.
+                    # with :not(style, script). I do get text contents
+                    # with query("style, script").
+                    #
+                    #return u2s("\n".join(
+                    #     [elem.to_text(separator="\n", strip=True)
+                    #        for elem in justhtml.query(":not(style, script)")])
+                    #          )
+
+                    # define inline elements so I can accumulate all unbroken
+                    # text in a single line with embedded inline elements.
+                    # 'br' is inline but should be treated as a line break
+                    # and element before/after should not be accumulated
+                    # together.
+                    inline_elements = (
+                        "a",
+                        "address",
+                        "b",
+                        "cite",
+                        "code",
+                        "em",
+                        "i",
+                        "img",
+                        "mark",
+                        "q",
+                        "s",
+                        "small",
+                        "span",
+                        "strong",
+                        "sub",
+                        "sup",
+                        "time")
+
+                    # each line is appended and joined at the end
+                    text = []
+                    # the accumulator for all text in inline elements
+                    text_accumulator = ""
+                    # if set skip all lines till matching end tag found
+                    # used to skip script/style blocks
+                    skip_till_endtag = None
+                    # used to force text_accumulator into text with added
+                    # newline so we have a blank line between paragraphs.
+                    _need_parabreak = False
+
+                    for event, data in stream(html):
+                        if event == "end" and skip_till_endtag == data:
+                            skip_till_endtag = None
+                            continue
+                        if skip_till_endtag:
+                            continue
+                        if (event == "start" and
+                              data[0] in ('script', 'style')):
+                            skip_till_endtag = data[0]
+                            continue
+                        if (event == "start" and
+                              text_accumulator and
+                              data[0] not in inline_elements):
+                            # add accumulator to "text"
+                            text.append(text_accumulator)
+                            text_accumulator = ""
+                            _need_parabreak = False
+                        elif event == "text":
+                            if not data.isspace():
+                                text_accumulator = text_accumulator + data
+                                _need_parabreak = True
+                        elif (_need_parabreak and
+                              event == "start" and
+                              data[0] == "p"):
+                            text.append(text_accumulator + "\n")
+                            text_accumulator = ""
+                            _need_parabreak = False
+
+                    # save anything left in the accumulator at end of document
+                    if text_accumulator:
+                        # add newline to match dehtml and beautifulsoup
+                        text.append(text_accumulator + "\n")
+                    return u2s("\n".join(text))
+
                 self.html2text = html2text
             else:
                 raise ImportError
@@ -96,6 +202,16 @@ def html2text(html):
 
 
 if __name__ == "__main__":
+    # ruff: noqa: B011 S101
+
+    try:
+        assert False
+    except AssertionError:
+        pass
+    else:
+        print("Error, assertions turned off. Test fails")
+        sys.exit(1)
+
     html = """
 <body>
 <script>
@@ -128,10 +244,10 @@ def html2text(html):
 <li class="toctree-l2"><a class="reference internal" href="admin_guide.html">Administration Guide</a></li>
 </ul>
 <div class="section" id="prerequisites">
-<h2><a class="toc-backref" href="#id5">Prerequisites</a></h2>
+<H2><a class="toc-backref" href="#id5">Prerequisites</a></H2>
 <p>Roundup requires Python 2.5 or newer (but not Python 3) with a functioning
 anydbm module. Download the latest version from <a class="reference external" href="http://www.python.org/">http://www.python.org/</a>.
-It is highly recommended that users install the latest patch version
+It is highly recommended that users install the <span>latest patch version</span>
 of python as these contain many fixes to serious bugs.</p>
 <p>Some variants of Linux will need an additional &#8220;python dev&#8221; package
 installed for Roundup installation to work. Debian and derivatives, are
@@ -147,18 +263,42 @@ def html2text(html):
 </body>
 """
 
-    html2text = dehtml("dehtml").html2text
-    if html2text:
-        print(html2text(html))
+    if len(sys.argv) > 1:
+        with open(sys.argv[1]) as h:
+            html = h.read()
 
+    print("==== beautifulsoup")
     try:
         # trap error seen if N_TOKENS not defined when run.
         html2text = dehtml("beautifulsoup").html2text
         if html2text:
-            print(html2text(html))
+            text = html2text(html)
+            assert ('HELP' not in text)
+            assert ('display:block' not in text)
+            print(text)
     except NameError as e:
         print("captured error %s" % e)
 
+    print("==== justhtml")
+    try:
+        html2text = dehtml("justhtml").html2text
+        if html2text:
+            text = html2text(html)
+            assert ('HELP' not in text)
+            assert ('display:block' not in text)
+            print(text)
+    except NameError as e:
+        print("captured error %s" % e)
+
+    print("==== dehtml")
+    html2text = dehtml("dehtml").html2text
+    if html2text:
+        text = html2text(html)
+        assert ('HELP' not in text)
+        assert ('display:block' not in text)
+        print(text)
+
+    print("==== disabled html -> text conversion")
     html2text = dehtml("none").html2text
     if html2text:
         print("FAIL: Error, dehtml(none) is returning a function")
diff --git a/test/test_mailgw.py b/test/test_mailgw.py
@@ -35,6 +35,13 @@
     skip_beautifulsoup = mark_class(pytest.mark.skip(
         reason="Skipping beautifulsoup tests: 'bs4' not installed"))
 
+try:
+    import justhtml
+    skip_justhtml = lambda func, *args, **kwargs: func
+except ImportError:
+    from .pytest_patcher import mark_class
+    skip_justhtml = mark_class(pytest.mark.skip(
+        reason="Skipping justhtml tests: 'justhtml' not installed"))
 
 from roundup.anypy.email_ import message_from_bytes
 from roundup.anypy.strings import b2s, u2s, s2b
@@ -315,6 +322,10 @@ class MailgwTestCase(MailgwTestAbstractBase, StringFragmentCmpHelper, unittest.T
     def testTextHtmlMessageBeautifulSoup(self):
         self.testTextHtmlMessage(converter='beautifulsoup')
 
+    @skip_justhtml
+    def testTextHtmlMessageJusthtml(self):
+        self.testTextHtmlMessage(converter='justhtml')
+        
     def testTextHtmlMessage(self, converter='dehtml'):
         html_message='''Content-Type: text/html;
   charset="iso-8859-1"
@@ -375,10 +386,15 @@ def testTextHtmlMessage(self, converter='dehtml'):
         text_fragments['dehtml'] = ['Roundup\n        Home\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\n\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from http://www.python.org/.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\n\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. Debian and derivatives, are\nknown to require this.\n\nIf you', (u2s(u'\u2019'), ''), 're on windows, you will either need to be using the ActiveState python\ndistribution (at http://www.activestate.com/Products/ActivePython/), or you', (u2s(u'\u2019'), ''), 'll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/).']
         text_fragments['beautifulsoup'] = ['Roundup\nHome\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from\nhttp://www.python.org/\n.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. Debian and derivatives, are\nknown to require this.\nIf you', (u2s(u'\u2019'), "'"), 're on windows, you will either need to be using the ActiveState python\ndistribution (at\nhttp://www.activestate.com/Products/ActivePython/\n), or you’ll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/\n).']
 
+        text_fragments['justhtml'] = ['Roundup\nHome\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from http://www.python.org/.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. Debian and derivatives, are\nknown to require this.\nIf you', (u2s(u'\u2019'), "'"), 're on windows, you will either need to be using the ActiveState python\ndistribution (at http://www.activestate.com/Products/ActivePython/), or you’ll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/).']
+        self.maxDiff = 100000
         self.db.config.MAILGW_CONVERT_HTMLTOTEXT = converter
         nodeid = self._handle_mail(html_message)
         assert not os.path.exists(SENDMAILDEBUG)
         msgid = self.db.issue.get(nodeid, 'messages')[0]
+        print(self.db.msg.get(msgid, 'content'))
+        print("\n==== fragment\n")
+        print(text_fragments[converter])
         self.compareStringFragments(self.db.msg.get(msgid, 'content'),
                                     text_fragments[converter])