Merge pull request #402 from d0ugal/toc_refactor

Refactor the TOC parsing code
2026-03-27 09:58:31 +07:00 · 2015-04-03 10:00:53 +01:00
parent 74e60382b8 b8dc35fa7b
commit 1edfdabf06
3 changed files with 48 additions and 6 deletions
--- a/mkdocs/compat.py
+++ b/mkdocs/compat.py
@@ -13,6 +13,7 @@ if PY2:
    httpserver = httpserver
    import SocketServer
    socketserver = SocketServer
+    from HTMLParser import HTMLParser

    import itertools
    zip = itertools.izip
@@ -30,6 +31,7 @@ else:  # PY3
    httpserver = httpserver
    import socketserver
    socketserver = socketserver
+    from html.parser import HTMLParser

    zip = zip

--- a/mkdocs/tests/toc_tests.py
+++ b/mkdocs/tests/toc_tests.py
@@ -29,6 +29,20 @@ class TableOfContentsTests(unittest.TestCase):
        toc = self.markdown_to_toc(md)
        self.assertEqual(str(toc).strip(), expected)

+    def test_indented_toc_html(self):
+        # Inline HTML inside a heading must not leak into the TOC title.
+        source = dedent("""
+        # Heading 1
+        ## <code>Heading</code> 2
+        ## Heading 3
+        """)
+        wanted = dedent("""
+        Heading 1 - #heading-1
+            Heading 2 - #heading-2
+            Heading 3 - #heading-3
+        """)
+        self.assertEqual(str(self.markdown_to_toc(source)).strip(), wanted)
+
    def test_flat_toc(self):
        md = dedent("""
        # Heading 1
--- a/mkdocs/toc.py
+++ b/mkdocs/toc.py
@@ -14,9 +14,7 @@ The steps we take to generate a table of contents are:
 * Parse table of contents HTML into the underlying data structure.
 """

-import re
-
-TOC_LINK_REGEX = re.compile('<a href=["]([^"]*)["]>([^<]*)</a>')
+from mkdocs.compat import HTMLParser


 class TableOfContents(object):
@@ -52,6 +50,32 @@ class AnchorLink(object):
        return ret


+class TOCParser(HTMLParser):
+    """Record the last <a> tag's attributes and accumulate anchor text."""
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.links = []
+        # No <a> has been opened yet, so leading text must not be collected.
+        self.in_anchor = False
+        self.attrs = None
+        self.title = ''
+
+    def handle_starttag(self, tag, attrs):
+        """Start collecting text and keep the attributes when <a> opens."""
+        if tag == 'a':
+            self.in_anchor = True
+            self.attrs = dict(attrs)
+
+    def handle_endtag(self, tag):
+        if tag == 'a':
+            self.in_anchor = False  # text after </a> is not part of the title
+
+    def handle_data(self, data):
+        """Append text to the title, but only while inside an <a> tag."""
+        if self.in_anchor:
+            self.title += data
+
+
 def _parse_html_table_of_contents(html):
    """
    Given a table of contents string that has been automatically generated by
@@ -63,9 +87,11 @@ def _parse_html_table_of_contents(html):
    parents = []
    ret = []
    for line in lines:
-        match = TOC_LINK_REGEX.search(line)
-        if match:
-            href, title = match.groups()
+        parser = TOCParser()
+        parser.feed(line)
+        if parser.title:
+            href = parser.attrs['href']
+            title = parser.title
            nav = AnchorLink(title, href)
            # Add the item to its parent if required.  If it is a topmost
            # item then instead append it to our return value.