From b8dc35fa7bae3f7014fc2fdf8b6fbf9d2e234991 Mon Sep 17 00:00:00 2001
From: Dougal Matthews <dougal@redhat.com>
Date: Fri, 3 Apr 2015 09:51:21 +0100
Subject: [PATCH] Refactor the TOC parsing code

We now use HTMLParser rather than regular expressions which makes it
much smarter about handling tags within the titles themselves.

Fixes #367
---
 mkdocs/compat.py          |  2 ++
 mkdocs/tests/toc_tests.py | 14 ++++++++++++++
 mkdocs/toc.py             | 38 ++++++++++++++++++++++++++++++++------
 3 files changed, 48 insertions(+), 6 deletions(-)
diff --git a/mkdocs/compat.py b/mkdocs/compat.py
index 49bd396a..518a4937 100644
--- a/mkdocs/compat.py
+++ b/mkdocs/compat.py
@@ -13,6 +13,7 @@ if PY2:
     httpserver = httpserver
     import SocketServer
     socketserver = SocketServer
+    from HTMLParser import HTMLParser
 
     import itertools
     zip = itertools.izip
@@ -30,6 +31,7 @@ else:  # PY3
     httpserver = httpserver
     import socketserver
     socketserver = socketserver
+    from html.parser import HTMLParser
 
     zip = zip
 
diff --git a/mkdocs/tests/toc_tests.py b/mkdocs/tests/toc_tests.py
index 03ab9cd0..b0bdea11 100644
--- a/mkdocs/tests/toc_tests.py
+++ b/mkdocs/tests/toc_tests.py
@@ -29,6 +29,20 @@ class TableOfContentsTests(unittest.TestCase):
         toc = self.markdown_to_toc(md)
         self.assertEqual(str(toc).strip(), expected)
 
+    def test_indented_toc_html(self):  # tags in a heading are dropped from the TOC title
+        md = dedent("""
+        # Heading 1
+        ## <code>Heading</code> 2
+        ## Heading 3
+        """)
+        expected = dedent("""
+        Heading 1 - #heading-1
+            Heading 2 - #heading-2
+            Heading 3 - #heading-3
+        """)
+        toc = self.markdown_to_toc(md)
+        self.assertEqual(str(toc).strip(), expected)
+
     def test_flat_toc(self):
         md = dedent("""
         # Heading 1
diff --git a/mkdocs/toc.py b/mkdocs/toc.py
index 410aff5a..89627381 100644
--- a/mkdocs/toc.py
+++ b/mkdocs/toc.py
@@ -14,9 +14,7 @@ The steps we take to generate a table of contents are:
 * Parse table of contents HTML into the underlying data structure.
 """
 
-import re
-
-TOC_LINK_REGEX = re.compile('<a href=["]([^"]*)["]>([^<]*)</a>')
+from mkdocs.compat import HTMLParser
 
 
 class TableOfContents(object):
@@ -52,6 +50,32 @@ class AnchorLink(object):
         return ret
 
 
+class TOCParser(HTMLParser):
+    """Collect the attributes and text of an <a> element fed to the parser."""
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.links = []
+        # Start outside any <a>, so text before the link is ignored.
+        self.in_anchor = False
+        self.attrs = None
+        self.title = ''
+
+    def handle_starttag(self, tag, attrs):
+        # Nested tags such as <code> leave the anchor state untouched.
+        if tag == 'a':
+            self.in_anchor = True
+            self.attrs = dict(attrs)
+
+    def handle_endtag(self, tag):
+        if tag == 'a':
+            self.in_anchor = False
+
+    def handle_data(self, data):
+        # Only text seen between <a> and </a> contributes to the title.
+        if self.in_anchor:
+            self.title += data
+
+
 def _parse_html_table_of_contents(html):
     """
     Given a table of contents string that has been automatically generated by
@@ -63,9 +87,11 @@ def _parse_html_table_of_contents(html):
     parents = []
     ret = []
     for line in lines:
-        match = TOC_LINK_REGEX.search(line)
-        if match:
-            href, title = match.groups()
+        parser = TOCParser()
+        parser.feed(line)
+        if parser.title:
+            href = parser.attrs['href']
+            title = parser.title
             nav = AnchorLink(title, href)
             # Add the item to its parent if required.  If it is a topmost
             # item then instead append it to our return value.