From b8dc35fa7bae3f7014fc2fdf8b6fbf9d2e234991 Mon Sep 17 00:00:00 2001
From: Dougal Matthews
Date: Fri, 3 Apr 2015 09:51:21 +0100
Subject: [PATCH] Refactor the TOC parsing code

We now use HTMLParser rather than regular expressions which makes it
much smarter about handling tags within the titles themselves.

Fixes #367
---
Review note: TOCParser.in_anchor now starts as False. It was initialised
to True, so any text seen before the first <a> tag was collected into the
title while attrs was still None.

 mkdocs/compat.py          |  2 ++
 mkdocs/tests/toc_tests.py | 14 ++++++++++++++
 mkdocs/toc.py             | 38 ++++++++++++++++++++++++++++++++------
 3 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/mkdocs/compat.py b/mkdocs/compat.py
index 49bd396a..518a4937 100644
--- a/mkdocs/compat.py
+++ b/mkdocs/compat.py
@@ -13,6 +13,7 @@ if PY2:
     httpserver = httpserver
     import SocketServer
     socketserver = SocketServer
+    from HTMLParser import HTMLParser
 
     import itertools
     zip = itertools.izip
@@ -30,6 +31,7 @@ else:  # PY3
     httpserver = httpserver
     import socketserver
     socketserver = socketserver
+    from html.parser import HTMLParser
 
     zip = zip
 
diff --git a/mkdocs/tests/toc_tests.py b/mkdocs/tests/toc_tests.py
index 03ab9cd0..b0bdea11 100644
--- a/mkdocs/tests/toc_tests.py
+++ b/mkdocs/tests/toc_tests.py
@@ -29,6 +29,20 @@ class TableOfContentsTests(unittest.TestCase):
         toc = self.markdown_to_toc(md)
         self.assertEqual(str(toc).strip(), expected)
 
+    def test_indented_toc_html(self):
+        md = dedent("""
+        # Heading 1
+        ## Heading 2
+        ## Heading 3
+        """)
+        expected = dedent("""
+        Heading 1 - #heading-1
+            Heading 2 - #heading-2
+            Heading 3 - #heading-3
+        """)
+        toc = self.markdown_to_toc(md)
+        self.assertEqual(str(toc).strip(), expected)
+
     def test_flat_toc(self):
         md = dedent("""
         # Heading 1
diff --git a/mkdocs/toc.py b/mkdocs/toc.py
index 410aff5a..89627381 100644
--- a/mkdocs/toc.py
+++ b/mkdocs/toc.py
@@ -14,9 +14,7 @@ The steps we take to generate a table of contents are:
   * Parse table of contents HTML into the underlying data structure.
 """
 
-import re
-
-TOC_LINK_REGEX = re.compile('<a href=["]([^"]*)["]>([^<]*)</a>')
+from mkdocs.compat import HTMLParser
 
 
 class TableOfContents(object):
@@ -52,6 +50,32 @@ class AnchorLink(object):
         return ret
 
 
+class TOCParser(HTMLParser):
+
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.links = []
+
+        self.in_anchor = False  # set by handle_starttag when <a> opens
+        self.attrs = None
+        self.title = ''
+
+    def handle_starttag(self, tag, attrs):
+
+        if tag == 'a':
+            self.in_anchor = True
+            self.attrs = dict(attrs)
+
+    def handle_endtag(self, tag):
+        if tag == 'a':
+            self.in_anchor = False
+
+    def handle_data(self, data):
+
+        if self.in_anchor:
+            self.title += data
+
+
 def _parse_html_table_of_contents(html):
     """
     Given a table of contents string that has been automatically generated by
@@ -63,9 +87,11 @@ def _parse_html_table_of_contents(html):
     parents = []
     ret = []
     for line in lines:
-        match = TOC_LINK_REGEX.search(line)
-        if match:
-            href, title = match.groups()
+        parser = TOCParser()
+        parser.feed(line)
+        if parser.title:
+            href = parser.attrs['href']
+            title = parser.title
             nav = AnchorLink(title, href)
             # Add the item to its parent if required. If it is a topmost
             # item then instead append it to our return value.