From e755aaed7ea47348a60495ab364d5483ab90a4a6 Mon Sep 17 00:00:00 2001 From: Oleh Prypin Date: Thu, 8 Feb 2024 19:17:16 +0100 Subject: [PATCH] Strip all HTML when getting the title from the first H1 tag (#3564) Not stripping it was a bug, and also inconsistent with how ToC titles are extracted. --- mkdocs/structure/pages.py | 12 +-- mkdocs/structure/toc.py | 2 +- mkdocs/tests/structure/page_tests.py | 110 +++++++++++++-------------- pyproject.toml | 4 +- 4 files changed, 62 insertions(+), 66 deletions(-) diff --git a/mkdocs/structure/pages.py b/mkdocs/structure/pages.py index ef13e512..8b0a642d 100644 --- a/mkdocs/structure/pages.py +++ b/mkdocs/structure/pages.py @@ -10,6 +10,7 @@ from urllib.parse import unquote as urlunquote from urllib.parse import urljoin, urlsplit, urlunsplit import markdown +import markdown.extensions.toc import markdown.htmlparser # type: ignore import markdown.postprocessors import markdown.treeprocessors @@ -549,7 +550,7 @@ class _HTMLHandler(markdown.htmlparser.htmlparser.HTMLParser): # type: ignore[n class _ExtractTitleTreeprocessor(markdown.treeprocessors.Treeprocessor): title: str | None = None - postprocessors: Sequence[markdown.postprocessors.Postprocessor] = () + md: markdown.Markdown def run(self, root: etree.Element) -> etree.Element: for el in root: @@ -561,14 +562,15 @@ class _ExtractTitleTreeprocessor(markdown.treeprocessors.Treeprocessor): # Extract the text only, recursively. title = ''.join(el.itertext()) # Unescape per Markdown implementation details. - for pp in self.postprocessors: - title = pp.run(title) - self.title = title + title = markdown.extensions.toc.stashedHTML2text( + title, self.md, strip_entities=False + ) + self.title = title.strip() break return root def _register(self, md: markdown.Markdown) -> None: - self.postprocessors = tuple(md.postprocessors) + self.md = md md.treeprocessors.register(self, "mkdocs_extract_title", priority=-1) # After the end. diff --git a/mkdocs/structure/toc.py b/mkdocs/structure/toc.py index 6d09867b..e1df40be 100644 --- a/mkdocs/structure/toc.py +++ b/mkdocs/structure/toc.py @@ -33,7 +33,7 @@ class AnchorLink: self.children = [] title: str - """The text of the item.""" + """The text of the item, as HTML.""" @property def url(self) -> str: diff --git a/mkdocs/tests/structure/page_tests.py b/mkdocs/tests/structure/page_tests.py index ee6aa159..77d8542b 100644 --- a/mkdocs/tests/structure/page_tests.py +++ b/mkdocs/tests/structure/page_tests.py @@ -6,9 +6,11 @@ import textwrap import unittest from unittest import mock +import markdown + from mkdocs.config.defaults import MkDocsConfig from mkdocs.structure.files import File, Files -from mkdocs.structure.pages import Page, _RelativePathTreeprocessor +from mkdocs.structure.pages import Page, _ExtractTitleTreeprocessor, _RelativePathTreeprocessor from mkdocs.tests.base import dedent, tempdir DOCS_DIR = os.path.join( @@ -315,9 +317,16 @@ class PageTests(unittest.TestCase): self.assertEqual(pg.parent, None) self.assertEqual(pg.previous_page, None) self.assertEqual(pg.title, 'Welcome to MkDocs') - pg.render(cfg, fl) + pg.render(cfg, Files([fl])) self.assertEqual(pg.title, 'Welcome to MkDocs') + def _test_extract_title(self, content, expected, extensions={}): + md = markdown.Markdown(extensions=list(extensions.keys()), extension_configs=extensions) + extract_title_ext = _ExtractTitleTreeprocessor() + extract_title_ext._register(md) + md.convert(content) + self.assertEqual(extract_title_ext.title, expected) + _SETEXT_CONTENT = dedent( ''' Welcome to MkDocs Setext @@ -327,46 +336,37 @@ class PageTests(unittest.TestCase): ''' ) - @tempdir(files={'testing_setext_title.md': _SETEXT_CONTENT}) - def test_page_title_from_setext_markdown(self, docs_dir): - cfg = load_config() - fl = File('testing_setext_title.md', docs_dir, docs_dir, use_directory_urls=True) - pg = Page(None, fl, cfg) - self.assertIsNone(pg.title) - pg.read_source(cfg) - self.assertEqual(pg.title, 'Testing setext title') - pg.render(cfg, fl) - self.assertEqual(pg.title, 'Welcome to MkDocs Setext') + def test_page_title_from_setext_markdown(self): + self._test_extract_title( + self._SETEXT_CONTENT, + expected='Welcome to MkDocs Setext', + ) - @tempdir(files={'testing_setext_title.md': _SETEXT_CONTENT}) - def test_page_title_from_markdown_stripped_anchorlinks(self, docs_dir): - cfg = MkDocsConfig() - cfg.site_name = 'example' - cfg.markdown_extensions = {'toc': {'permalink': '&'}} - self.assertEqual(cfg.validate(), ([], [])) - fl = File('testing_setext_title.md', docs_dir, docs_dir, use_directory_urls=True) - pg = Page(None, fl, cfg) - pg.read_source(cfg) - pg.render(cfg, fl) - self.assertEqual(pg.title, 'Welcome to MkDocs Setext') + def test_page_title_from_markdown_stripped_anchorlinks(self): + self._test_extract_title( + self._SETEXT_CONTENT, + extensions={'toc': {'permalink': '&'}}, + expected='Welcome to MkDocs Setext', + ) - _FORMATTING_CONTENT = dedent( - ''' - # \\*Hello --- *beautiful* `world` + def test_page_title_from_markdown_strip_formatting(self): + self._test_extract_title( + '''# \\*Hello --- *beautiful* `wor
`''', + extensions={'smarty': {}}, + expected='*Hello — beautiful wor<dl>', + ) - Hi. - ''' - ) + def test_page_title_from_markdown_strip_raw_html(self): + self._test_extract_title( + '''# Hello world''', + expected='Hello world', + ) - @tempdir(files={'testing_formatting.md': _FORMATTING_CONTENT}) - def test_page_title_from_markdown_strip_formatting(self, docs_dir): - cfg = load_config() - cfg.markdown_extensions.append('smarty') - fl = File('testing_formatting.md', docs_dir, docs_dir, use_directory_urls=True) - pg = Page(None, fl, cfg) - pg.read_source(cfg) - pg.render(cfg, fl) - self.assertEqual(pg.title, '*Hello — beautiful world') + def test_page_title_from_markdown_strip_image(self): + self._test_extract_title( + '''# Hi ![😄](hah.png)''', + expected='Hi', # TODO: Should the alt text of the image be extracted? + ) _ATTRLIST_CONTENT = dedent( ''' @@ -376,24 +376,18 @@ class PageTests(unittest.TestCase): ''' ) - @tempdir(files={'testing_attr_list.md': _ATTRLIST_CONTENT}) - def test_page_title_from_markdown_stripped_attr_list(self, docs_dir): - cfg = load_config() - cfg.markdown_extensions.append('attr_list') - fl = File('testing_attr_list.md', docs_dir, docs_dir, use_directory_urls=True) - pg = Page(None, fl, cfg) - pg.read_source(cfg) - pg.render(cfg, fl) - self.assertEqual(pg.title, 'Welcome to MkDocs Attr') + def test_page_title_from_markdown_stripped_attr_list(self): + self._test_extract_title( + self._ATTRLIST_CONTENT, + extensions={'attr_list': {}}, + expected='Welcome to MkDocs Attr', + ) - @tempdir(files={'testing_attr_list.md': _ATTRLIST_CONTENT}) - def test_page_title_from_markdown_preserved_attr_list(self, docs_dir): - cfg = load_config() - fl = File('testing_attr_list.md', docs_dir, docs_dir, use_directory_urls=True) - pg = Page(None, fl, cfg) - pg.read_source(cfg) - pg.render(cfg, fl) - self.assertEqual(pg.title, 'Welcome to MkDocs Attr { #welcome }') + def test_page_title_from_markdown_preserved_attr_list(self): + self._test_extract_title( + self._ATTRLIST_CONTENT, + expected='Welcome to MkDocs Attr { #welcome }', + ) def test_page_title_from_meta(self): cfg = load_config(docs_dir=DOCS_DIR) @@ -418,7 +412,7 @@ class PageTests(unittest.TestCase): self.assertEqual(pg.previous_page, None) self.assertEqual(pg.title, 'A Page Title') self.assertEqual(pg.toc, []) - pg.render(cfg, fl) + pg.render(cfg, Files([fl])) self.assertEqual(pg.title, 'A Page Title') def test_page_title_from_filename(self): @@ -443,7 +437,7 @@ class PageTests(unittest.TestCase): self.assertEqual(pg.parent, None) self.assertEqual(pg.previous_page, None) self.assertEqual(pg.title, 'Page title') - pg.render(cfg, fl) + pg.render(cfg, Files([fl])) self.assertEqual(pg.title, 'Page title') def test_page_title_from_capitalized_filename(self): @@ -704,7 +698,7 @@ class PageTests(unittest.TestCase): pg.read_source(cfg) self.assertEqual(pg.content, None) self.assertEqual(pg.toc, []) - pg.render(cfg, [fl]) + pg.render(cfg, Files([fl])) self.assertTrue( pg.content.startswith('

Welcome to MkDocs

\n') ) diff --git a/pyproject.toml b/pyproject.toml index 28355b73..71bc189e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "click >=7.0", "Jinja2 >=2.11.1", "markupsafe >=2.0.1", - "Markdown >=3.3.6", + "Markdown >=3.4.1", "PyYAML >=5.1", "watchdog >=2.0", "ghp-import >=1.0", @@ -57,7 +57,7 @@ min-versions = [ "click ==7.0", "Jinja2 ==2.11.1", "markupsafe ==2.0.1", - "Markdown ==3.3.6", + "Markdown ==3.4.1", "PyYAML ==5.1", "watchdog ==2.0", "ghp-import ==1.0",