Strip all HTML when getting the title from the first H1 tag (#3564)

Leaving the HTML in the H1-derived title was a bug, and it was also inconsistent with how ToC titles are extracted.
This commit is contained in:
Oleh Prypin
2024-02-08 19:17:16 +01:00
committed by GitHub
parent d6fcc56a3e
commit e755aaed7e
4 changed files with 62 additions and 66 deletions

View File

@@ -10,6 +10,7 @@ from urllib.parse import unquote as urlunquote
from urllib.parse import urljoin, urlsplit, urlunsplit from urllib.parse import urljoin, urlsplit, urlunsplit
import markdown import markdown
import markdown.extensions.toc
import markdown.htmlparser # type: ignore import markdown.htmlparser # type: ignore
import markdown.postprocessors import markdown.postprocessors
import markdown.treeprocessors import markdown.treeprocessors
@@ -549,7 +550,7 @@ class _HTMLHandler(markdown.htmlparser.htmlparser.HTMLParser): # type: ignore[n
class _ExtractTitleTreeprocessor(markdown.treeprocessors.Treeprocessor): class _ExtractTitleTreeprocessor(markdown.treeprocessors.Treeprocessor):
title: str | None = None title: str | None = None
postprocessors: Sequence[markdown.postprocessors.Postprocessor] = () md: markdown.Markdown
def run(self, root: etree.Element) -> etree.Element: def run(self, root: etree.Element) -> etree.Element:
for el in root: for el in root:
@@ -561,14 +562,15 @@ class _ExtractTitleTreeprocessor(markdown.treeprocessors.Treeprocessor):
# Extract the text only, recursively. # Extract the text only, recursively.
title = ''.join(el.itertext()) title = ''.join(el.itertext())
# Unescape per Markdown implementation details. # Unescape per Markdown implementation details.
for pp in self.postprocessors: title = markdown.extensions.toc.stashedHTML2text(
title = pp.run(title) title, self.md, strip_entities=False
self.title = title )
self.title = title.strip()
break break
return root return root
def _register(self, md: markdown.Markdown) -> None: def _register(self, md: markdown.Markdown) -> None:
self.postprocessors = tuple(md.postprocessors) self.md = md
md.treeprocessors.register(self, "mkdocs_extract_title", priority=-1) # After the end. md.treeprocessors.register(self, "mkdocs_extract_title", priority=-1) # After the end.

View File

@@ -33,7 +33,7 @@ class AnchorLink:
self.children = [] self.children = []
title: str title: str
"""The text of the item.""" """The text of the item, as HTML."""
@property @property
def url(self) -> str: def url(self) -> str:

View File

@@ -6,9 +6,11 @@ import textwrap
import unittest import unittest
from unittest import mock from unittest import mock
import markdown
from mkdocs.config.defaults import MkDocsConfig from mkdocs.config.defaults import MkDocsConfig
from mkdocs.structure.files import File, Files from mkdocs.structure.files import File, Files
from mkdocs.structure.pages import Page, _RelativePathTreeprocessor from mkdocs.structure.pages import Page, _ExtractTitleTreeprocessor, _RelativePathTreeprocessor
from mkdocs.tests.base import dedent, tempdir from mkdocs.tests.base import dedent, tempdir
DOCS_DIR = os.path.join( DOCS_DIR = os.path.join(
@@ -315,9 +317,16 @@ class PageTests(unittest.TestCase):
self.assertEqual(pg.parent, None) self.assertEqual(pg.parent, None)
self.assertEqual(pg.previous_page, None) self.assertEqual(pg.previous_page, None)
self.assertEqual(pg.title, 'Welcome to MkDocs') self.assertEqual(pg.title, 'Welcome to MkDocs')
pg.render(cfg, fl) pg.render(cfg, Files([fl]))
self.assertEqual(pg.title, 'Welcome to MkDocs') self.assertEqual(pg.title, 'Welcome to MkDocs')
def _test_extract_title(self, content, expected, extensions={}):
md = markdown.Markdown(extensions=list(extensions.keys()), extension_configs=extensions)
extract_title_ext = _ExtractTitleTreeprocessor()
extract_title_ext._register(md)
md.convert(content)
self.assertEqual(extract_title_ext.title, expected)
_SETEXT_CONTENT = dedent( _SETEXT_CONTENT = dedent(
''' '''
Welcome to MkDocs Setext Welcome to MkDocs Setext
@@ -327,46 +336,37 @@ class PageTests(unittest.TestCase):
''' '''
) )
@tempdir(files={'testing_setext_title.md': _SETEXT_CONTENT}) def test_page_title_from_setext_markdown(self):
def test_page_title_from_setext_markdown(self, docs_dir): self._test_extract_title(
cfg = load_config() self._SETEXT_CONTENT,
fl = File('testing_setext_title.md', docs_dir, docs_dir, use_directory_urls=True) expected='Welcome to MkDocs Setext',
pg = Page(None, fl, cfg) )
self.assertIsNone(pg.title)
pg.read_source(cfg)
self.assertEqual(pg.title, 'Testing setext title')
pg.render(cfg, fl)
self.assertEqual(pg.title, 'Welcome to MkDocs Setext')
@tempdir(files={'testing_setext_title.md': _SETEXT_CONTENT}) def test_page_title_from_markdown_stripped_anchorlinks(self):
def test_page_title_from_markdown_stripped_anchorlinks(self, docs_dir): self._test_extract_title(
cfg = MkDocsConfig() self._SETEXT_CONTENT,
cfg.site_name = 'example' extensions={'toc': {'permalink': '&'}},
cfg.markdown_extensions = {'toc': {'permalink': '&'}} expected='Welcome to MkDocs Setext',
self.assertEqual(cfg.validate(), ([], [])) )
fl = File('testing_setext_title.md', docs_dir, docs_dir, use_directory_urls=True)
pg = Page(None, fl, cfg)
pg.read_source(cfg)
pg.render(cfg, fl)
self.assertEqual(pg.title, 'Welcome to MkDocs Setext')
_FORMATTING_CONTENT = dedent( def test_page_title_from_markdown_strip_formatting(self):
''' self._test_extract_title(
# \\*Hello --- *beautiful* `world` '''# \\*Hello --- *beautiful* `wor<dl>`''',
extensions={'smarty': {}},
expected='*Hello &mdash; beautiful wor&lt;dl&gt;',
)
Hi. def test_page_title_from_markdown_strip_raw_html(self):
''' self._test_extract_title(
) '''# Hello <b>world</b>''',
expected='Hello world',
)
@tempdir(files={'testing_formatting.md': _FORMATTING_CONTENT}) def test_page_title_from_markdown_strip_image(self):
def test_page_title_from_markdown_strip_formatting(self, docs_dir): self._test_extract_title(
cfg = load_config() '''# Hi ![😄](hah.png)''',
cfg.markdown_extensions.append('smarty') expected='Hi', # TODO: Should the alt text of the image be extracted?
fl = File('testing_formatting.md', docs_dir, docs_dir, use_directory_urls=True) )
pg = Page(None, fl, cfg)
pg.read_source(cfg)
pg.render(cfg, fl)
self.assertEqual(pg.title, '*Hello &mdash; beautiful world')
_ATTRLIST_CONTENT = dedent( _ATTRLIST_CONTENT = dedent(
''' '''
@@ -376,24 +376,18 @@ class PageTests(unittest.TestCase):
''' '''
) )
@tempdir(files={'testing_attr_list.md': _ATTRLIST_CONTENT}) def test_page_title_from_markdown_stripped_attr_list(self):
def test_page_title_from_markdown_stripped_attr_list(self, docs_dir): self._test_extract_title(
cfg = load_config() self._ATTRLIST_CONTENT,
cfg.markdown_extensions.append('attr_list') extensions={'attr_list': {}},
fl = File('testing_attr_list.md', docs_dir, docs_dir, use_directory_urls=True) expected='Welcome to MkDocs Attr',
pg = Page(None, fl, cfg) )
pg.read_source(cfg)
pg.render(cfg, fl)
self.assertEqual(pg.title, 'Welcome to MkDocs Attr')
@tempdir(files={'testing_attr_list.md': _ATTRLIST_CONTENT}) def test_page_title_from_markdown_preserved_attr_list(self):
def test_page_title_from_markdown_preserved_attr_list(self, docs_dir): self._test_extract_title(
cfg = load_config() self._ATTRLIST_CONTENT,
fl = File('testing_attr_list.md', docs_dir, docs_dir, use_directory_urls=True) expected='Welcome to MkDocs Attr { #welcome }',
pg = Page(None, fl, cfg) )
pg.read_source(cfg)
pg.render(cfg, fl)
self.assertEqual(pg.title, 'Welcome to MkDocs Attr { #welcome }')
def test_page_title_from_meta(self): def test_page_title_from_meta(self):
cfg = load_config(docs_dir=DOCS_DIR) cfg = load_config(docs_dir=DOCS_DIR)
@@ -418,7 +412,7 @@ class PageTests(unittest.TestCase):
self.assertEqual(pg.previous_page, None) self.assertEqual(pg.previous_page, None)
self.assertEqual(pg.title, 'A Page Title') self.assertEqual(pg.title, 'A Page Title')
self.assertEqual(pg.toc, []) self.assertEqual(pg.toc, [])
pg.render(cfg, fl) pg.render(cfg, Files([fl]))
self.assertEqual(pg.title, 'A Page Title') self.assertEqual(pg.title, 'A Page Title')
def test_page_title_from_filename(self): def test_page_title_from_filename(self):
@@ -443,7 +437,7 @@ class PageTests(unittest.TestCase):
self.assertEqual(pg.parent, None) self.assertEqual(pg.parent, None)
self.assertEqual(pg.previous_page, None) self.assertEqual(pg.previous_page, None)
self.assertEqual(pg.title, 'Page title') self.assertEqual(pg.title, 'Page title')
pg.render(cfg, fl) pg.render(cfg, Files([fl]))
self.assertEqual(pg.title, 'Page title') self.assertEqual(pg.title, 'Page title')
def test_page_title_from_capitalized_filename(self): def test_page_title_from_capitalized_filename(self):
@@ -704,7 +698,7 @@ class PageTests(unittest.TestCase):
pg.read_source(cfg) pg.read_source(cfg)
self.assertEqual(pg.content, None) self.assertEqual(pg.content, None)
self.assertEqual(pg.toc, []) self.assertEqual(pg.toc, [])
pg.render(cfg, [fl]) pg.render(cfg, Files([fl]))
self.assertTrue( self.assertTrue(
pg.content.startswith('<h1 id="welcome-to-mkdocs">Welcome to MkDocs</h1>\n') pg.content.startswith('<h1 id="welcome-to-mkdocs">Welcome to MkDocs</h1>\n')
) )

View File

@@ -36,7 +36,7 @@ dependencies = [
"click >=7.0", "click >=7.0",
"Jinja2 >=2.11.1", "Jinja2 >=2.11.1",
"markupsafe >=2.0.1", "markupsafe >=2.0.1",
"Markdown >=3.3.6", "Markdown >=3.4.1",
"PyYAML >=5.1", "PyYAML >=5.1",
"watchdog >=2.0", "watchdog >=2.0",
"ghp-import >=1.0", "ghp-import >=1.0",
@@ -57,7 +57,7 @@ min-versions = [
"click ==7.0", "click ==7.0",
"Jinja2 ==2.11.1", "Jinja2 ==2.11.1",
"markupsafe ==2.0.1", "markupsafe ==2.0.1",
"Markdown ==3.3.6", "Markdown ==3.4.1",
"PyYAML ==5.1", "PyYAML ==5.1",
"watchdog ==2.0", "watchdog ==2.0",
"ghp-import ==1.0", "ghp-import ==1.0",