From 5af8bd30538ff8f0cfb698c8b90c3020da319f92 Mon Sep 17 00:00:00 2001 From: Oleh Prypin Date: Mon, 29 May 2023 21:59:31 +0200 Subject: [PATCH] Infer titles of pages based on full Markdown parsing (#3191) Properly parse Markdown and report the first H1 tag rather than naively looking for `# Stuff` at the start of the doc. This satisfies feature requests such as supporting setext-style headers and fixes attr-list suffixes that failed to be ignored. Co-authored-by: Hendrik Polczynski Co-authored-by: Darrick Herwehe --- docs/user-guide/writing-your-docs.md | 12 ++- mkdocs/structure/pages.py | 116 +++++++++++++++++++-------- mkdocs/tests/structure/page_tests.py | 86 +++++++++++++++++++- mkdocs/utils/__init__.py | 21 +++-- 4 files changed, 189 insertions(+), 46 deletions(-) diff --git a/docs/user-guide/writing-your-docs.md b/docs/user-guide/writing-your-docs.md index 4697abd5..66a10067 100644 --- a/docs/user-guide/writing-your-docs.md +++ b/docs/user-guide/writing-your-docs.md @@ -380,10 +380,14 @@ specific page. The following keys are supported: MkDocs will attempt to determine the title of a document in the following ways, in order: - 1. A title defined in the [nav] configuration setting for a document. - 2. A title defined in the `title` meta-data key of a document. - 3. A level 1 Markdown header on the first line of the document body. Please note that [Setext-style] headers are not supported. - 4. The filename of a document. + 1. A title defined in the [nav] configuration setting for a document. + + 2. A title defined in the `title` meta-data key of a document. + + 3. A level 1 Markdown header on the first line of the document body. + ([Setext-style] headers are supported *only since MkDocs 1.5*.) + + 4. The filename of a document. Upon finding a title for a page, MkDoc does not continue checking any additional sources in the above list. diff --git a/mkdocs/structure/pages.py b/mkdocs/structure/pages.py index b2785687..e5a95874 100644 --- a/mkdocs/structure/pages.py +++ b/mkdocs/structure/pages.py @@ -1,27 +1,36 @@ from __future__ import annotations +import copy import logging import os import posixpath -from typing import TYPE_CHECKING, Any, Mapping, MutableMapping +import warnings +from typing import TYPE_CHECKING, Any, Callable, Mapping, MutableMapping from urllib.parse import unquote as urlunquote from urllib.parse import urljoin, urlsplit, urlunsplit -from xml.etree.ElementTree import Element +from xml.etree import ElementTree as etree import markdown -from markdown.extensions import Extension -from markdown.treeprocessors import Treeprocessor +import markdown.extensions +import markdown.postprocessors +import markdown.treeprocessors from markdown.util import AMP_SUBSTITUTE from mkdocs.structure.files import File, Files from mkdocs.structure.toc import get_toc -from mkdocs.utils import get_build_date, get_markdown_title, meta +from mkdocs.utils import get_build_date, get_markdown_title, meta, weak_property if TYPE_CHECKING: from mkdocs.config.defaults import MkDocsConfig from mkdocs.structure.nav import Section from mkdocs.structure.toc import TableOfContents +_unescape: Callable[[str], str] +try: + _unescape = markdown.treeprocessors.UnescapeTreeprocessor().unescape # type: ignore +except AttributeError: + _unescape = markdown.postprocessors.UnescapePostprocessor().run + log = logging.getLogger(__name__) @@ -32,7 +41,8 @@ class Page: ) -> None: file.page = self self.file = file - self.title = title + if title is not None: + self.title = title # Navigation attributes self.parent = None @@ -50,6 +60,7 @@ class Page: # Placeholders to be filled in later in the build process. self.markdown = None + self._title_from_render: str | None = None self.content = None self.toc = [] # type: ignore self.meta = {} @@ -69,9 +80,6 @@ class Page: def _indent_print(self, depth=0): return '{}{}'.format(' ' * depth, repr(self)) - title: str | None - """Contains the Title for the current page.""" - markdown: str | None """The original Markdown content from the file.""" @@ -226,11 +234,18 @@ class Page: raise self.markdown, self.meta = meta.get_data(source) - self._set_title() def _set_title(self) -> None: + warnings.warn( + "_set_title is no longer used in MkDocs and will be removed soon.", DeprecationWarning + ) + + @weak_property + def title(self) -> str | None: """ - Set the title for a Markdown document. + Returns the title for the current page. + + Before calling `read_source()`, this value is empty. It can also be updated by `render()`. Check these in order and use the first that returns a valid title: - value provided on init (passed in from config) @@ -238,48 +253,56 @@ class Page: - content of the first H1 in Markdown content - convert filename to title """ - if self.title is not None: - return + if self.markdown is None: + return None if 'title' in self.meta: - self.title = self.meta['title'] - return + return self.meta['title'] - assert self.markdown is not None - title = get_markdown_title(self.markdown) + if self._title_from_render: + return self._title_from_render + elif self.content is None: # Preserve legacy behavior only for edge cases in plugins. + title_from_md = get_markdown_title(self.markdown) + if title_from_md is not None: + return title_from_md - if title is None: - if self.is_homepage: - title = 'Home' - else: - title = self.file.name.replace('-', ' ').replace('_', ' ') - # Capitalize if the filename was all lowercase, otherwise leave it as-is. - if title.lower() == title: - title = title.capitalize() + if self.is_homepage: + return 'Home' - self.title = title + title = self.file.name.replace('-', ' ').replace('_', ' ') + # Capitalize if the filename was all lowercase, otherwise leave it as-is. + if title.lower() == title: + title = title.capitalize() + return title def render(self, config: MkDocsConfig, files: Files) -> None: """ Convert the Markdown source file to HTML as per the config. """ - extensions = [_RelativePathExtension(self.file, files), *config['markdown_extensions']] + if self.markdown is None: + raise RuntimeError("`markdown` field hasn't been set (via `read_source`)") + relative_path_extension = _RelativePathExtension(self.file, files) + extract_title_extension = _ExtractTitleExtension() md = markdown.Markdown( - extensions=extensions, + extensions=[ + relative_path_extension, + extract_title_extension, + *config['markdown_extensions'], + ], extension_configs=config['mdx_configs'] or {}, ) - assert self.markdown is not None self.content = md.convert(self.markdown) self.toc = get_toc(getattr(md, 'toc_tokens', [])) + self._title_from_render = extract_title_extension.title -class _RelativePathTreeprocessor(Treeprocessor): +class _RelativePathTreeprocessor(markdown.treeprocessors.Treeprocessor): def __init__(self, file: File, files: Files) -> None: self.file = file self.files = files - def run(self, root: Element) -> Element: + def run(self, root: etree.Element) -> etree.Element: """ Update urls on anchors and images to make them relative @@ -335,7 +358,7 @@ class _RelativePathTreeprocessor(Treeprocessor): return urlunsplit(components) -class _RelativePathExtension(Extension): +class _RelativePathExtension(markdown.extensions.Extension): """ The Extension class is what we pass to markdown, it then registers the Treeprocessor. @@ -348,3 +371,32 @@ class _RelativePathExtension(Extension): def extendMarkdown(self, md: markdown.Markdown) -> None: relpath = _RelativePathTreeprocessor(self.file, self.files) md.treeprocessors.register(relpath, "relpath", 0) + + +class _ExtractTitleExtension(markdown.extensions.Extension): + def __init__(self) -> None: + self.title: str | None = None + + def extendMarkdown(self, md: markdown.Markdown) -> None: + md.treeprocessors.register( + _ExtractTitleTreeprocessor(self), + "mkdocs_extract_title", + priority=1, # Close to the end. + ) + + +class _ExtractTitleTreeprocessor(markdown.treeprocessors.Treeprocessor): + def __init__(self, ext: _ExtractTitleExtension) -> None: + self.ext = ext + + def run(self, root: etree.Element) -> etree.Element: + for el in root: + if el.tag == 'h1': + # Drop anchorlink from the element, if present. + if len(el) > 0 and el[-1].tag == 'a' and not (el.tail or '').strip(): + el = copy.copy(el) + del el[-1] + # Extract the text only, recursively. + self.ext.title = _unescape(''.join(el.itertext())) + break + return root diff --git a/mkdocs/tests/structure/page_tests.py b/mkdocs/tests/structure/page_tests.py index 9ccd9ff8..30a90f68 100644 --- a/mkdocs/tests/structure/page_tests.py +++ b/mkdocs/tests/structure/page_tests.py @@ -4,6 +4,7 @@ import sys import unittest from unittest import mock +from mkdocs.config.defaults import MkDocsConfig from mkdocs.structure.files import File, Files from mkdocs.structure.pages import Page from mkdocs.tests.base import dedent, load_config, tempdir @@ -299,7 +300,84 @@ class PageTests(unittest.TestCase): self.assertEqual(pg.parent, None) self.assertEqual(pg.previous_page, None) self.assertEqual(pg.title, 'Welcome to MkDocs') - self.assertEqual(pg.toc, []) + pg.render(cfg, fl) + self.assertEqual(pg.title, 'Welcome to MkDocs') + + _SETEXT_CONTENT = dedent( + ''' + Welcome to MkDocs Setext + ======================== + + This tests extracting a setext style title. + ''' + ) + + @tempdir(files={'testing_setext_title.md': _SETEXT_CONTENT}) + def test_page_title_from_setext_markdown(self, docs_dir): + cfg = load_config() + fl = File('testing_setext_title.md', docs_dir, docs_dir, use_directory_urls=True) + pg = Page(None, fl, cfg) + self.assertIsNone(pg.title) + pg.read_source(cfg) + self.assertEqual(pg.title, 'Testing setext title') + pg.render(cfg, fl) + self.assertEqual(pg.title, 'Welcome to MkDocs Setext') + + @tempdir(files={'testing_setext_title.md': _SETEXT_CONTENT}) + def test_page_title_from_markdown_stripped_anchorlinks(self, docs_dir): + cfg = MkDocsConfig() + cfg.site_name = 'example' + cfg.markdown_extensions = {'toc': {'permalink': '&'}} + self.assertEqual(cfg.validate(), ([], [])) + fl = File('testing_setext_title.md', docs_dir, docs_dir, use_directory_urls=True) + pg = Page(None, fl, cfg) + pg.read_source(cfg) + pg.render(cfg, fl) + self.assertEqual(pg.title, 'Welcome to MkDocs Setext') + + _FORMATTING_CONTENT = dedent( + ''' + # Hello *beautiful* `world` + + Hi. + ''' + ) + + @tempdir(files={'testing_formatting.md': _FORMATTING_CONTENT}) + def test_page_title_from_markdown_strip_formatting(self, docs_dir): + cfg = load_config() + fl = File('testing_formatting.md', docs_dir, docs_dir, use_directory_urls=True) + pg = Page(None, fl, cfg) + pg.read_source(cfg) + pg.render(cfg, fl) + self.assertEqual(pg.title, 'Hello beautiful world') + + _ATTRLIST_CONTENT = dedent( + ''' + # Welcome to MkDocs Attr { #welcome } + + This tests extracting the title, with enabled attr_list markdown_extension. + ''' + ) + + @tempdir(files={'testing_attr_list.md': _ATTRLIST_CONTENT}) + def test_page_title_from_markdown_stripped_attr_list(self, docs_dir): + cfg = load_config() + cfg.markdown_extensions.append('attr_list') + fl = File('testing_attr_list.md', docs_dir, docs_dir, use_directory_urls=True) + pg = Page(None, fl, cfg) + pg.read_source(cfg) + pg.render(cfg, fl) + self.assertEqual(pg.title, 'Welcome to MkDocs Attr') + + @tempdir(files={'testing_attr_list.md': _ATTRLIST_CONTENT}) + def test_page_title_from_markdown_preserved_attr_list(self, docs_dir): + cfg = load_config() + fl = File('testing_attr_list.md', docs_dir, docs_dir, use_directory_urls=True) + pg = Page(None, fl, cfg) + pg.read_source(cfg) + pg.render(cfg, fl) + self.assertEqual(pg.title, 'Welcome to MkDocs Attr { #welcome }') def test_page_title_from_meta(self): cfg = load_config(docs_dir=self.DOCS_DIR) @@ -324,6 +402,8 @@ class PageTests(unittest.TestCase): self.assertEqual(pg.previous_page, None) self.assertEqual(pg.title, 'A Page Title') self.assertEqual(pg.toc, []) + pg.render(cfg, fl) + self.assertEqual(pg.title, 'A Page Title') def test_page_title_from_filename(self): cfg = load_config(docs_dir=self.DOCS_DIR) @@ -347,7 +427,8 @@ class PageTests(unittest.TestCase): self.assertEqual(pg.parent, None) self.assertEqual(pg.previous_page, None) self.assertEqual(pg.title, 'Page title') - self.assertEqual(pg.toc, []) + pg.render(cfg, fl) + self.assertEqual(pg.title, 'Page title') def test_page_title_from_capitalized_filename(self): cfg = load_config(docs_dir=self.DOCS_DIR) @@ -371,7 +452,6 @@ class PageTests(unittest.TestCase): self.assertEqual(pg.parent, None) self.assertEqual(pg.previous_page, None) self.assertEqual(pg.title, 'pageTitle') - self.assertEqual(pg.toc, []) def test_page_title_from_homepage_filename(self): cfg = load_config(docs_dir=self.DOCS_DIR) diff --git a/mkdocs/utils/__init__.py b/mkdocs/utils/__init__.py index 3eb55584..5ebf3731 100644 --- a/mkdocs/utils/__init__.py +++ b/mkdocs/utils/__init__.py @@ -385,13 +385,7 @@ def dirname_to_title(dirname: str) -> str: def get_markdown_title(markdown_src: str) -> str | None: - """ - Get the title of a Markdown document. The title in this case is considered - to be a H1 that occurs before any other content in the document. - The procedure is then to iterate through the lines, stopping at the first - non-whitespace content. If it is a title, return that, otherwise return - None. - """ + """Soft-deprecated, do not use.""" lines = markdown_src.replace('\r\n', '\n').replace('\r', '\n').split('\n') while lines: line = lines.pop(0).strip() @@ -460,6 +454,19 @@ class CountHandler(logging.NullHandler): return [(logging.getLevelName(k), v) for k, v in sorted(self.counts.items(), reverse=True)] +class weak_property: + """Same as a read-only property, but allows overwriting the field for good.""" + + def __init__(self, func): + self.func = func + self.__doc__ = func.__doc__ + + def __get__(self, instance, owner=None): + if instance is None: + return self + return self.func(instance) + + def __getattr__(name: str): if name == 'warning_filter': warnings.warn(