Make sitemap.xml.gz slightly more reproducible (#3460)

Use the latest page update time instead of the current time

The timestamp embedded in the gzip file (its mtime) will now change only once per day, because it is derived from the pages' update date. The sitemap.xml itself already changes only once per day.
This commit is contained in:
Oleh Prypin
2023-12-08 21:39:14 +01:00
committed by GitHub
parent 64cab0d77f
commit ccf011db79
4 changed files with 18 additions and 15 deletions

View File

@@ -110,7 +110,9 @@ def _build_theme_template(
log.debug(f"Gzipping template: {template_name}")
gz_filename = f'{output_path}.gz'
with open(gz_filename, 'wb') as f:
timestamp = utils.get_build_timestamp()
timestamp = utils.get_build_timestamp(
pages=[f.page for f in files.documentation_pages() if f.page is not None]
)
with gzip.GzipFile(
fileobj=f, filename=gz_filename, mode='wb', mtime=timestamp
) as gz_buf:

View File

@@ -42,7 +42,7 @@ class Page(StructureItem):
self.next_page = None
self.active = False
self.update_date = get_build_date()
self.update_date: str = get_build_date()
self._set_canonical_url(config.get('site_url', None))
self._set_edit_url(

View File

@@ -225,7 +225,7 @@ class BuildTests(PathAssertionMixin, unittest.TestCase):
def test_build_theme_template(self, mock_build_template, mock_write_file):
cfg = load_config()
env = cfg.theme.get_env()
build._build_theme_template('main.html', env, mock.Mock(), cfg, mock.Mock())
build._build_theme_template('main.html', env, Files([]), cfg, mock.Mock())
mock_write_file.assert_called_once()
mock_build_template.assert_called_once()
@@ -238,7 +238,7 @@ class BuildTests(PathAssertionMixin, unittest.TestCase):
):
cfg = load_config(site_dir=site_dir)
env = cfg.theme.get_env()
build._build_theme_template('sitemap.xml', env, mock.Mock(), cfg, mock.Mock())
build._build_theme_template('sitemap.xml', env, Files([]), cfg, mock.Mock())
mock_write_file.assert_called_once()
mock_build_template.assert_called_once()
mock_gzip_gzipfile.assert_called_once()
@@ -249,7 +249,7 @@ class BuildTests(PathAssertionMixin, unittest.TestCase):
cfg = load_config()
env = cfg.theme.get_env()
with self.assertLogs('mkdocs') as cm:
build._build_theme_template('missing.html', env, mock.Mock(), cfg, mock.Mock())
build._build_theme_template('missing.html', env, Files([]), cfg, mock.Mock())
self.assertEqual(
'\n'.join(cm.output),
"WARNING:mkdocs.commands.build:Template skipped: 'missing.html' not found in theme directories.",
@@ -263,7 +263,7 @@ class BuildTests(PathAssertionMixin, unittest.TestCase):
cfg = load_config()
env = cfg.theme.get_env()
with self.assertLogs('mkdocs') as cm:
build._build_theme_template('main.html', env, mock.Mock(), cfg, mock.Mock())
build._build_theme_template('main.html', env, Files([]), cfg, mock.Mock())
self.assertEqual(
'\n'.join(cm.output),
"INFO:mkdocs.commands.build:Template skipped: 'main.html' generated empty output.",

View File

@@ -44,18 +44,19 @@ markdown_extensions = (
)
def get_build_timestamp() -> int:
def get_build_timestamp(*, pages: Collection[Page] | None = None) -> int:
"""
Returns the number of seconds since the epoch.
Returns the number of seconds since the epoch for the latest updated page.
Support SOURCE_DATE_EPOCH environment variable for reproducible builds.
See https://reproducible-builds.org/specs/source-date-epoch/
In reality this is just today's date because that's how pages' update time is populated.
"""
source_date_epoch = os.environ.get('SOURCE_DATE_EPOCH')
if source_date_epoch is None:
return int(datetime.now(timezone.utc).timestamp())
return int(source_date_epoch)
if pages:
# Lexicographic comparison is OK for ISO date.
date_string = max(p.update_date for p in pages)
dt = datetime.fromisoformat(date_string)
else:
dt = get_build_datetime()
return int(dt.timestamp())
def get_build_datetime() -> datetime: