From 555659cdc32c1a52344b7b00a2f3cace8ce3e68a Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Mon, 5 Jun 2017 19:49:59 -0400 Subject: [PATCH] Open files with `utf-8-sig` to account for BOM. Python simply discards the BOM with `utf-8-sig`. This way, users of Microsoft text editors can have their files properly parsed. In all other ways behaves as reading files using `utf-8` encoding. For more info see: https://docs.python.org/2/library/codecs.html#encodings-and-unicode Fixes #1186. --- mkdocs/nav.py | 2 +- mkdocs/tests/build_tests.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/mkdocs/nav.py b/mkdocs/nav.py index b73879d5..7240ef22 100644 --- a/mkdocs/nav.py +++ b/mkdocs/nav.py @@ -248,7 +248,7 @@ class Page(object): 'page_read_source', None, config=config, page=self) if source is None: try: - with io.open(self.abs_input_path, 'r', encoding='utf-8') as f: + with io.open(self.abs_input_path, 'r', encoding='utf-8-sig') as f: source = f.read() except IOError: log.error('File not found: %s', self.abs_input_path) diff --git a/mkdocs/tests/build_tests.py b/mkdocs/tests/build_tests.py index 4820e92d..ac5f5116 100644 --- a/mkdocs/tests/build_tests.py +++ b/mkdocs/tests/build_tests.py @@ -7,6 +7,7 @@ import shutil import tempfile import unittest import mock +import io try: from itertools import izip as zip @@ -464,3 +465,32 @@ class BuildTests(unittest.TestCase): context = build.get_context(mock.Mock(), cfg) self.assertEqual(context['config']['extra']['a'], 1) + + def test_BOM(self): + docs_dir = tempfile.mkdtemp() + site_dir = tempfile.mkdtemp() + try: + # Create an UTF-8 Encoded file with BOM (as Microsoft editors do). See #1186. + f = io.open(os.path.join(docs_dir, 'index.md'), 'w', encoding='utf-8-sig') + f.write('# An UTF-8 encoded file with a BOM') + f.close() + + cfg = load_config( + docs_dir=docs_dir, + site_dir=site_dir + ) + build.build(cfg) + + # Verify that the file was generated properly. 
+ # If the BOM is not removed, Markdown will return: + # `<p>\ufeff# An UTF-8 encoded file with a BOM</p>`. + f = io.open(os.path.join(site_dir, 'index.html'), 'r', encoding='utf-8') + output = f.read() + f.close() + self.assertTrue( + '<h1 id="an-utf-8-encoded-file-with-a-bom">An UTF-8 encoded file with a BOM</h1>' in output + ) + + finally: + shutil.rmtree(docs_dir) + shutil.rmtree(site_dir)