Open files with utf-8-sig to account for BOM.

Python simply discards the BOM with `utf-8-sig`. This way, users of
Microsoft text editors can have their files properly parsed. In all other
ways, it behaves the same as reading files using the `utf-8` encoding. For more info see:
https://docs.python.org/2/library/codecs.html#encodings-and-unicode

Fixes #1186.
This commit is contained in:
Waylan Limberg
2017-06-05 19:49:59 -04:00
parent 31e7c29784
commit 555659cdc3
2 changed files with 31 additions and 1 deletions

View File

@@ -248,7 +248,7 @@ class Page(object):
'page_read_source', None, config=config, page=self)
if source is None:
try:
with io.open(self.abs_input_path, 'r', encoding='utf-8') as f:
with io.open(self.abs_input_path, 'r', encoding='utf-8-sig') as f:
source = f.read()
except IOError:
log.error('File not found: %s', self.abs_input_path)

View File

@@ -7,6 +7,7 @@ import shutil
import tempfile
import unittest
import mock
import io
try:
from itertools import izip as zip
@@ -464,3 +465,32 @@ class BuildTests(unittest.TestCase):
context = build.get_context(mock.Mock(), cfg)
self.assertEqual(context['config']['extra']['a'], 1)
def test_BOM(self):
    """Check that a source file starting with a UTF-8 BOM builds correctly.

    Regression test for #1186: writes ``index.md`` using the
    ``utf-8-sig`` codec (which prepends a BOM), builds the site into a
    temporary directory, and asserts that the first line was rendered as
    an ``<h1>`` heading in the generated ``index.html``.
    """
    docs_dir = tempfile.mkdtemp()
    site_dir = tempfile.mkdtemp()
    try:
        # Create a UTF-8 encoded file with a BOM (as Microsoft editors do). See #1186.
        # The context manager guarantees the handle is flushed and closed
        # before the build reads the file, even if the write fails.
        with io.open(os.path.join(docs_dir, 'index.md'), 'w', encoding='utf-8-sig') as f:
            f.write('# An UTF-8 encoded file with a BOM')

        cfg = load_config(
            docs_dir=docs_dir,
            site_dir=site_dir
        )
        build.build(cfg)

        # Verify that the file was generated properly.
        # If the BOM is not removed, Markdown will return:
        # `<p>\ufeff# An UTF-8 encoded file with a BOM</p>`.
        with io.open(os.path.join(site_dir, 'index.html'), 'r', encoding='utf-8') as f:
            output = f.read()

        # assertIn reports the actual output on failure, unlike
        # assertTrue(... in ...), which only reports "False is not true".
        self.assertIn(
            '<h1 id="an-utf-8-encoded-file-with-a-bom">An UTF-8 encoded file with a BOM</h1>',
            output
        )
    finally:
        # Always remove the temporary directories, whether or not the build
        # or the assertion succeeded.
        shutil.rmtree(docs_dir)
        shutil.rmtree(site_dir)