# HG changeset patch # User Yuya Nishihara # Date 1524364733 -32400 # Node ID 57b0c7221dba1a1eee933fcd2c61ace6087053f4 # Parent bfe8ef6e370e3722d31fddc56e1a03a784e27392 encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it If 's' is a localstr, 's._utf8' must be returned to get the original UTF-8 sequence back. Because of this, it was totally wrong to test if '"\xed" not in s', which should be either '"\xed" not in s._utf8' or just omitted. This patch moves the localstr handling to top as the validity of 's._utf8' should be pre-checked by encoding.tolocal(). diff -r bfe8ef6e370e -r 57b0c7221dba mercurial/encoding.py --- a/mercurial/encoding.py Sun Mar 25 16:47:33 2018 +0900 +++ b/mercurial/encoding.py Sun Apr 22 11:38:53 2018 +0900 @@ -504,11 +504,13 @@ internal surrogate encoding as a UTF-8 string.) ''' - if not isinstance(s, localstr) and isasciistr(s): + if isinstance(s, localstr): + # assume that the original UTF-8 sequence would never contain + # invalid characters in U+DCxx range + return s._utf8 + elif isasciistr(s): return s if "\xed" not in s: - if isinstance(s, localstr): - return s._utf8 try: s.decode('utf-8', _utf8strict) return s diff -r bfe8ef6e370e -r 57b0c7221dba tests/test-encoding-func.py --- a/tests/test-encoding-func.py Sun Mar 25 16:47:33 2018 +0900 +++ b/tests/test-encoding-func.py Sun Apr 22 11:38:53 2018 +0900 @@ -35,11 +35,32 @@ self.assertTrue(s is encoding.fromlocal(s)) class Utf8bEncodingTest(unittest.TestCase): + def setUp(self): + self.origencoding = encoding.encoding + + def tearDown(self): + encoding.encoding = self.origencoding + def testasciifastpath(self): s = b'\0' * 100 self.assertTrue(s is encoding.toutf8b(s)) self.assertTrue(s is encoding.fromutf8b(s)) + def testlossylatin(self): + encoding.encoding = b'ascii' + s = u'\xc0'.encode('utf-8') + l = encoding.tolocal(s) + self.assertEqual(l, b'?') # lossy + self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved + + def testlossy0xed(self): + encoding.encoding = b'euc-kr' # U+Dxxx Hangul + s = u'\ud1bc\xc0'.encode('utf-8') + l = encoding.tolocal(s) + self.assertIn(b'\xed', l) + self.assertTrue(l.endswith(b'?')) # lossy + self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved + if __name__ == '__main__': import silenttestrunner silenttestrunner.main(__name__)