comparison tests/test-encoding-func.py @ 37947:3ea3c96ada54

encoding: introduce tagging type for non-lossy non-ASCII string

This fixes the weird behavior of toutf8b(), which would convert a local
string back to UTF-8 *only if* it was lossy in the system encoding.

Before b7b26e54e37a "encoding: avoid localstr when a string can be encoded
losslessly (issue2763)", all local strings were wrapped by the localstr
class. I think this would justify the round-trip behavior of toutf8b().

ASCII strings are special-cased, so the cost of wrapping with safelocalstr
is negligible.

(with mercurial repo)
$ export HGRCPATH=/dev/null HGPLAIN= HGENCODING=latin-1
$ hg log --time --config experimental.evolution=all > /dev/null

(original)
time: real 11.340 secs (user 11.290+0.000 sys 0.060+0.000)
time: real 11.390 secs (user 11.300+0.000 sys 0.080+0.000)
time: real 11.430 secs (user 11.360+0.000 sys 0.070+0.000)

(this patch)
time: real 11.200 secs (user 11.100+0.000 sys 0.100+0.000)
time: real 11.370 secs (user 11.300+0.000 sys 0.070+0.000)
time: real 11.190 secs (user 11.130+0.000 sys 0.060+0.000)
author Yuya Nishihara <yuya@tcha.org>
date Sun, 23 Apr 2017 13:15:30 +0900
parents 57b0c7221dba
children 2372284d9457
comparison
equal deleted inserted replaced
37946:57b0c7221dba 37947:3ea3c96ada54
51 s = u'\xc0'.encode('utf-8') 51 s = u'\xc0'.encode('utf-8')
52 l = encoding.tolocal(s) 52 l = encoding.tolocal(s)
53 self.assertEqual(l, b'?') # lossy 53 self.assertEqual(l, b'?') # lossy
54 self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved 54 self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved
55 55
56 def testlosslesslatin(self):
57 encoding.encoding = b'latin-1'
58 s = u'\xc0'.encode('utf-8')
59 l = encoding.tolocal(s)
60 self.assertEqual(l, b'\xc0') # lossless
61 self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8
62
56 def testlossy0xed(self): 63 def testlossy0xed(self):
57 encoding.encoding = b'euc-kr' # U+Dxxx Hangul 64 encoding.encoding = b'euc-kr' # U+Dxxx Hangul
58 s = u'\ud1bc\xc0'.encode('utf-8') 65 s = u'\ud1bc\xc0'.encode('utf-8')
59 l = encoding.tolocal(s) 66 l = encoding.tolocal(s)
60 self.assertIn(b'\xed', l) 67 self.assertIn(b'\xed', l)
61 self.assertTrue(l.endswith(b'?')) # lossy 68 self.assertTrue(l.endswith(b'?')) # lossy
62 self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved 69 self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved
63 70
71 def testlossless0xed(self):
72 encoding.encoding = b'euc-kr' # U+Dxxx Hangul
73 s = u'\ud1bc'.encode('utf-8')
74 l = encoding.tolocal(s)
75 self.assertEqual(l, b'\xc5\xed') # lossless
76 self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8
77
64 if __name__ == '__main__': 78 if __name__ == '__main__':
65 import silenttestrunner 79 import silenttestrunner
66 silenttestrunner.main(__name__) 80 silenttestrunner.main(__name__)