Mercurial > public > mercurial-scm > hg
comparison tests/test-encoding-func.py @ 37947:3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
This fixes the weird behavior of toutf8b(), which would convert a local
string back to UTF-8 *only if* it was lossy in the system encoding.
Before b7b26e54e37a "encoding: avoid localstr when a string can be encoded
losslessly (issue2763)", all local strings were wrapped by the localstr
class. I think that earlier behavior justifies making toutf8b() round-trip
losslessly encoded strings as well.
ASCII strings are special-cased, so the cost of wrapping with safelocalstr
is negligible.
(benchmark, run in the mercurial repo)
$ export HGRCPATH=/dev/null HGPLAIN= HGENCODING=latin-1
$ hg log --time --config experimental.evolution=all > /dev/null
(original)
time: real 11.340 secs (user 11.290+0.000 sys 0.060+0.000)
time: real 11.390 secs (user 11.300+0.000 sys 0.080+0.000)
time: real 11.430 secs (user 11.360+0.000 sys 0.070+0.000)
(this patch)
time: real 11.200 secs (user 11.100+0.000 sys 0.100+0.000)
time: real 11.370 secs (user 11.300+0.000 sys 0.070+0.000)
time: real 11.190 secs (user 11.130+0.000 sys 0.060+0.000)
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sun, 23 Apr 2017 13:15:30 +0900 |
parents | 57b0c7221dba |
children | 2372284d9457 |
comparison
equal
deleted
inserted
replaced
37946:57b0c7221dba | 37947:3ea3c96ada54 |
---|---|
51 s = u'\xc0'.encode('utf-8') | 51 s = u'\xc0'.encode('utf-8') |
52 l = encoding.tolocal(s) | 52 l = encoding.tolocal(s) |
53 self.assertEqual(l, b'?') # lossy | 53 self.assertEqual(l, b'?') # lossy |
54 self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved | 54 self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved |
55 | 55 |
56 def testlosslesslatin(self): | |
57 encoding.encoding = b'latin-1' | |
58 s = u'\xc0'.encode('utf-8') | |
59 l = encoding.tolocal(s) | |
60 self.assertEqual(l, b'\xc0') # lossless | |
61 self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8 | |
62 | |
56 def testlossy0xed(self): | 63 def testlossy0xed(self): |
57 encoding.encoding = b'euc-kr' # U+Dxxx Hangul | 64 encoding.encoding = b'euc-kr' # U+Dxxx Hangul |
58 s = u'\ud1bc\xc0'.encode('utf-8') | 65 s = u'\ud1bc\xc0'.encode('utf-8') |
59 l = encoding.tolocal(s) | 66 l = encoding.tolocal(s) |
60 self.assertIn(b'\xed', l) | 67 self.assertIn(b'\xed', l) |
61 self.assertTrue(l.endswith(b'?')) # lossy | 68 self.assertTrue(l.endswith(b'?')) # lossy |
62 self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved | 69 self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved |
63 | 70 |
71 def testlossless0xed(self): | |
72 encoding.encoding = b'euc-kr' # U+Dxxx Hangul | |
73 s = u'\ud1bc'.encode('utf-8') | |
74 l = encoding.tolocal(s) | |
75 self.assertEqual(l, b'\xc5\xed') # lossless | |
76 self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8 | |
77 | |
64 if __name__ == '__main__': | 78 if __name__ == '__main__': |
65 import silenttestrunner | 79 import silenttestrunner |
66 silenttestrunner.main(__name__) | 80 silenttestrunner.main(__name__) |