Mercurial > public > mercurial-scm > hg-stable
view tests/test-encoding-func.py @ 37991:3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
This fixes the weird behavior of toutf8b(), which would convert a local
string back to UTF-8 *only if* it was lossy in the system encoding.
Before b7b26e54e37a "encoding: avoid localstr when a string can be encoded
losslessly (issue2763)", all local strings were wrapped by the localstr
class. I think this would justify the round-trip behavior of toutf8b().
ASCII strings are special-cased, so the cost of wrapping with safelocalstr
is negligible.
(with mercurial repo)
$ export HGRCPATH=/dev/null HGPLAIN= HGENCODING=latin-1
$ hg log --time --config experimental.evolution=all > /dev/null
(original)
time: real 11.340 secs (user 11.290+0.000 sys 0.060+0.000)
time: real 11.390 secs (user 11.300+0.000 sys 0.080+0.000)
time: real 11.430 secs (user 11.360+0.000 sys 0.070+0.000)
(this patch)
time: real 11.200 secs (user 11.100+0.000 sys 0.100+0.000)
time: real 11.370 secs (user 11.300+0.000 sys 0.070+0.000)
time: real 11.190 secs (user 11.130+0.000 sys 0.060+0.000)
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sun, 23 Apr 2017 13:15:30 +0900 |
parents | 57b0c7221dba |
children | 2372284d9457 |
line wrap: on
line source
from __future__ import absolute_import import unittest from mercurial import ( encoding, ) class IsasciistrTest(unittest.TestCase): asciistrs = [ b'a', b'ab', b'abc', b'abcd', b'abcde', b'abcdefghi', b'abcd\0fghi', ] def testascii(self): for s in self.asciistrs: self.assertTrue(encoding.isasciistr(s)) def testnonasciichar(self): for s in self.asciistrs: for i in range(len(s)): t = bytearray(s) t[i] |= 0x80 self.assertFalse(encoding.isasciistr(bytes(t))) class LocalEncodingTest(unittest.TestCase): def testasciifastpath(self): s = b'\0' * 100 self.assertTrue(s is encoding.tolocal(s)) self.assertTrue(s is encoding.fromlocal(s)) class Utf8bEncodingTest(unittest.TestCase): def setUp(self): self.origencoding = encoding.encoding def tearDown(self): encoding.encoding = self.origencoding def testasciifastpath(self): s = b'\0' * 100 self.assertTrue(s is encoding.toutf8b(s)) self.assertTrue(s is encoding.fromutf8b(s)) def testlossylatin(self): encoding.encoding = b'ascii' s = u'\xc0'.encode('utf-8') l = encoding.tolocal(s) self.assertEqual(l, b'?') # lossy self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved def testlosslesslatin(self): encoding.encoding = b'latin-1' s = u'\xc0'.encode('utf-8') l = encoding.tolocal(s) self.assertEqual(l, b'\xc0') # lossless self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8 def testlossy0xed(self): encoding.encoding = b'euc-kr' # U+Dxxx Hangul s = u'\ud1bc\xc0'.encode('utf-8') l = encoding.tolocal(s) self.assertIn(b'\xed', l) self.assertTrue(l.endswith(b'?')) # lossy self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved def testlossless0xed(self): encoding.encoding = b'euc-kr' # U+Dxxx Hangul s = u'\ud1bc'.encode('utf-8') l = encoding.tolocal(s) self.assertEqual(l, b'\xc5\xed') # lossless self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8 if __name__ == '__main__': import silenttestrunner silenttestrunner.main(__name__)