Mercurial > public > mercurial-scm > hg-stable
annotate mercurial/encoding.py @ 38823:e7aa113b14f7
global: use pycompat.xrange()

On Python 3, our module importer automatically rewrites xrange()
to pycompat.xrange().

We want to move away from the custom importer on Python 3.
This commit converts all instances of xrange() to use
pycompat.xrange().

Differential Revision: https://phab.mercurial-scm.org/D4032
author | Gregory Szorc <gregory.szorc@gmail.com> |
---|---|
date | Wed, 01 Aug 2018 13:00:45 -0700 |
parents | 7acec9408e1c |
children | 24e493ec2229 |
rev | line source |
---|---|
8226
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
1 # encoding.py - character transcoding support for Mercurial |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
2 # |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
4 # |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
5 # This software may be used and distributed according to the terms of the |
10263 | 6 # GNU General Public License version 2 or any later version. |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
7 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
8 from __future__ import absolute_import, print_function |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
9 |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
10 import locale |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
11 import os |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
12 import unicodedata |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
13 |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
14 from . import ( |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
15 error, |
32411
df448de7cf3b
parsers: switch to policy importer
Yuya Nishihara <yuya@tcha.org>
parents:
32339
diff
changeset
|
16 policy, |
30031
0f6d6fdd3c2a
pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents:
28508
diff
changeset
|
17 pycompat, |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
18 ) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
19 |
33942
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
20 from .pure import ( |
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
21 charencode as charencodepure, |
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
22 ) |
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
23 |
33782
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
24 charencode = policy.importmod(r'charencode') |
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
25 |
33944
f4433f2713d0
encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents:
33943
diff
changeset
|
26 isasciistr = charencode.isasciistr |
33782
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
27 asciilower = charencode.asciilower |
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
28 asciiupper = charencode.asciiupper |
33943
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33942
diff
changeset
|
29 _jsonescapeu8fast = charencode.jsonescapeu8fast |
33782
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
30 |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
31 _sysstr = pycompat.sysstr |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
32 |
30031
0f6d6fdd3c2a
pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents:
28508
diff
changeset
|
33 if pycompat.ispy3: |
28507
9bcbd9412225
encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents:
28069
diff
changeset
|
34 unichr = chr |
9bcbd9412225
encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents:
28069
diff
changeset
|
35 |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
36 # These unicode characters are ignored by HFS+ (Apple Technote 1150, |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
37 # "Unicode Subtleties"), so we need to ignore them in some places for |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
38 # sanity. |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
39 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
40 "200c 200d 200e 200f 202a 202b 202c 202d 202e " |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
41 "206a 206b 206c 206d 206e 206f feff".split()] |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
42 # verify the next function will work |
32339
7040f5131454
encoding: use i.startswith() instead of i[0] to eliminate py2/3 divergence
Yuya Nishihara <yuya@tcha.org>
parents:
32331
diff
changeset
|
43 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore) |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
44 |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
45 def hfsignoreclean(s): |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
46 """Remove codepoints ignored by HFS+ from s. |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
47 |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
48 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8')) |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
49 '.hg' |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
50 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8')) |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
51 '.hg' |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
52 """ |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
53 if "\xe2" in s or "\xef" in s: |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
54 for c in _ignore: |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
55 s = s.replace(c, '') |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
56 return s |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
57 |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
58 # encoding.environ is provided read-only, which may not be used to modify |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
59 # the process environment |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
60 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ) |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
61 if not pycompat.ispy3: |
32231
cf424dae5dc7
check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents:
32205
diff
changeset
|
62 environ = os.environ # re-exports |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
63 elif _nativeenviron: |
32231
cf424dae5dc7
check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents:
32205
diff
changeset
|
64 environ = os.environb # re-exports |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
65 else: |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
66 # preferred encoding isn't known yet; use utf-8 to avoid unicode error |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
67 # and recreate it once encoding is settled |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
68 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8')) |
32231
cf424dae5dc7
check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents:
32205
diff
changeset
|
69 for k, v in os.environ.items()) # re-exports |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
70 |
11892
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
71 _encodingfixers = { |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
72 '646': lambda: 'ascii', |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
73 'ANSI_X3.4-1968': lambda: 'ascii', |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
74 } |
37883
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
75 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2. |
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
76 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3. |
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
77 # https://bugs.python.org/issue13216 |
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
78 if pycompat.iswindows and not pycompat.ispy3: |
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
79 _encodingfixers['cp65001'] = lambda: 'utf-8' |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
80 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
81 try: |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
82 encoding = environ.get("HGENCODING") |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
83 if not encoding: |
30627
ce36fa9b140c
py3: make sure encoding.encoding is a bytes variable
Pulkit Goyal <7895pulkit@gmail.com>
parents:
30034
diff
changeset
|
84 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii' |
11892
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
85 encoding = _encodingfixers.get(encoding, lambda: encoding)() |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
86 except locale.Error: |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
87 encoding = 'ascii' |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
88 encodingmode = environ.get("HGENCODINGMODE", "strict") |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
89 fallbackencoding = 'ISO-8859-1' |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
90 |
33832
dabe1f11ae3a
py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents:
33782
diff
changeset
|
91 class localstr(bytes): |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
92 '''This class allows strings that are unmodified to be |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
93 round-tripped to the local encoding and back''' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
94 def __new__(cls, u, l): |
33832
dabe1f11ae3a
py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents:
33782
diff
changeset
|
95 s = bytes.__new__(cls, l) |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
96 s._utf8 = u |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
97 return s |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
98 def __hash__(self): |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
99 return hash(self._utf8) # avoid collisions in local string space |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
100 |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
101 class safelocalstr(bytes): |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
102 """Tagged string denoting it was previously an internal UTF-8 string, |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
103 and can be converted back to UTF-8 losslessly |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
104 |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
105 >>> assert safelocalstr(b'\\xc3') == b'\\xc3' |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
106 >>> assert b'\\xc3' == safelocalstr(b'\\xc3') |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
107 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0} |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
108 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0} |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
109 """ |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
110 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
111 def tolocal(s): |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
112 """ |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
113 Convert a string from internal UTF-8 to local encoding |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
114 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
115 All internal strings should be UTF-8 but some repos before the |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
116 implementation of locale support may contain latin1 or possibly |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
117 other character sets. We attempt to decode everything strictly |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
118 using UTF-8, then Latin-1, and failing that, we use UTF-8 and |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
119 replace unknown characters. |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
120 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
121 The localstr class is used to cache the known UTF-8 encoding of |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
122 strings next to their local representation to allow lossless |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
123 round-trip conversion back to UTF-8. |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
124 |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
125 >>> u = b'foo: \\xc3\\xa4' # utf-8 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
126 >>> l = tolocal(u) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
127 >>> l |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
128 'foo: ?' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
129 >>> fromlocal(l) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
130 'foo: \\xc3\\xa4' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
131 >>> u2 = b'foo: \\xc3\\xa1' |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
132 >>> d = { l: 1, tolocal(u2): 2 } |
18378
404feac78b8a
tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents:
17424
diff
changeset
|
133 >>> len(d) # no collision |
404feac78b8a
tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents:
17424
diff
changeset
|
134 2 |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
135 >>> b'foo: ?' in d |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
136 False |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
137 >>> l1 = b'foo: \\xe4' # historical latin1 fallback |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
138 >>> l = tolocal(l1) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
139 >>> l |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
140 'foo: ?' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
141 >>> fromlocal(l) # magically in utf-8 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
142 'foo: \\xc3\\xa4' |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
143 """ |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
144 |
33945
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
145 if isasciistr(s): |
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
146 return s |
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
147 |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
148 try: |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
149 try: |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
150 # make sure string is actually stored in UTF-8 |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
151 u = s.decode('UTF-8') |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
152 if encoding == 'UTF-8': |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
153 # fast path |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
154 return s |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
155 r = u.encode(_sysstr(encoding), u"replace") |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
156 if u == r.decode(_sysstr(encoding)): |
13940
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
157 # r is a safe, non-lossy encoding of s |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
158 return safelocalstr(r) |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
159 return localstr(s, r) |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
160 except UnicodeDecodeError: |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
161 # we should only get here if we're looking at an ancient changeset |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
162 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
163 u = s.decode(_sysstr(fallbackencoding)) |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
164 r = u.encode(_sysstr(encoding), u"replace") |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
165 if u == r.decode(_sysstr(encoding)): |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
166 # r is a safe, non-lossy encoding of s |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
167 return safelocalstr(r) |
13940
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
168 return localstr(u.encode('UTF-8'), r) |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
169 except UnicodeDecodeError: |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
170 u = s.decode("utf-8", "replace") # last ditch |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
171 # can't round-trip |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
172 return u.encode(_sysstr(encoding), u"replace") |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
173 except LookupError as k: |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
174 raise error.Abort(k, hint="please check your locale settings") |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
175 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
176 def fromlocal(s): |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
177 """ |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
178 Convert a string from the local character encoding to UTF-8 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
179 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
180 We attempt to decode strings using the encoding mode set by |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
181 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
182 characters will cause an error message. Other modes include |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
183 'replace', which replaces unknown characters with a special |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
184 Unicode character, and 'ignore', which drops the character. |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
185 """ |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
186 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
187 # can we do a lossless round-trip? |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
188 if isinstance(s, localstr): |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
189 return s._utf8 |
33945
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
190 if isasciistr(s): |
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
191 return s |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
192 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
193 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
194 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
195 return u.encode("utf-8") |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
196 except UnicodeDecodeError as inst: |
10282
08a0f04b56bd
many, many trivial check-code fixups
Matt Mackall <mpm@selenic.com>
parents:
10263
diff
changeset
|
197 sub = s[max(0, inst.start - 10):inst.start + 10] |
36578
3696efeab66f
py3: don't crash when re-raising encoding error
Yuya Nishihara <yuya@tcha.org>
parents:
34225
diff
changeset
|
198 raise error.Abort("decoding near '%s': %s!" |
3696efeab66f
py3: don't crash when re-raising encoding error
Yuya Nishihara <yuya@tcha.org>
parents:
34225
diff
changeset
|
199 % (sub, pycompat.bytestr(inst))) |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
200 except LookupError as k: |
15769
afdf4f5bac61
encoding: use hint markup for "please check your locale settings"
Mads Kiilerich <mads@kiilerich.com>
parents:
15672
diff
changeset
|
201 raise error.Abort(k, hint="please check your locale settings") |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
202 |
31456
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
203 def unitolocal(u): |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
204 """Convert a unicode string to a byte string of local encoding""" |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
205 return tolocal(u.encode('utf-8')) |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
206 |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
207 def unifromlocal(s): |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
208 """Convert a byte string of local encoding to a unicode string""" |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
209 return fromlocal(s).decode('utf-8') |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
210 |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
211 def unimethod(bytesfunc): |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
212 """Create a proxy method that forwards __unicode__() and __str__() of |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
213 Python 3 to __bytes__()""" |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
214 def unifunc(obj): |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
215 return unifromlocal(bytesfunc(obj)) |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
216 return unifunc |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
217 |
31457
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
218 # converter functions between native str and byte string. use these if the |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
219 # character encoding is unknown (e.g. exception message) or is known to |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
220 # be locale dependent (e.g. date formatting.) |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
221 if pycompat.ispy3: |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
222 strtolocal = unitolocal |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
223 strfromlocal = unifromlocal |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
224 strmethod = unimethod |
31457
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
225 else: |
31777
7d2cbe11ae48
pycompat: introduce identity function as a compat stub
Yuya Nishihara <yuya@tcha.org>
parents:
31457
diff
changeset
|
226 strtolocal = pycompat.identity |
7d2cbe11ae48
pycompat: introduce identity function as a compat stub
Yuya Nishihara <yuya@tcha.org>
parents:
31457
diff
changeset
|
227 strfromlocal = pycompat.identity |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
228 strmethod = pycompat.identity |
31457
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
229 |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
230 if not _nativeenviron: |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
231 # now encoding and helper functions are available, recreate the environ |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
232 # dict to be exported to other modules |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
233 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8'))) |
32231
cf424dae5dc7
check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents:
32205
diff
changeset
|
234 for k, v in os.environ.items()) # re-exports |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
235 |
12866
eddc20306ab6
encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents:
12770
diff
changeset
|
236 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide. |
32570
044f3d7eb9ae
encoding: make sure "wide" variable never be referenced from other modules
Yuya Nishihara <yuya@tcha.org>
parents:
32562
diff
changeset
|
237 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" |
044f3d7eb9ae
encoding: make sure "wide" variable never be referenced from other modules
Yuya Nishihara <yuya@tcha.org>
parents:
32562
diff
changeset
|
238 and "WFA" or "WF") |
12866
eddc20306ab6
encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents:
12770
diff
changeset
|
239 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
240 def colwidth(s): |
15142
176882876780
encoding: colwidth input is in the local encoding
Matt Mackall <mpm@selenic.com>
parents:
15066
diff
changeset
|
241 "Find the column width of a string for display in the local encoding" |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
242 return ucolwidth(s.decode(_sysstr(encoding), u'replace')) |
15066
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
243 |
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
244 def ucolwidth(d): |
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
245 "Find the column width of a Unicode string for display" |
14951
61807854004e
encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents:
14069
diff
changeset
|
246 eaw = getattr(unicodedata, 'east_asian_width', None) |
61807854004e
encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents:
14069
diff
changeset
|
247 if eaw is not None: |
32570
044f3d7eb9ae
encoding: make sure "wide" variable never be referenced from other modules
Yuya Nishihara <yuya@tcha.org>
parents:
32562
diff
changeset
|
248 return sum([eaw(c) in _wide and 2 or 1 for c in d]) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
249 return len(d) |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
250 |
15143
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
251 def getcols(s, start, c): |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
252 '''Use colwidth to find a c-column substring of s starting at byte |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
253 index start''' |
38823
e7aa113b14f7
global: use pycompat.xrange()
Gregory Szorc <gregory.szorc@gmail.com>
parents:
38739
diff
changeset
|
254 for x in pycompat.xrange(start + c, len(s)): |
15143
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
255 t = s[start:x] |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
256 if colwidth(t) == c: |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
257 return t |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
258 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
259 def trim(s, width, ellipsis='', leftside=False): |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
260 """Trim string 's' to at most 'width' columns (including 'ellipsis'). |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
261 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
262 If 'leftside' is True, the left side of string 's' is trimmed. |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
263 'ellipsis' is always placed at the trimmed side. |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
264 |
34151
414a3513c2bd
doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents:
34150
diff
changeset
|
265 >>> from .node import bin |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
266 >>> def bprint(s): |
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
267 ... print(pycompat.sysstr(s)) |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
268 >>> ellipsis = b'+++' |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
269 >>> from . import encoding |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
270 >>> encoding.encoding = b'utf-8' |
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
271 >>> t = b'1234567890' |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
272 >>> bprint(trim(t, 12, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
273 1234567890 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
274 >>> bprint(trim(t, 10, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
275 1234567890 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
276 >>> bprint(trim(t, 8, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
277 12345+++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
278 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
279 +++67890 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
280 >>> bprint(trim(t, 8)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
281 12345678 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
282 >>> bprint(trim(t, 8, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
283 34567890 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
284 >>> bprint(trim(t, 3, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
285 +++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
286 >>> bprint(trim(t, 1, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
287 + |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
288 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns |
34150
e9e225f16932
doctest: pass encoding name as system string
Yuya Nishihara <yuya@tcha.org>
parents:
34146
diff
changeset
|
289 >>> t = u.encode(pycompat.sysstr(encoding.encoding)) |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
290 >>> bprint(trim(t, 12, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
291 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
292 >>> bprint(trim(t, 10, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
293 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
294 >>> bprint(trim(t, 8, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
295 \xe3\x81\x82\xe3\x81\x84+++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
296 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
297 +++\xe3\x81\x88\xe3\x81\x8a |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
298 >>> bprint(trim(t, 5)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
299 \xe3\x81\x82\xe3\x81\x84 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
300 >>> bprint(trim(t, 5, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
301 \xe3\x81\x88\xe3\x81\x8a |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
302 >>> bprint(trim(t, 4, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
303 +++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
304 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
305 +++ |
34151
414a3513c2bd
doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents:
34150
diff
changeset
|
306 >>> t = bin(b'112233445566778899aa') # invalid byte sequence |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
307 >>> bprint(trim(t, 12, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
308 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
309 >>> bprint(trim(t, 10, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
310 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
311 >>> bprint(trim(t, 8, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
312 \x11\x22\x33\x44\x55+++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
313 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
314 +++\x66\x77\x88\x99\xaa |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
315 >>> bprint(trim(t, 8)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
316 \x11\x22\x33\x44\x55\x66\x77\x88 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
317 >>> bprint(trim(t, 8, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
318 \x33\x44\x55\x66\x77\x88\x99\xaa |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
319 >>> bprint(trim(t, 3, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
320 +++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
321 >>> bprint(trim(t, 1, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
322 + |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
323 """ |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
324 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
325 u = s.decode(_sysstr(encoding)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
326 except UnicodeDecodeError: |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
327 if len(s) <= width: # trimming is not needed |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
328 return s |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
329 width -= len(ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
330 if width <= 0: # not enough room even for ellipsis |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
331 return ellipsis[:width + len(ellipsis)] |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
332 if leftside: |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
333 return ellipsis + s[-width:] |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
334 return s[:width] + ellipsis |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
335 |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
336 if ucolwidth(u) <= width: # trimming is not needed |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
337 return s |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
338 |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
339 width -= len(ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
340 if width <= 0: # not enough room even for ellipsis |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
341 return ellipsis[:width + len(ellipsis)] |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
342 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
343 if leftside: |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
344 uslice = lambda i: u[i:] |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
345 concat = lambda s: ellipsis + s |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
346 else: |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
347 uslice = lambda i: u[:-i] |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
348 concat = lambda s: s + ellipsis |
38823
e7aa113b14f7
global: use pycompat.xrange()
Gregory Szorc <gregory.szorc@gmail.com>
parents:
38739
diff
changeset
|
349 for i in pycompat.xrange(1, len(u)): |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
350 usub = uslice(i) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
351 if ucolwidth(usub) <= width: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
352 return concat(usub.encode(_sysstr(encoding))) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
353 return ellipsis # not enough room for multi-column characters |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
354 |
14069
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
355 def lower(s): |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
356 "best-effort encoding-aware case-folding of local string s" |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
357 try: |
22779
d9585dda63c3
encoding.lower: use fast ASCII lower
Siddharth Agarwal <sid0@fb.com>
parents:
22778
diff
changeset
|
358 return asciilower(s) |
17235
3745ae495ce5
encoding: use s.decode to trigger UnicodeDecodeError
Martin Geisler <mg@aragost.com>
parents:
16493
diff
changeset
|
359 except UnicodeDecodeError: |
16387
c481761033bd
encoding: add fast-path for ASCII lowercase
Matt Mackall <mpm@selenic.com>
parents:
16274
diff
changeset
|
360 pass |
c481761033bd
encoding: add fast-path for ASCII lowercase
Matt Mackall <mpm@selenic.com>
parents:
16274
diff
changeset
|
361 try: |
14069
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
362 if isinstance(s, localstr): |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
363 u = s._utf8.decode("utf-8") |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
364 else: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
365 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
14069
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
366 |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
367 lu = u.lower() |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
368 if u == lu: |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
369 return s # preserve localstring |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
370 return lu.encode(_sysstr(encoding)) |
14069
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
371 except UnicodeError: |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
372 return s.lower() # we don't know how to fold this except in ASCII |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
373 except LookupError as k: |
15672
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
374 raise error.Abort(k, hint="please check your locale settings") |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
375 |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
376 def upper(s): |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
377 "best-effort encoding-aware case-folding of local string s" |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
378 try: |
24578
ac08de78de7f
encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents:
23596
diff
changeset
|
379 return asciiupper(s) |
17236
9fb8312dbdbd
encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents:
17235
diff
changeset
|
380 except UnicodeDecodeError: |
24597
b4258d5a1600
encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents:
24593
diff
changeset
|
381 return upperfallback(s) |
b4258d5a1600
encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents:
24593
diff
changeset
|
382 |
b4258d5a1600
encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents:
24593
diff
changeset
|
383 def upperfallback(s): |
17236
9fb8312dbdbd
encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents:
17235
diff
changeset
|
384 try: |
15672
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
385 if isinstance(s, localstr): |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
386 u = s._utf8.decode("utf-8") |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
387 else: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
388 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
15672
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
389 |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
390 uu = u.upper() |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
391 if u == uu: |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
392 return s # preserve localstring |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
393 return uu.encode(_sysstr(encoding)) |
15672
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
394 except UnicodeError: |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
395 return s.upper() # we don't know how to fold this except in ASCII |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
396 except LookupError as k: |
15672
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
397 raise error.Abort(k, hint="please check your locale settings") |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
398 |
24593
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
399 class normcasespecs(object): |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
400 '''what a platform's normcase does to ASCII strings |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
401 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
402 This is specified per platform, and should be consistent with what normcase |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
403 on that platform actually does. |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
404 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
405 lower: normcase lowercases ASCII strings |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
406 upper: normcase uppercases ASCII strings |
24608
1c533e23ce95
util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents:
24597
diff
changeset
|
407 other: the fallback function should always be called |
1c533e23ce95
util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents:
24597
diff
changeset
|
408 |
1c533e23ce95
util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents:
24597
diff
changeset
|
409 This should be kept in sync with normcase_spec in util.h.''' |
24593
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
410 lower = -1 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
411 upper = 1 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
412 other = 0 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
413 |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
414 def jsonescape(s, paranoid=False): |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
415 '''returns a string suitable for JSON |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
416 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
417 JSON is problematic for us because it doesn't support non-Unicode |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
418 bytes. To deal with this, we take the following approach: |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
419 |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
420 - localstr/safelocalstr objects are converted back to UTF-8 |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
421 - valid UTF-8/ASCII strings are passed as-is |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
422 - other strings are converted to UTF-8b surrogate encoding |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
423 - apply JSON-specified string escaping |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
424 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
425 (escapes are doubled in these tests) |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
426 |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
427 >>> jsonescape(b'this is a test') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
428 'this is a test' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
429 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f') |
27881
ffa599f3f503
encoding: escape U+007F (DEL) character in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
27699
diff
changeset
|
430 'escape characters: \\\\u0000 \\\\u000b \\\\u007f' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
431 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\') |
33943
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33942
diff
changeset
|
432 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
433 >>> jsonescape(b'a weird byte: \\xdd') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
434 'a weird byte: \\xed\\xb3\\x9d' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
435 >>> jsonescape(b'utf-8: caf\\xc3\\xa9') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
436 'utf-8: caf\\xc3\\xa9' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
437 >>> jsonescape(b'') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
438 '' |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
439 |
28069
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
440 If paranoid, non-ascii and common troublesome characters are also escaped. |
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
441 This is suitable for web output. |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
442 |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
443 >>> s = b'escape characters: \\0 \\x0b \\x7f' |
33943
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33942
diff
changeset
|
444 >>> assert jsonescape(s) == jsonescape(s, paranoid=True) |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
445 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\' |
33943
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33942
diff
changeset
|
446 >>> assert jsonescape(s) == jsonescape(s, paranoid=True) |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
447 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
448 'escape boundary: ~ \\\\u007f \\\\u0080' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
449 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
450 'a weird byte: \\\\udcdd' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
451 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
452 'utf-8: caf\\\\u00e9' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
453 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
454 'non-BMP: \\\\ud834\\\\udd1e' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
455 >>> jsonescape(b'<foo@example.org>', paranoid=True) |
28069
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
456 '\\\\u003cfoo@example.org\\\\u003e' |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
457 ''' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
458 |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
459 u8chars = toutf8b(s) |
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
460 try: |
33942
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
461 return _jsonescapeu8fast(u8chars, paranoid) |
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
462 except ValueError: |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
463 pass |
33942
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
464 return charencodepure.jsonescapeu8fallback(u8chars, paranoid) |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
465 |
34225
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
466 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8 |
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
467 # bytes are mapped to that range. |
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
468 if pycompat.ispy3: |
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
469 _utf8strict = r'surrogatepass' |
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
470 else: |
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
471 _utf8strict = r'strict' |
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
472 |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
473 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
474 |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
475 def getutf8char(s, pos): |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
476 '''get the next full utf-8 character in the given string, starting at pos |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
477 |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
478 Raises a UnicodeError if the given location does not start a valid |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
479 utf-8 character. |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
480 ''' |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
481 |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
482 # find how many bytes to attempt decoding from first nibble |
34207
112f118ecb00
encoding: ensure getutf8char always returns a bytestr, never an int
Augie Fackler <raf@durin42.com>
parents:
34152
diff
changeset
|
483 l = _utf8len[ord(s[pos:pos + 1]) >> 4] |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
484 if not l: # ascii |
34207
112f118ecb00
encoding: ensure getutf8char always returns a bytestr, never an int
Augie Fackler <raf@durin42.com>
parents:
34152
diff
changeset
|
485 return s[pos:pos + 1] |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
486 |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
487 c = s[pos:pos + l] |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
488 # validate with attempted decode |
34225
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
489 c.decode("utf-8", _utf8strict) |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
490 return c |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
491 |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
492 def toutf8b(s): |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
493 '''convert a local, possibly-binary string into UTF-8b |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
494 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
495 This is intended as a generic method to preserve data when working |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
496 with schemes like JSON and XML that have no provision for |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
497 arbitrary byte strings. As Mercurial often doesn't know |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
498 what encoding data is in, we use so-called UTF-8b. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
499 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
500 If a string is already valid UTF-8 (or ASCII), it passes unmodified. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
501 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range, |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
502 uDC00-uDCFF. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
503 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
504 Principles of operation: |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
505 |
17424
e7cfe3587ea4
fix trivial spelling errors
Mads Kiilerich <mads@kiilerich.com>
parents:
17236
diff
changeset
|
506 - ASCII and UTF-8 data successfully round-trips and is understood |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
507 by Unicode-oriented clients |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
508 - filenames and file contents in arbitrary other encodings can |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
509 be round-tripped or recovered by clueful clients |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
510 - local strings that have a cached known UTF-8 encoding (aka |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
511 localstr) get sent as UTF-8 so Unicode-oriented clients get the |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
512 Unicode data they want |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
513 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
514 - because we must preserve UTF-8 bytestring in places such as |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
515 filenames, metadata can't be roundtripped without help |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
516 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
517 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
518 arbitrary bytes into an internal Unicode format that can be |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
519 re-encoded back into the original. Here we are exposing the |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
520 internal surrogate encoding as a UTF-8 string.) |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
521 ''' |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
522 |
37990
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
523 if isinstance(s, localstr): |
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
524 # assume that the original UTF-8 sequence would never contain |
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
525 # invalid characters in U+DCxx range |
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
526 return s._utf8 |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
527 elif isinstance(s, safelocalstr): |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
528 # already verified that s is non-lossy in legacy encoding, which |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
529 # shouldn't contain characters in U+DCxx range |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
530 return fromlocal(s) |
37990
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
531 elif isasciistr(s): |
33946
6c119dbfd0c0
encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33945
diff
changeset
|
532 return s |
26879
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
533 if "\xed" not in s: |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
534 try: |
34225
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
535 s.decode('utf-8', _utf8strict) |
26879
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
536 return s |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
537 except UnicodeDecodeError: |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
538 pass |
26878
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
539 |
34223
1c601df9894c
py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents:
34207
diff
changeset
|
540 s = pycompat.bytestr(s) |
26878
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
541 r = "" |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
542 pos = 0 |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
543 l = len(s) |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
544 while pos < l: |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
545 try: |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
546 c = getutf8char(s, pos) |
26879
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
547 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
548 # have to re-escape existing U+DCxx characters |
34225
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
549 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict) |
26879
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
550 pos += 1 |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
551 else: |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
552 pos += len(c) |
26878
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
553 except UnicodeDecodeError: |
34225
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
554 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict) |
26878
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
555 pos += 1 |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
556 r += c |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
557 return r |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
558 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    Return the original binary string. This is a round-trip process
    for strings like filenames, but metadata that was passed through
    tolocal will remain in UTF-8.

    The input is returned unchanged when it is pure ASCII or contains
    no 0xed byte. Otherwise every character in the range U+DC00..U+DCFF
    is replaced by the single byte made of its low 8 bits, and all
    other characters are copied through as-is.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    if isasciistr(s):
        return s
    # fast path - the escaped range checked below starts with a 0xed
    # byte, so a string without one has nothing to unescape
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    # Collect the pieces and join them once at the end; growing the
    # result with "+=" copies the accumulated data on every iteration.
    chunks = []
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters: keep only the low byte that the
        # escape carries
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
        chunks.append(c)
    return "".join(chunks)