Mercurial > public > mercurial-scm > hg-stable
annotate mercurial/encoding.py @ 33942:b9101467d88b
encoding: extract stub for fast JSON escape
This moves JSON character maps to pure/charencode.py because they will be
used only when the fast-path fails.
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sun, 23 Apr 2017 16:10:51 +0900 |
parents | f18b11534274 |
children | 2c37f9dabc32 |
rev | line source |
---|---|
8226
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
1 # encoding.py - character transcoding support for Mercurial |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
2 # |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
4 # |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
5 # This software may be used and distributed according to the terms of the |
10263 | 6 # GNU General Public License version 2 or any later version. |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
7 |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
8 from __future__ import absolute_import |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
9 |
33860
7d5bc0e5b88f
py3: introduce a wrapper for __builtins__.{raw_,}input()
Augie Fackler <augie@google.com>
parents:
33832
diff
changeset
|
10 import io |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
11 import locale |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
12 import os |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
13 import unicodedata |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
14 |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
15 from . import ( |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
16 error, |
32411
df448de7cf3b
parsers: switch to policy importer
Yuya Nishihara <yuya@tcha.org>
parents:
32339
diff
changeset
|
17 policy, |
30031
0f6d6fdd3c2a
pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents:
28508
diff
changeset
|
18 pycompat, |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
19 ) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
20 |
33942
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
21 from .pure import ( |
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
22 charencode as charencodepure, |
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
23 ) |
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
24 |
33782
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
25 charencode = policy.importmod(r'charencode') |
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
26 |
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
27 asciilower = charencode.asciilower |
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
28 asciiupper = charencode.asciiupper |
33942
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
29 _jsonescapeu8fast = charencodepure.jsonescapeu8fast # TODO: no "pure" |
33782
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
30 |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
31 _sysstr = pycompat.sysstr |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
32 |
30031
0f6d6fdd3c2a
pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents:
28508
diff
changeset
|
33 if pycompat.ispy3: |
28507
9bcbd9412225
encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents:
28069
diff
changeset
|
34 unichr = chr |
9bcbd9412225
encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents:
28069
diff
changeset
|
35 |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
36 # These unicode characters are ignored by HFS+ (Apple Technote 1150, |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
37 # "Unicode Subtleties"), so we need to ignore them in some places for |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
38 # sanity. |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
39 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
40 "200c 200d 200e 200f 202a 202b 202c 202d 202e " |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
41 "206a 206b 206c 206d 206e 206f feff".split()] |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
42 # verify the next function will work |
32339
7040f5131454
encoding: use i.startswith() instead of i[0] to eliminate py2/3 divergence
Yuya Nishihara <yuya@tcha.org>
parents:
32331
diff
changeset
|
43 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore) |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
44 |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
45 def hfsignoreclean(s): |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
46 """Remove codepoints ignored by HFS+ from s. |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
47 |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
48 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8')) |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
49 '.hg' |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
50 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8')) |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
51 '.hg' |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
52 """ |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
53 if "\xe2" in s or "\xef" in s: |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
54 for c in _ignore: |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
55 s = s.replace(c, '') |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
56 return s |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
57 |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
58 # encoding.environ is provided read-only; it must not be used to modify |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
59 # the process environment |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
60 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ) |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
61 if not pycompat.ispy3: |
32231
cf424dae5dc7
check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents:
32205
diff
changeset
|
62 environ = os.environ # re-exports |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
63 elif _nativeenviron: |
32231
cf424dae5dc7
check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents:
32205
diff
changeset
|
64 environ = os.environb # re-exports |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
65 else: |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
66 # preferred encoding isn't known yet; use utf-8 to avoid unicode error |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
67 # and recreate it once encoding is settled |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
68 environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8')) |
32231
cf424dae5dc7
check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents:
32205
diff
changeset
|
69 for k, v in os.environ.items()) # re-exports |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
70 |
11892
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
71 _encodingfixers = { |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
72 '646': lambda: 'ascii', |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
73 'ANSI_X3.4-1968': lambda: 'ascii', |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
74 } |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
75 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
76 try: |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
77 encoding = environ.get("HGENCODING") |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
78 if not encoding: |
30627
ce36fa9b140c
py3: make sure encoding.encoding is a bytes variable
Pulkit Goyal <7895pulkit@gmail.com>
parents:
30034
diff
changeset
|
79 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii' |
11892
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
80 encoding = _encodingfixers.get(encoding, lambda: encoding)() |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
81 except locale.Error: |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
82 encoding = 'ascii' |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
83 encodingmode = environ.get("HGENCODINGMODE", "strict") |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
84 fallbackencoding = 'ISO-8859-1' |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
85 |
33832
dabe1f11ae3a
py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents:
33782
diff
changeset
|
86 class localstr(bytes): |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
87 '''This class allows strings that are unmodified to be |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
88 round-tripped to the local encoding and back''' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
89 def __new__(cls, u, l): |
33832
dabe1f11ae3a
py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents:
33782
diff
changeset
|
90 s = bytes.__new__(cls, l) |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
91 s._utf8 = u |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
92 return s |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
93 def __hash__(self): |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
94 return hash(self._utf8) # avoid collisions in local string space |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
95 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
96 def tolocal(s): |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
97 """ |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
98 Convert a string from internal UTF-8 to local encoding |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
99 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
100 All internal strings should be UTF-8 but some repos before the |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
101 implementation of locale support may contain latin1 or possibly |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
102 other character sets. We attempt to decode everything strictly |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
103 using UTF-8, then Latin-1, and failing that, we use UTF-8 and |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
104 replace unknown characters. |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
105 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
106 The localstr class is used to cache the known UTF-8 encoding of |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
107 strings next to their local representation to allow lossless |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
108 round-trip conversion back to UTF-8. |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
109 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
110 >>> u = 'foo: \\xc3\\xa4' # utf-8 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
111 >>> l = tolocal(u) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
112 >>> l |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
113 'foo: ?' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
114 >>> fromlocal(l) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
115 'foo: \\xc3\\xa4' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
116 >>> u2 = 'foo: \\xc3\\xa1' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
117 >>> d = { l: 1, tolocal(u2): 2 } |
18378
404feac78b8a
tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents:
17424
diff
changeset
|
118 >>> len(d) # no collision |
404feac78b8a
tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents:
17424
diff
changeset
|
119 2 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
120 >>> 'foo: ?' in d |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
121 False |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
122 >>> l1 = 'foo: \\xe4' # historical latin1 fallback |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
123 >>> l = tolocal(l1) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
124 >>> l |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
125 'foo: ?' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
126 >>> fromlocal(l) # magically in utf-8 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
127 'foo: \\xc3\\xa4' |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
128 """ |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
129 |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
130 try: |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
131 try: |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
132 # make sure string is actually stored in UTF-8 |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
133 u = s.decode('UTF-8') |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
134 if encoding == 'UTF-8': |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
135 # fast path |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
136 return s |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
137 r = u.encode(_sysstr(encoding), u"replace") |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
138 if u == r.decode(_sysstr(encoding)): |
13940
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
139 # r is a safe, non-lossy encoding of s |
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
140 return r |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
141 return localstr(s, r) |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
142 except UnicodeDecodeError: |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
143 # we should only get here if we're looking at an ancient changeset |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
144 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
145 u = s.decode(_sysstr(fallbackencoding)) |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
146 r = u.encode(_sysstr(encoding), u"replace") |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
147 if u == r.decode(_sysstr(encoding)): |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
148 # r is a safe, non-lossy encoding of s |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
149 return r |
13940
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
150 return localstr(u.encode('UTF-8'), r) |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
151 except UnicodeDecodeError: |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
152 u = s.decode("utf-8", "replace") # last ditch |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
153 # can't round-trip |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
154 return u.encode(_sysstr(encoding), u"replace") |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
155 except LookupError as k: |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
156 raise error.Abort(k, hint="please check your locale settings") |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
157 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
158 def fromlocal(s): |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
159 """ |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
160 Convert a string from the local character encoding to UTF-8 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
161 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
162 We attempt to decode strings using the encoding mode set by |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
163 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
164 characters will cause an error message. Other modes include |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
165 'replace', which replaces unknown characters with a special |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
166 Unicode character, and 'ignore', which drops the character. |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
167 """ |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
168 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
169 # can we do a lossless round-trip? |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
170 if isinstance(s, localstr): |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
171 return s._utf8 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
172 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
173 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
174 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
175 return u.encode("utf-8") |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
176 except UnicodeDecodeError as inst: |
10282
08a0f04b56bd
many, many trivial check-code fixups
Matt Mackall <mpm@selenic.com>
parents:
10263
diff
changeset
|
177 sub = s[max(0, inst.start - 10):inst.start + 10] |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
178 raise error.Abort("decoding near '%s': %s!" % (sub, inst)) |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
179 except LookupError as k: |
15769
afdf4f5bac61
encoding: use hint markup for "please check your locale settings"
Mads Kiilerich <mads@kiilerich.com>
parents:
15672
diff
changeset
|
180 raise error.Abort(k, hint="please check your locale settings") |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
181 |
31456
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
182 def unitolocal(u): |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
183 """Convert a unicode string to a byte string of local encoding""" |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
184 return tolocal(u.encode('utf-8')) |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
185 |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
186 def unifromlocal(s): |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
187 """Convert a byte string of local encoding to a unicode string""" |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
188 return fromlocal(s).decode('utf-8') |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
189 |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
190 def unimethod(bytesfunc): |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
191 """Create a proxy method that forwards __unicode__() and __str__() of |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
192 Python 3 to __bytes__()""" |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
193 def unifunc(obj): |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
194 return unifromlocal(bytesfunc(obj)) |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
195 return unifunc |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
196 |
31457
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
197 # converter functions between native str and byte string. use these if the |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
198 # character encoding is not known (e.g. exception message) or is known to |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
199 # be locale dependent (e.g. date formatting.) |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
200 if pycompat.ispy3: |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
201 strtolocal = unitolocal |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
202 strfromlocal = unifromlocal |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
203 strmethod = unimethod |
31457
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
204 else: |
31777
7d2cbe11ae48
pycompat: introduce identity function as a compat stub
Yuya Nishihara <yuya@tcha.org>
parents:
31457
diff
changeset
|
205 strtolocal = pycompat.identity |
7d2cbe11ae48
pycompat: introduce identity function as a compat stub
Yuya Nishihara <yuya@tcha.org>
parents:
31457
diff
changeset
|
206 strfromlocal = pycompat.identity |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
207 strmethod = pycompat.identity |
31457
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
208 |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
209 if not _nativeenviron: |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
210 # now encoding and helper functions are available, recreate the environ |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
211 # dict to be exported to other modules |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
212 environ = dict((tolocal(k.encode(u'utf-8')), tolocal(v.encode(u'utf-8'))) |
32231
cf424dae5dc7
check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents:
32205
diff
changeset
|
213 for k, v in os.environ.items()) # re-exports |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
214 |
12866
eddc20306ab6
encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents:
12770
diff
changeset
|
215 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide. |
32570
044f3d7eb9ae
encoding: make sure "wide" variable never be referenced from other modules
Yuya Nishihara <yuya@tcha.org>
parents:
32562
diff
changeset
|
216 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" |
044f3d7eb9ae
encoding: make sure "wide" variable never be referenced from other modules
Yuya Nishihara <yuya@tcha.org>
parents:
32562
diff
changeset
|
217 and "WFA" or "WF") |
12866
eddc20306ab6
encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents:
12770
diff
changeset
|
218 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
219 def colwidth(s): |
15142
176882876780
encoding: colwidth input is in the local encoding
Matt Mackall <mpm@selenic.com>
parents:
15066
diff
changeset
|
220 "Find the column width of a string for display in the local encoding" |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
221 return ucolwidth(s.decode(_sysstr(encoding), u'replace')) |
15066
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
222 |
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
223 def ucolwidth(d): |
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
224 "Find the column width of a Unicode string for display" |
14951
61807854004e
encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents:
14069
diff
changeset
|
225 eaw = getattr(unicodedata, 'east_asian_width', None) |
61807854004e
encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents:
14069
diff
changeset
|
226 if eaw is not None: |
32570
044f3d7eb9ae
encoding: make sure "wide" variable never be referenced from other modules
Yuya Nishihara <yuya@tcha.org>
parents:
32562
diff
changeset
|
227 return sum([eaw(c) in _wide and 2 or 1 for c in d]) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
228 return len(d) |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
229 |
15143
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
230 def getcols(s, start, c): |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
231 '''Use colwidth to find a c-column substring of s starting at byte |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
232 index start''' |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
233 for x in xrange(start + c, len(s)): |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
234 t = s[start:x] |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
235 if colwidth(t) == c: |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
236 return t |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
237 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
238 def trim(s, width, ellipsis='', leftside=False): |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
239 """Trim string 's' to at most 'width' columns (including 'ellipsis'). |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
240 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
241 If 'leftside' is True, left side of string 's' is trimmed. |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
242 'ellipsis' is always placed at trimmed side. |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
243 |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
244 >>> ellipsis = '+++' |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
245 >>> from . import encoding |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
246 >>> encoding.encoding = 'utf-8' |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
247 >>> t= '1234567890' |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
248 >>> print trim(t, 12, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
249 1234567890 |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
250 >>> print trim(t, 10, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
251 1234567890 |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
252 >>> print trim(t, 8, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
253 12345+++ |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
254 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True) |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
255 +++67890 |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
256 >>> print trim(t, 8) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
257 12345678 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
258 >>> print trim(t, 8, leftside=True) |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
259 34567890 |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
260 >>> print trim(t, 3, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
261 +++ |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
262 >>> print trim(t, 1, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
263 + |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
264 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
265 >>> t = u.encode(encoding.encoding) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
266 >>> print trim(t, 12, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
267 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
268 >>> print trim(t, 10, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
269 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
270 >>> print trim(t, 8, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
271 \xe3\x81\x82\xe3\x81\x84+++ |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
272 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True) |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
273 +++\xe3\x81\x88\xe3\x81\x8a |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
274 >>> print trim(t, 5) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
275 \xe3\x81\x82\xe3\x81\x84 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
276 >>> print trim(t, 5, leftside=True) |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
277 \xe3\x81\x88\xe3\x81\x8a |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
278 >>> print trim(t, 4, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
279 +++ |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
280 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True) |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
281 +++ |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
282 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
283 >>> print trim(t, 12, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
284 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
285 >>> print trim(t, 10, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
286 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
287 >>> print trim(t, 8, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
288 \x11\x22\x33\x44\x55+++ |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
289 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True) |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
290 +++\x66\x77\x88\x99\xaa |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
291 >>> print trim(t, 8) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
292 \x11\x22\x33\x44\x55\x66\x77\x88 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
293 >>> print trim(t, 8, leftside=True) |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
294 \x33\x44\x55\x66\x77\x88\x99\xaa |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
295 >>> print trim(t, 3, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
296 +++ |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
297 >>> print trim(t, 1, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
298 + |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
299 """ |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
300 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
301 u = s.decode(_sysstr(encoding)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
302 except UnicodeDecodeError: |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
303 if len(s) <= width: # trimming is not needed |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
304 return s |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
305 width -= len(ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
306 if width <= 0: # not enough room even for ellipsis |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
307 return ellipsis[:width + len(ellipsis)] |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
308 if leftside: |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
309 return ellipsis + s[-width:] |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
310 return s[:width] + ellipsis |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
311 |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
312 if ucolwidth(u) <= width: # trimming is not needed |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
313 return s |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
314 |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
315 width -= len(ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
316 if width <= 0: # not enough room even for ellipsis |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
317 return ellipsis[:width + len(ellipsis)] |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
318 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
319 if leftside: |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
320 uslice = lambda i: u[i:] |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
321 concat = lambda s: ellipsis + s |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
322 else: |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
323 uslice = lambda i: u[:-i] |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
324 concat = lambda s: s + ellipsis |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
325 for i in xrange(1, len(u)): |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
326 usub = uslice(i) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
327 if ucolwidth(usub) <= width: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
328 return concat(usub.encode(_sysstr(encoding))) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
329 return ellipsis # not enough room for multi-column characters |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
330 |
14069
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
331 def lower(s): |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
332 "best-effort encoding-aware case-folding of local string s" |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
333 try: |
22779
d9585dda63c3
encoding.lower: use fast ASCII lower
Siddharth Agarwal <sid0@fb.com>
parents:
22778
diff
changeset
|
334 return asciilower(s) |
17235
3745ae495ce5
encoding: use s.decode to trigger UnicodeDecodeError
Martin Geisler <mg@aragost.com>
parents:
16493
diff
changeset
|
335 except UnicodeDecodeError: |
16387
c481761033bd
encoding: add fast-path for ASCII lowercase
Matt Mackall <mpm@selenic.com>
parents:
16274
diff
changeset
|
336 pass |
c481761033bd
encoding: add fast-path for ASCII lowercase
Matt Mackall <mpm@selenic.com>
parents:
16274
diff
changeset
|
337 try: |
14069
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
338 if isinstance(s, localstr): |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
339 u = s._utf8.decode("utf-8") |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
340 else: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
341 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
14069
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
342 |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
343 lu = u.lower() |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
344 if u == lu: |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
345 return s # preserve localstring |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
346 return lu.encode(_sysstr(encoding)) |
14069
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
347 except UnicodeError: |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
348 return s.lower() # we don't know how to fold this except in ASCII |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
349 except LookupError as k: |
15672
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
350 raise error.Abort(k, hint="please check your locale settings") |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
351 |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
352 def upper(s): |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
353 "best-effort encoding-aware case-folding of local string s" |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
354 try: |
24578
ac08de78de7f
encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents:
23596
diff
changeset
|
355 return asciiupper(s) |
17236
9fb8312dbdbd
encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents:
17235
diff
changeset
|
356 except UnicodeDecodeError: |
24597
b4258d5a1600
encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents:
24593
diff
changeset
|
357 return upperfallback(s) |
b4258d5a1600
encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents:
24593
diff
changeset
|
358 |
b4258d5a1600
encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents:
24593
diff
changeset
|
359 def upperfallback(s): |
17236
9fb8312dbdbd
encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents:
17235
diff
changeset
|
360 try: |
15672
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
361 if isinstance(s, localstr): |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
362 u = s._utf8.decode("utf-8") |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
363 else: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
364 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
15672
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
365 |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
366 uu = u.upper() |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
367 if u == uu: |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
368 return s # preserve localstring |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
369 return uu.encode(_sysstr(encoding)) |
15672
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
370 except UnicodeError: |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
371 return s.upper() # we don't know how to fold this except in ASCII |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
372 except LookupError as k: |
15672
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
373 raise error.Abort(k, hint="please check your locale settings") |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
374 |
24593
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
375 class normcasespecs(object): |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
376 '''what a platform's normcase does to ASCII strings |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
377 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
378 This is specified per platform, and should be consistent with what normcase |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
379 on that platform actually does. |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
380 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
381 lower: normcase lowercases ASCII strings |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
382 upper: normcase uppercases ASCII strings |
24608
1c533e23ce95
util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents:
24597
diff
changeset
|
383 other: the fallback function should always be called |
1c533e23ce95
util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents:
24597
diff
changeset
|
384 |
1c533e23ce95
util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents:
24597
diff
changeset
|
385 This should be kept in sync with normcase_spec in util.h.''' |
24593
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
386 lower = -1 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
387 upper = 1 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
388 other = 0 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
389 |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
390 def jsonescape(s, paranoid=False): |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
391 '''returns a string suitable for JSON |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
392 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
393 JSON is problematic for us because it doesn't support non-Unicode |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
394 bytes. To deal with this, we take the following approach: |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
395 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
396 - localstr objects are converted back to UTF-8 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
397 - valid UTF-8/ASCII strings are passed as-is |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
398 - other strings are converted to UTF-8b surrogate encoding |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
399 - apply JSON-specified string escaping |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
400 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
401 (escapes are doubled in these tests) |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
402 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
403 >>> jsonescape('this is a test') |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
404 'this is a test' |
27881
ffa599f3f503
encoding: escape U+007F (DEL) character in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
27699
diff
changeset
|
405 >>> jsonescape('escape characters: \\0 \\x0b \\x7f') |
ffa599f3f503
encoding: escape U+007F (DEL) character in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
27699
diff
changeset
|
406 'escape characters: \\\\u0000 \\\\u000b \\\\u007f' |
ffa599f3f503
encoding: escape U+007F (DEL) character in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
27699
diff
changeset
|
407 >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\') |
ffa599f3f503
encoding: escape U+007F (DEL) character in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
27699
diff
changeset
|
408 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\' |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
409 >>> jsonescape('a weird byte: \\xdd') |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
410 'a weird byte: \\xed\\xb3\\x9d' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
411 >>> jsonescape('utf-8: caf\\xc3\\xa9') |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
412 'utf-8: caf\\xc3\\xa9' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
413 >>> jsonescape('') |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
414 '' |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
415 |
28069
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
416 If paranoid, non-ascii and common troublesome characters are also escaped. |
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
417 This is suitable for web output. |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
418 |
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
419 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True) |
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
420 'escape boundary: ~ \\\\u007f \\\\u0080' |
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
421 >>> jsonescape('a weird byte: \\xdd', paranoid=True) |
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
422 'a weird byte: \\\\udcdd' |
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
423 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True) |
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
424 'utf-8: caf\\\\u00e9' |
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
425 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True) |
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
426 'non-BMP: \\\\ud834\\\\udd1e' |
28069
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
427 >>> jsonescape('<foo@example.org>', paranoid=True) |
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
428 '\\\\u003cfoo@example.org\\\\u003e' |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
429 ''' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
430 |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
431 u8chars = toutf8b(s) |
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
432 try: |
33942
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
433 return _jsonescapeu8fast(u8chars, paranoid) |
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
434 except ValueError: |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
435 pass |
33942
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
436 return charencodepure.jsonescapeu8fallback(u8chars, paranoid) |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
437 |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
438 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
439 |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
440 def getutf8char(s, pos): |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
441 '''get the next full utf-8 character in the given string, starting at pos |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
442 |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
443 Raises a UnicodeError if the given location does not start a valid |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
444 utf-8 character. |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
445 ''' |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
446 |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
447 # find how many bytes to attempt decoding from first nibble |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
448 l = _utf8len[ord(s[pos]) >> 4] |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
449 if not l: # ascii |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
450 return s[pos] |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
451 |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
452 c = s[pos:pos + l] |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
453 # validate with attempted decode |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
454 c.decode("utf-8") |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
455 return c |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
456 |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
457 def toutf8b(s): |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
458 '''convert a local, possibly-binary string into UTF-8b |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
459 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
460 This is intended as a generic method to preserve data when working |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
461 with schemes like JSON and XML that have no provision for |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
462 arbitrary byte strings. As Mercurial often doesn't know |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
463 what encoding data is in, we use so-called UTF-8b. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
464 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
465 If a string is already valid UTF-8 (or ASCII), it passes unmodified. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
466 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range, |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
467 uDC00-uDCFF. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
468 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
469 Principles of operation: |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
470 |
17424
e7cfe3587ea4
fix trivial spelling errors
Mads Kiilerich <mads@kiilerich.com>
parents:
17236
diff
changeset
|
471 - ASCII and UTF-8 data successfully round-trips and is understood |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
472 by Unicode-oriented clients |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
473 - filenames and file contents in arbitrary other encodings can |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
474 be round-tripped or recovered by clueful clients |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
475 - local strings that have a cached known UTF-8 encoding (aka |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
476 localstr) get sent as UTF-8 so Unicode-oriented clients get the |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
477 Unicode data they want |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
478 - because we must preserve UTF-8 bytestring in places such as |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
479 filenames, metadata can't be roundtripped without help |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
480 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
481 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
482 arbitrary bytes into an internal Unicode format that can be |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
483 re-encoded back into the original. Here we are exposing the |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
484 internal surrogate encoding as a UTF-8 string.) |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
485 ''' |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
486 |
26879
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
487 if "\xed" not in s: |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
488 if isinstance(s, localstr): |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
489 return s._utf8 |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
490 try: |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
491 s.decode('utf-8') |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
492 return s |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
493 except UnicodeDecodeError: |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
494 pass |
26878
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
495 |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
496 r = "" |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
497 pos = 0 |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
498 l = len(s) |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
499 while pos < l: |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
500 try: |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
501 c = getutf8char(s, pos) |
26879
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
502 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
503 # have to re-escape existing U+DCxx characters |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
504 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
505 pos += 1 |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
506 else: |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
507 pos += len(c) |
26878
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
508 except UnicodeDecodeError: |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
509 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
510 pos += 1 |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
511 r += c |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
512 return r |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
513 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
514 def fromutf8b(s): |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
515 '''Given a UTF-8b string, return a local, possibly-binary string. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
516 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
517 Returns the original binary string. This |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
518 is a round-trip process for strings like filenames, but metadata |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
519 that was passed through tolocal will remain in UTF-8. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
520 |
26963
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
521 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
522 >>> m = "\\xc3\\xa9\\x99abcd" |
26963
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
523 >>> toutf8b(m) |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
524 '\\xc3\\xa9\\xed\\xb2\\x99abcd' |
26963
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
525 >>> roundtrip(m) |
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
526 True |
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
527 >>> roundtrip("\\xc2\\xc2\\x80") |
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
528 True |
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
529 >>> roundtrip("\\xef\\xbf\\xbd") |
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
530 True |
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
531 >>> roundtrip("\\xef\\xef\\xbf\\xbd") |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
532 True |
27699
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
533 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80") |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
534 True |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
535 ''' |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
536 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
537 # fast path - look for uDxxx prefixes in s |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
538 if "\xed" not in s: |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
539 return s |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
540 |
27699
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
541 # We could do this with the unicode type but some Python builds |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
542 # use UTF-16 internally (issue5031) which causes non-BMP code |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
543 # points to be escaped. Instead, we use our handy getutf8char |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
544 # helper again to walk the string without "decoding" it. |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
545 |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
546 r = "" |
27699
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
547 pos = 0 |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
548 l = len(s) |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
549 while pos < l: |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
550 c = getutf8char(s, pos) |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
551 pos += len(c) |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
552 # unescape U+DCxx characters |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
553 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
554 c = chr(ord(c.decode("utf-8")) & 0xff) |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
555 r += c |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
556 return r |
33860
7d5bc0e5b88f
py3: introduce a wrapper for __builtins__.{raw_,}input()
Augie Fackler <augie@google.com>
parents:
33832
diff
changeset
|
557 |
33873
f18b11534274
py3: make encoding.strio() an identity function on Python 2
Yuya Nishihara <yuya@tcha.org>
parents:
33860
diff
changeset
|
558 if pycompat.ispy3: |
f18b11534274
py3: make encoding.strio() an identity function on Python 2
Yuya Nishihara <yuya@tcha.org>
parents:
33860
diff
changeset
|
559 class strio(io.TextIOWrapper): |
f18b11534274
py3: make encoding.strio() an identity function on Python 2
Yuya Nishihara <yuya@tcha.org>
parents:
33860
diff
changeset
|
560 """Wrapper around TextIOWrapper that respects hg's encoding assumptions. |
33860
7d5bc0e5b88f
py3: introduce a wrapper for __builtins__.{raw_,}input()
Augie Fackler <augie@google.com>
parents:
33832
diff
changeset
|
561 |
33873
f18b11534274
py3: make encoding.strio() an identity function on Python 2
Yuya Nishihara <yuya@tcha.org>
parents:
33860
diff
changeset
|
562 Also works around Python closing streams. |
f18b11534274
py3: make encoding.strio() an identity function on Python 2
Yuya Nishihara <yuya@tcha.org>
parents:
33860
diff
changeset
|
563 """ |
33860
7d5bc0e5b88f
py3: introduce a wrapper for __builtins__.{raw_,}input()
Augie Fackler <augie@google.com>
parents:
33832
diff
changeset
|
564 |
33873
f18b11534274
py3: make encoding.strio() an identity function on Python 2
Yuya Nishihara <yuya@tcha.org>
parents:
33860
diff
changeset
|
565 def __init__(self, buffer): |
f18b11534274
py3: make encoding.strio() an identity function on Python 2
Yuya Nishihara <yuya@tcha.org>
parents:
33860
diff
changeset
|
566 super(strio, self).__init__(buffer, encoding=_sysstr(encoding)) |
33860
7d5bc0e5b88f
py3: introduce a wrapper for __builtins__.{raw_,}input()
Augie Fackler <augie@google.com>
parents:
33832
diff
changeset
|
567 |
33873
f18b11534274
py3: make encoding.strio() an identity function on Python 2
Yuya Nishihara <yuya@tcha.org>
parents:
33860
diff
changeset
|
568 def __del__(self): |
f18b11534274
py3: make encoding.strio() an identity function on Python 2
Yuya Nishihara <yuya@tcha.org>
parents:
33860
diff
changeset
|
569 """Override __del__ so it doesn't close the underlying stream.""" |
f18b11534274
py3: make encoding.strio() an identity function on Python 2
Yuya Nishihara <yuya@tcha.org>
parents:
33860
diff
changeset
|
570 else: |
f18b11534274
py3: make encoding.strio() an identity function on Python 2
Yuya Nishihara <yuya@tcha.org>
parents:
33860
diff
changeset
|
571 strio = pycompat.identity |