Mercurial > public > mercurial-scm > hg-stable
annotate mercurial/encoding.py @ 51302:9d3721552b6c
pytype: import typing directly
First, we no longer need the pycompat layer; second, having the types imported
in all cases will allow us to use them more directly in type annotations, which
is important for upgrading the old "type comments" to proper type annotations.
A lot of stupid asserts are needed to keep pyflakes happy. We should be able to
remove most of them once the type comments have been upgraded.
author | Pierre-Yves David <pierre-yves.david@octobus.net> |
---|---|
date | Wed, 20 Dec 2023 12:51:20 +0100 |
parents | 80c243eab724 |
children | f15cb5111a1e |
rev | line source |
---|---|
8226
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
1 # encoding.py - character transcoding support for Mercurial |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
2 # |
46819
d4ba4d51f85f
contributor: change mentions of mpm to olivia
Raphaël Gomès <rgomes@octobus.net>
parents:
46319
diff
changeset
|
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others |
8226
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
4 # |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
5 # This software may be used and distributed according to the terms of the |
10263 | 6 # GNU General Public License version 2 or any later version. |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
7 |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
8 |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
9 import locale |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
10 import os |
47621
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
11 import re |
51302
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
12 import typing |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
13 import unicodedata |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
14 |
51302
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
15 from typing import ( |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
16 Any, |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
17 Callable, |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
18 List, |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
19 Text, |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
20 Type, |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
21 TypeVar, |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
22 Union, |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
23 ) |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
24 |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
25 from . import ( |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
26 error, |
32411
df448de7cf3b
parsers: switch to policy importer
Yuya Nishihara <yuya@tcha.org>
parents:
32339
diff
changeset
|
27 policy, |
30031
0f6d6fdd3c2a
pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents:
28508
diff
changeset
|
28 pycompat, |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
29 ) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
30 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
31 from .pure import charencode as charencodepure |
33942
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
32 |
51302
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
33 # keep pyflakes happy |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
34 for t in (Any, Callable, List, Text, Type, Union): |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
35 assert t |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
36 |
51302
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
37 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr') |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
38 |
43554
9f70512ae2cf
cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents:
43551
diff
changeset
|
39 charencode = policy.importmod('charencode') |
33782
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
40 |
33944
f4433f2713d0
encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents:
33943
diff
changeset
|
41 isasciistr = charencode.isasciistr |
33782
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
42 asciilower = charencode.asciilower |
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
43 asciiupper = charencode.asciiupper |
33943
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33942
diff
changeset
|
44 _jsonescapeu8fast = charencode.jsonescapeu8fast |
33782
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
45 |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
46 _sysstr = pycompat.sysstr |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
47 |
48983
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
48 unichr = chr |
28507
9bcbd9412225
encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents:
28069
diff
changeset
|
49 |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
50 # These unicode characters are ignored by HFS+ (Apple Technote 1150, |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
51 # "Unicode Subtleties"), so we need to ignore them in some places for |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
52 # sanity. |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
53 _ignore = [ |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
54 unichr(int(x, 16)).encode("utf-8") |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
55 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e " |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
56 b"206a 206b 206c 206d 206e 206f feff".split() |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
57 ] |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
58 # verify the next function will work |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
59 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore) |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
60 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
61 |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
62 def hfsignoreclean(s): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
63 # type: (bytes) -> bytes |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
64 """Remove codepoints ignored by HFS+ from s. |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
65 |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
66 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8')) |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
67 '.hg' |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
68 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8')) |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
69 '.hg' |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
70 """ |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
71 if b"\xe2" in s or b"\xef" in s: |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
72 for c in _ignore: |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
73 s = s.replace(c, b'') |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
74 return s |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
75 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
76 |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
77 # encoding.environ is provided read-only, which may not be used to modify |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
78 # the process environment |
48983
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
79 _nativeenviron = os.supports_bytes_environ |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
80 if _nativeenviron: |
32231
cf424dae5dc7
check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents:
32205
diff
changeset
|
81 environ = os.environb # re-exports |
51000
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50952
diff
changeset
|
82 if pycompat.sysplatform == b'OpenVMS': |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50952
diff
changeset
|
83 # workaround for a bug in VSI 3.10 port |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50952
diff
changeset
|
84 # os.environb is only populated with a few Predefined symbols |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50952
diff
changeset
|
85 def newget(self, key, default=None): |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50952
diff
changeset
|
86 # pytype on linux does not understand OpenVMS special modules |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50952
diff
changeset
|
87 import _decc # pytype: disable=import-error |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50952
diff
changeset
|
88 |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50952
diff
changeset
|
89 v = _decc.getenv(key, None) |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50952
diff
changeset
|
90 if isinstance(key, bytes): |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50952
diff
changeset
|
91 return default if v is None else v.encode('latin-1') |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50952
diff
changeset
|
92 else: |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50952
diff
changeset
|
93 return default if v is None else v |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50952
diff
changeset
|
94 |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50952
diff
changeset
|
95 environ.__class__.get = newget |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
96 else: |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
97 # preferred encoding isn't known yet; use utf-8 to avoid unicode error |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
98 # and recreate it once encoding is settled |
44470
9d2b2df2c2ba
cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents:
43807
diff
changeset
|
99 environ = { |
9d2b2df2c2ba
cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents:
43807
diff
changeset
|
100 k.encode('utf-8'): v.encode('utf-8') |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
101 for k, v in os.environ.items() # re-exports |
44470
9d2b2df2c2ba
cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents:
43807
diff
changeset
|
102 } |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
103 |
39844
9e8fcd2e78c1
encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents:
39824
diff
changeset
|
104 _encodingrewrites = { |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
105 b'646': b'ascii', |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
106 b'ANSI_X3.4-1968': b'ascii', |
11892
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
107 } |
37883
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
108 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2. |
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
109 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3. |
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
110 # https://bugs.python.org/issue13216 |
48983
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
111 if pycompat.iswindows: |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
112 _encodingrewrites[b'cp65001'] = b'utf-8' |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
113 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
114 try: |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
115 encoding = environ.get(b"HGENCODING") |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
116 if not encoding: |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
117 encoding = locale.getpreferredencoding().encode('ascii') or b'ascii' |
39844
9e8fcd2e78c1
encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents:
39824
diff
changeset
|
118 encoding = _encodingrewrites.get(encoding, encoding) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
119 except locale.Error: |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
120 encoding = b'ascii' |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
121 encodingmode = environ.get(b"HGENCODINGMODE", b"strict") |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
122 fallbackencoding = b'ISO-8859-1' |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
123 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
124 |
33832
dabe1f11ae3a
py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents:
33782
diff
changeset
|
125 class localstr(bytes): |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
126 """This class allows strings that are unmodified to be |
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
127 round-tripped to the local encoding and back""" |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
128 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
129 def __new__(cls, u, l): |
33832
dabe1f11ae3a
py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents:
33782
diff
changeset
|
130 s = bytes.__new__(cls, l) |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
131 s._utf8 = u |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
132 return s |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
133 |
51302
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51000
diff
changeset
|
134 if typing.TYPE_CHECKING: |
43725
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43724
diff
changeset
|
135 # pseudo implementation to help pytype see localstr() constructor |
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43724
diff
changeset
|
136 def __init__(self, u, l): |
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43724
diff
changeset
|
137 # type: (bytes, bytes) -> None |
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43724
diff
changeset
|
138 super(localstr, self).__init__(l) |
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43724
diff
changeset
|
139 self._utf8 = u |
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43724
diff
changeset
|
140 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
141 def __hash__(self): |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
142 return hash(self._utf8) # avoid collisions in local string space |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
143 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
144 |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
145 class safelocalstr(bytes): |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
146 """Tagged string denoting it was previously an internal UTF-8 string, |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
147 and can be converted back to UTF-8 losslessly |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
148 |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
149 >>> assert safelocalstr(b'\\xc3') == b'\\xc3' |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
150 >>> assert b'\\xc3' == safelocalstr(b'\\xc3') |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
151 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0} |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
152 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0} |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
153 """ |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
154 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
155 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
156 def tolocal(s): |
43721
b65fcccd9100
typing: fix argument type of encoding.tolocal() and .fromutf8b()
Yuya Nishihara <yuya@tcha.org>
parents:
43720
diff
changeset
|
157 # type: (bytes) -> bytes |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
158 """ |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
159 Convert a string from internal UTF-8 to local encoding |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
160 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
161 All internal strings should be UTF-8 but some repos before the |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
162 implementation of locale support may contain latin1 or possibly |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
163 other character sets. We attempt to decode everything strictly |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
164 using UTF-8, then Latin-1, and failing that, we use UTF-8 and |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
165 replace unknown characters. |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
166 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
167 The localstr class is used to cache the known UTF-8 encoding of |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
168 strings next to their local representation to allow lossless |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
169 round-trip conversion back to UTF-8. |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
170 |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
171 >>> u = b'foo: \\xc3\\xa4' # utf-8 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
172 >>> l = tolocal(u) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
173 >>> l |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
174 'foo: ?' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
175 >>> fromlocal(l) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
176 'foo: \\xc3\\xa4' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
177 >>> u2 = b'foo: \\xc3\\xa1' |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
178 >>> d = { l: 1, tolocal(u2): 2 } |
18378
404feac78b8a
tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents:
17424
diff
changeset
|
179 >>> len(d) # no collision |
404feac78b8a
tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents:
17424
diff
changeset
|
180 2 |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
181 >>> b'foo: ?' in d |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
182 False |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
183 >>> l1 = b'foo: \\xe4' # historical latin1 fallback |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
184 >>> l = tolocal(l1) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
185 >>> l |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
186 'foo: ?' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
187 >>> fromlocal(l) # magically in utf-8 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
188 'foo: \\xc3\\xa4' |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
189 """ |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
190 |
33945
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
191 if isasciistr(s): |
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
192 return s |
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
193 |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
194 try: |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
195 try: |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
196 # make sure string is actually stored in UTF-8 |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
197 u = s.decode('UTF-8') |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
198 if encoding == b'UTF-8': |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
199 # fast path |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
200 return s |
43551
313e3a279828
cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents:
43544
diff
changeset
|
201 r = u.encode(_sysstr(encoding), "replace") |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
202 if u == r.decode(_sysstr(encoding)): |
13940
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
203 # r is a safe, non-lossy encoding of s |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
204 return safelocalstr(r) |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
205 return localstr(s, r) |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
206 except UnicodeDecodeError: |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
207 # we should only get here if we're looking at an ancient changeset |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
208 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
209 u = s.decode(_sysstr(fallbackencoding)) |
43551
313e3a279828
cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents:
43544
diff
changeset
|
210 r = u.encode(_sysstr(encoding), "replace") |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
211 if u == r.decode(_sysstr(encoding)): |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
212 # r is a safe, non-lossy encoding of s |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
213 return safelocalstr(r) |
13940
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
214 return localstr(u.encode('UTF-8'), r) |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
215 except UnicodeDecodeError: |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
216 u = s.decode("utf-8", "replace") # last ditch |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
217 # can't round-trip |
43551
313e3a279828
cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents:
43544
diff
changeset
|
218 return u.encode(_sysstr(encoding), "replace") |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
219 except LookupError as k: |
45681
a736ab681b78
errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents:
44470
diff
changeset
|
220 raise error.Abort( |
a736ab681b78
errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents:
44470
diff
changeset
|
221 pycompat.bytestr(k), hint=b"please check your locale settings" |
a736ab681b78
errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents:
44470
diff
changeset
|
222 ) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
223 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
224 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
225 def fromlocal(s): |
43681
7edc07fb890c
encoding: fix bad type annotation
Augie Fackler <augie@google.com>
parents:
43554
diff
changeset
|
226 # type: (bytes) -> bytes |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
227 """ |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
228 Convert a string from the local character encoding to UTF-8 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
229 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
230 We attempt to decode strings using the encoding mode set by |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
231 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
232 characters will cause an error message. Other modes include |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
233 'replace', which replaces unknown characters with a special |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
234 Unicode character, and 'ignore', which drops the character. |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
235 """ |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
236 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
237 # can we do a lossless round-trip? |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
238 if isinstance(s, localstr): |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
239 return s._utf8 |
33945
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
240 if isasciistr(s): |
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
241 return s |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
242 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
243 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
244 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
245 return u.encode("utf-8") |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
246 except UnicodeDecodeError as inst: |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
247 sub = s[max(0, inst.start - 10) : inst.start + 10] |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
248 raise error.Abort( |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
249 b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst)) |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
250 ) |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
251 except LookupError as k: |
48030
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
252 raise error.Abort( |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
253 pycompat.bytestr(k), hint=b"please check your locale settings" |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
254 ) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
255 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
256 |
31456
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
257 def unitolocal(u): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
258 # type: (Text) -> bytes |
31456
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
259 """Convert a unicode string to a byte string of local encoding""" |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
260 return tolocal(u.encode('utf-8')) |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
261 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
262 |
31456
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
263 def unifromlocal(s): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
264 # type: (bytes) -> Text |
31456
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
265 """Convert a byte string of local encoding to a unicode string""" |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
266 return fromlocal(s).decode('utf-8') |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
267 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
268 |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
269 def unimethod(bytesfunc): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
270 # type: (Callable[[Any], bytes]) -> Callable[[Any], Text] |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
271 """Create a proxy method that forwards __unicode__() and __str__() of |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
272 Python 3 to __bytes__()""" |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
273 |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
274 def unifunc(obj): |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
275 return unifromlocal(bytesfunc(obj)) |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
276 |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
277 return unifunc |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
278 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
279 |
31457
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
280 # converter functions between native str and byte string. use these if the |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
281 # character encoding is not aware (e.g. exception message) or is known to |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
282 # be locale dependent (e.g. date formatting.) |
48983
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
283 strtolocal = unitolocal |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
284 strfromlocal = unifromlocal |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
285 strmethod = unimethod |
31457
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
286 |
47559
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
287 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
288 def lower(s): |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
289 # type: (bytes) -> bytes |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
290 """best-effort encoding-aware case-folding of local string s""" |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
291 try: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
292 return asciilower(s) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
293 except UnicodeDecodeError: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
294 pass |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
295 try: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
296 if isinstance(s, localstr): |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
297 u = s._utf8.decode("utf-8") |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
298 else: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
299 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
300 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
301 lu = u.lower() |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
302 if u == lu: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
303 return s # preserve localstring |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
304 return lu.encode(_sysstr(encoding)) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
305 except UnicodeError: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
306 return s.lower() # we don't know how to fold this except in ASCII |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
307 except LookupError as k: |
48030
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
308 raise error.Abort( |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
309 pycompat.bytestr(k), hint=b"please check your locale settings" |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
310 ) |
47559
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
311 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
312 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
313 def upper(s): |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
314 # type: (bytes) -> bytes |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
315 """best-effort encoding-aware case-folding of local string s""" |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
316 try: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
317 return asciiupper(s) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
318 except UnicodeDecodeError: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
319 return upperfallback(s) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
320 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
321 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
322 def upperfallback(s): |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
323 # type: (Any) -> Any |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
324 try: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
325 if isinstance(s, localstr): |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
326 u = s._utf8.decode("utf-8") |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
327 else: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
328 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
329 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
330 uu = u.upper() |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
331 if u == uu: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
332 return s # preserve localstring |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
333 return uu.encode(_sysstr(encoding)) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
334 except UnicodeError: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
335 return s.upper() # we don't know how to fold this except in ASCII |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
336 except LookupError as k: |
48030
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
337 raise error.Abort( |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
338 pycompat.bytestr(k), hint=b"please check your locale settings" |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
339 ) |
47559
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
340 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
341 |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
342 if not _nativeenviron: |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
343 # now encoding and helper functions are available, recreate the environ |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
344 # dict to be exported to other modules |
48983
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
345 if pycompat.iswindows: |
47560
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
346 |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
347 class WindowsEnviron(dict): |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
348 """`os.environ` normalizes environment variables to uppercase on windows""" |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
349 |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
350 def get(self, key, default=None): |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
351 return super().get(upper(key), default) |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
352 |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
353 environ = WindowsEnviron() |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
354 |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
355 for k, v in os.environ.items(): # re-exports |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
356 environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8')) |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
357 |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
358 |
47621
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
359 DRIVE_RE = re.compile(b'^[a-z]:') |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
360 |
48983
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
361 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
362 # returns bytes. |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
363 if pycompat.iswindows: |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
364 # Python 3 on Windows issues a DeprecationWarning about using the bytes |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
365 # API when os.getcwdb() is called. |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
366 # |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
367 # Additionally, py3.8+ uppercases the drive letter when calling |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
368 # os.path.realpath(), which is used on ``repo.root``. Since those |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
369 # strings are compared in various places as simple strings, also call |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
370 # realpath here. See https://bugs.python.org/issue40368 |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
371 # |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
372 # However this is not reliable, so lets explicitly make this drive |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
373 # letter upper case. |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
374 # |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
375 # note: we should consider dropping realpath here since it seems to |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
376 # change the semantic of `getcwd`. |
47621
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
377 |
48983
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
378 def getcwd(): |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
379 cwd = os.getcwd() # re-exports |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
380 cwd = os.path.realpath(cwd) |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
381 cwd = strtolocal(cwd) |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
382 if DRIVE_RE.match(cwd): |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
383 cwd = cwd[0:1].upper() + cwd[1:] |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
384 return cwd |
47621
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
385 |
48983
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
386 |
39823
24e493ec2229
py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents:
38823
diff
changeset
|
387 else: |
48983
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
388 getcwd = os.getcwdb # re-exports |
39823
24e493ec2229
py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents:
38823
diff
changeset
|
389 |
12866
eddc20306ab6
encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents:
12770
diff
changeset
|
390 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide. |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
391 _wide = _sysstr( |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
392 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide" |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
393 and b"WFA" |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
394 or b"WF" |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
395 ) |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
396 |
12866
eddc20306ab6
encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents:
12770
diff
changeset
|
397 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
398 def colwidth(s): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
399 # type: (bytes) -> int |
43807
be8552f25cab
cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents:
43793
diff
changeset
|
400 """Find the column width of a string for display in the local encoding""" |
43554
9f70512ae2cf
cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents:
43551
diff
changeset
|
401 return ucolwidth(s.decode(_sysstr(encoding), 'replace')) |
15066
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
402 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
403 |
15066
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
404 def ucolwidth(d): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
405 # type: (Text) -> int |
43807
be8552f25cab
cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents:
43793
diff
changeset
|
406 """Find the column width of a Unicode string for display""" |
14951
61807854004e
encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents:
14069
diff
changeset
|
407 eaw = getattr(unicodedata, 'east_asian_width', None) |
61807854004e
encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents:
14069
diff
changeset
|
408 if eaw is not None: |
32570
044f3d7eb9ae
encoding: make sure "wide" variable never be referenced from other modules
Yuya Nishihara <yuya@tcha.org>
parents:
32562
diff
changeset
|
409 return sum([eaw(c) in _wide and 2 or 1 for c in d]) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
410 return len(d) |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
411 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
412 |
15143
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
413 def getcols(s, start, c): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
414 # type: (bytes, int, int) -> bytes |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
415 """Use colwidth to find a c-column substring of s starting at byte |
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
416 index start""" |
49292
d44e3c45f0e4
py3: replace `pycompat.xrange` by `range`
Manuel Jacob <me@manueljacob.de>
parents:
49037
diff
changeset
|
417 for x in range(start + c, len(s)): |
15143
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
418 t = s[start:x] |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
419 if colwidth(t) == c: |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
420 return t |
43719
7cf332318f62
encoding: make getcols() raise exception explicitly
Yuya Nishihara <yuya@tcha.org>
parents:
43681
diff
changeset
|
421 raise ValueError('substring not found') |
15143
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
422 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
423 |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
424 def trim(s, width, ellipsis=b'', leftside=False): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
425 # type: (bytes, int, bytes, bool) -> bytes |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
426 """Trim string 's' to at most 'width' columns (including 'ellipsis'). |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
427 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
428 If 'leftside' is True, the left side of string 's' is trimmed. |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
429 'ellipsis' is always placed at the trimmed side. |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
430 |
34151
414a3513c2bd
doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents:
34150
diff
changeset
|
431 >>> from .node import bin |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
432 >>> def bprint(s): |
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
433 ... print(pycompat.sysstr(s)) |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
434 >>> ellipsis = b'+++' |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
435 >>> from . import encoding |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
436 >>> encoding.encoding = b'utf-8' |
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
437 >>> t = b'1234567890' |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
438 >>> bprint(trim(t, 12, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
439 1234567890 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
440 >>> bprint(trim(t, 10, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
441 1234567890 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
442 >>> bprint(trim(t, 8, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
443 12345+++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
444 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
445 +++67890 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
446 >>> bprint(trim(t, 8)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
447 12345678 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
448 >>> bprint(trim(t, 8, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
449 34567890 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
450 >>> bprint(trim(t, 3, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
451 +++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
452 >>> bprint(trim(t, 1, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
453 + |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
454 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns |
34150
e9e225f16932
doctest: pass encoding name as system string
Yuya Nishihara <yuya@tcha.org>
parents:
34146
diff
changeset
|
455 >>> t = u.encode(pycompat.sysstr(encoding.encoding)) |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
456 >>> bprint(trim(t, 12, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
457 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
458 >>> bprint(trim(t, 10, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
459 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
460 >>> bprint(trim(t, 8, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
461 \xe3\x81\x82\xe3\x81\x84+++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
462 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
463 +++\xe3\x81\x88\xe3\x81\x8a |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
464 >>> bprint(trim(t, 5)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
465 \xe3\x81\x82\xe3\x81\x84 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
466 >>> bprint(trim(t, 5, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
467 \xe3\x81\x88\xe3\x81\x8a |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
468 >>> bprint(trim(t, 4, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
469 +++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
470 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
471 +++ |
34151
414a3513c2bd
doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents:
34150
diff
changeset
|
472 >>> t = bin(b'112233445566778899aa') # invalid byte sequence |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
473 >>> bprint(trim(t, 12, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
474 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
475 >>> bprint(trim(t, 10, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
476 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
477 >>> bprint(trim(t, 8, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
478 \x11\x22\x33\x44\x55+++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
479 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
480 +++\x66\x77\x88\x99\xaa |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
481 >>> bprint(trim(t, 8)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
482 \x11\x22\x33\x44\x55\x66\x77\x88 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
483 >>> bprint(trim(t, 8, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
484 \x33\x44\x55\x66\x77\x88\x99\xaa |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
485 >>> bprint(trim(t, 3, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
486 +++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
487 >>> bprint(trim(t, 1, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
488 + |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
489 """ |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
490 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
491 u = s.decode(_sysstr(encoding)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
492 except UnicodeDecodeError: |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
493 if len(s) <= width: # trimming is not needed |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
494 return s |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
495 width -= len(ellipsis) |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
496 if width <= 0: # not enough room even for ellipsis |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
497 return ellipsis[: width + len(ellipsis)] |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
498 if leftside: |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
499 return ellipsis + s[-width:] |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
500 return s[:width] + ellipsis |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
501 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
502 if ucolwidth(u) <= width: # trimming is not needed |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
503 return s |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
504 |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
505 width -= len(ellipsis) |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
506 if width <= 0: # not enough room even for ellipsis |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
507 return ellipsis[: width + len(ellipsis)] |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
508 |
48692
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
509 chars = list(u) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
510 if leftside: |
48692
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
511 chars.reverse() |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
512 width_so_far = 0 |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
513 for i, c in enumerate(chars): |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
514 width_so_far += ucolwidth(c) |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
515 if width_so_far > width: |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
516 break |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
517 chars = chars[:i] |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
518 if leftside: |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
519 chars.reverse() |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
520 u = u''.join(chars).encode(_sysstr(encoding)) |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
521 if leftside: |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
522 return ellipsis + u |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
523 return u + ellipsis |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
524 |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
525 |
49037
642e31cb55f0
py3: use class X: instead of class X(object):
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48983
diff
changeset
|
526 class normcasespecs: |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
527 """what a platform's normcase does to ASCII strings |
24593
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
528 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
529 This is specified per platform, and should be consistent with what normcase |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
530 on that platform actually does. |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
531 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
532 lower: normcase lowercases ASCII strings |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
533 upper: normcase uppercases ASCII strings |
24608
1c533e23ce95
util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents:
24597
diff
changeset
|
534 other: the fallback function should always be called |
1c533e23ce95
util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents:
24597
diff
changeset
|
535 |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
536 This should be kept in sync with normcase_spec in util.h.""" |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
537 |
24593
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
538 lower = -1 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
539 upper = 1 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
540 other = 0 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
541 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
542 |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
543 def jsonescape(s, paranoid=False): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
544 # type: (Any, Any) -> Any |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
545 """returns a string suitable for JSON |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
546 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
547 JSON is problematic for us because it doesn't support non-Unicode |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
548 bytes. To deal with this, we take the following approach: |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
549 |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
550 - localstr/safelocalstr objects are converted back to UTF-8 |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
551 - valid UTF-8/ASCII strings are passed as-is |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
552 - other strings are converted to UTF-8b surrogate encoding |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
553 - apply JSON-specified string escaping |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
554 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
555 (escapes are doubled in these tests) |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
556 |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
557 >>> jsonescape(b'this is a test') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
558 'this is a test' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
559 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f') |
27881
ffa599f3f503
encoding: escape U+007F (DEL) character in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
27699
diff
changeset
|
560 'escape characters: \\\\u0000 \\\\u000b \\\\u007f' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
561 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\') |
33943
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33942
diff
changeset
|
562 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
563 >>> jsonescape(b'a weird byte: \\xdd') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
564 'a weird byte: \\xed\\xb3\\x9d' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
565 >>> jsonescape(b'utf-8: caf\\xc3\\xa9') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
566 'utf-8: caf\\xc3\\xa9' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
567 >>> jsonescape(b'') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
568 '' |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
569 |
28069
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
570 If paranoid, non-ascii and common troublesome characters are also escaped. |
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
571 This is suitable for web output. |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
572 |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
573 >>> s = b'escape characters: \\0 \\x0b \\x7f' |
33943
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33942
diff
changeset
|
574 >>> assert jsonescape(s) == jsonescape(s, paranoid=True) |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
575 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\' |
33943
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33942
diff
changeset
|
576 >>> assert jsonescape(s) == jsonescape(s, paranoid=True) |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
577 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
578 'escape boundary: ~ \\\\u007f \\\\u0080' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
579 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
580 'a weird byte: \\\\udcdd' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
581 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
582 'utf-8: caf\\\\u00e9' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
583 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
584 'non-BMP: \\\\ud834\\\\udd1e' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
585 >>> jsonescape(b'<foo@example.org>', paranoid=True) |
28069
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
586 '\\\\u003cfoo@example.org\\\\u003e' |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
587 """ |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
588 |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
589 u8chars = toutf8b(s) |
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
590 try: |
33942
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
591 return _jsonescapeu8fast(u8chars, paranoid) |
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
592 except ValueError: |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
593 pass |
33942
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
594 return charencodepure.jsonescapeu8fallback(u8chars, paranoid) |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
595 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
596 |
34225
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
597 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8 |
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
598 # bytes are mapped to that range. |
48983
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48966
diff
changeset
|
599 _utf8strict = r'surrogatepass' |
34225
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
600 |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
601 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
602 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
603 |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
604 def getutf8char(s, pos): |
43722
83a349aaeba3
typing: constrain argument/return types of encoding.toutf8b()
Yuya Nishihara <yuya@tcha.org>
parents:
43721
diff
changeset
|
605 # type: (bytes, int) -> bytes |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
606 """get the next full utf-8 character in the given string, starting at pos |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
607 |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
608 Raises a UnicodeError if the given location does not start a valid |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
609 utf-8 character. |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
610 """ |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
611 |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
612 # find how many bytes to attempt decoding from first nibble |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
613 l = _utf8len[ord(s[pos : pos + 1]) >> 4] |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
614 if not l: # ascii |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
615 return s[pos : pos + 1] |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
616 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
617 c = s[pos : pos + l] |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
618 # validate with attempted decode |
34225
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
619 c.decode("utf-8", _utf8strict) |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
620 return c |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
621 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
622 |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
623 def toutf8b(s): |
43722
83a349aaeba3
typing: constrain argument/return types of encoding.toutf8b()
Yuya Nishihara <yuya@tcha.org>
parents:
43721
diff
changeset
|
624 # type: (bytes) -> bytes |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
625 """convert a local, possibly-binary string into UTF-8b |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
626 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
627 This is intended as a generic method to preserve data when working |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
628 with schemes like JSON and XML that have no provision for |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
629 arbitrary byte strings. As Mercurial often doesn't know |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
630 what encoding data is in, we use so-called UTF-8b. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
631 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
632 If a string is already valid UTF-8 (or ASCII), it passes unmodified. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
633 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range, |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
634 uDC00-uDCFF. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
635 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
636 Principles of operation: |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
637 |
17424
e7cfe3587ea4
fix trivial spelling errors
Mads Kiilerich <mads@kiilerich.com>
parents:
17236
diff
changeset
|
638 - ASCII and UTF-8 data successfully round-trips and is understood |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
639 by Unicode-oriented clients |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
640 - filenames and file contents in arbitrary other encodings can |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
641 be round-tripped or recovered by clueful clients |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
642 - local strings that have a cached known UTF-8 encoding (aka |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
643 localstr) get sent as UTF-8 so Unicode-oriented clients get the |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
644 Unicode data they want |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
645 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
646 - because we must preserve UTF-8 bytestring in places such as |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
647 filenames, metadata can't be roundtripped without help |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
648 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
649 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
650 arbitrary bytes into an internal Unicode format that can be |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
651 re-encoded back into the original. Here we are exposing the |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
652 internal surrogate encoding as a UTF-8 string.) |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
653 """ |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
654 |
37990
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
655 if isinstance(s, localstr): |
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
656 # assume that the original UTF-8 sequence would never contain |
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
657 # invalid characters in U+DCxx range |
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
658 return s._utf8 |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
659 elif isinstance(s, safelocalstr): |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
660 # already verified that s is non-lossy in legacy encoding, which |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
661 # shouldn't contain characters in U+DCxx range |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
662 return fromlocal(s) |
37990
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
663 elif isasciistr(s): |
33946
6c119dbfd0c0
encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33945
diff
changeset
|
664 return s |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
665 if b"\xed" not in s: |
26879
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
666 try: |
34225
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
667 s.decode('utf-8', _utf8strict) |
26879
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
668 return s |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
669 except UnicodeDecodeError: |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
670 pass |
26878
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
671 |
34223
1c601df9894c
py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents:
34207
diff
changeset
|
672 s = pycompat.bytestr(s) |
50434
95acba2c29f6
encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings
Arseniy Alekseyev <aalekseyev@janestreet.com>
parents:
49292
diff
changeset
|
673 r = bytearray() |
26878
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
674 pos = 0 |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
675 l = len(s) |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
676 while pos < l: |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
677 try: |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
678 c = getutf8char(s, pos) |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
679 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf": |
26879
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
680 # have to re-escape existing U+DCxx characters |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
681 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) |
26879
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
682 pos += 1 |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
683 else: |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
684 pos += len(c) |
26878
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
685 except UnicodeDecodeError: |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
686 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) |
26878
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
687 pos += 1 |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
688 r += c |
50434
95acba2c29f6
encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings
Arseniy Alekseyev <aalekseyev@janestreet.com>
parents:
49292
diff
changeset
|
689 return bytes(r) |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
690 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
691 |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
692 def fromutf8b(s): |
43721
b65fcccd9100
typing: fix argument type of encoding.tolocal() and .fromutf8b()
Yuya Nishihara <yuya@tcha.org>
parents:
43720
diff
changeset
|
693 # type: (bytes) -> bytes |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
694 """Given a UTF-8b string, return a local, possibly-binary string. |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
695 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
696 Return the original binary string. This |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
697 is a round-trip process for strings like filenames, but metadata |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
698 that was passed through tolocal will remain in UTF-8. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
699 |
26963
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
700 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
701 >>> m = b"\\xc3\\xa9\\x99abcd" |
26963
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
702 >>> toutf8b(m) |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
703 '\\xc3\\xa9\\xed\\xb2\\x99abcd' |
26963
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
704 >>> roundtrip(m) |
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
705 True |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
706 >>> roundtrip(b"\\xc2\\xc2\\x80") |
26963
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
707 True |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
708 >>> roundtrip(b"\\xef\\xbf\\xbd") |
26963
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
709 True |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
710 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd") |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
711 True |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
712 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80") |
27699
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
713 True |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
714 """ |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
715 |
33946
6c119dbfd0c0
encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33945
diff
changeset
|
716 if isasciistr(s): |
6c119dbfd0c0
encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33945
diff
changeset
|
717 return s |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
718 # fast path - look for uDxxx prefixes in s |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
719 if b"\xed" not in s: |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
720 return s |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
721 |
27699
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
722 # We could do this with the unicode type but some Python builds |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
723 # use UTF-16 internally (issue5031) which causes non-BMP code |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
724 # points to be escaped. Instead, we use our handy getutf8char |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
725 # helper again to walk the string without "decoding" it. |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
726 |
34223
1c601df9894c
py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents:
34207
diff
changeset
|
727 s = pycompat.bytestr(s) |
50434
95acba2c29f6
encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings
Arseniy Alekseyev <aalekseyev@janestreet.com>
parents:
49292
diff
changeset
|
728 r = bytearray() |
27699
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
729 pos = 0 |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
730 l = len(s) |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
731 while pos < l: |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
732 c = getutf8char(s, pos) |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
733 pos += len(c) |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
734 # unescape U+DCxx characters |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
735 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf": |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
736 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF) |
27699
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
737 r += c |
50434
95acba2c29f6
encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings
Arseniy Alekseyev <aalekseyev@janestreet.com>
parents:
49292
diff
changeset
|
738 return bytes(r) |