Mercurial > public > mercurial-scm > hg-stable
comparison mercurial/encoding.py @ 43077:687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Done with:
python3.7 contrib/byteify-strings.py -i $(hg files 'set:mercurial/**.py - mercurial/thirdparty/** + hgext/**.py - hgext/fsmonitor/pywatchman/** - mercurial/__init__.py')
black -l 80 -t py33 -S $(hg files 'set:**.py - mercurial/thirdparty/** - "contrib/python-zstandard/**" - hgext/fsmonitor/pywatchman/**')
# skip-blame mass-reformatting only
Differential Revision: https://phab.mercurial-scm.org/D6972
author | Augie Fackler <augie@google.com> |
---|---|
date | Sun, 06 Oct 2019 09:48:39 -0400 |
parents | 2372284d9457 |
children | c59eb1560c44 |
comparison
equal
deleted
inserted
replaced
43076:2372284d9457 | 43077:687b865b95ad |
---|---|
34 # These unicode characters are ignored by HFS+ (Apple Technote 1150, | 34 # These unicode characters are ignored by HFS+ (Apple Technote 1150, |
35 # "Unicode Subtleties"), so we need to ignore them in some places for | 35 # "Unicode Subtleties"), so we need to ignore them in some places for |
36 # sanity. | 36 # sanity. |
37 _ignore = [ | 37 _ignore = [ |
38 unichr(int(x, 16)).encode("utf-8") | 38 unichr(int(x, 16)).encode("utf-8") |
39 for x in "200c 200d 200e 200f 202a 202b 202c 202d 202e " | 39 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e " |
40 "206a 206b 206c 206d 206e 206f feff".split() | 40 b"206a 206b 206c 206d 206e 206f feff".split() |
41 ] | 41 ] |
42 # verify the next function will work | 42 # verify the next function will work |
43 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore) | 43 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore) |
44 | 44 |
45 | 45 |
46 def hfsignoreclean(s): | 46 def hfsignoreclean(s): |
47 """Remove codepoints ignored by HFS+ from s. | 47 """Remove codepoints ignored by HFS+ from s. |
48 | 48 |
49 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8')) | 49 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8')) |
50 '.hg' | 50 '.hg' |
51 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8')) | 51 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8')) |
52 '.hg' | 52 '.hg' |
53 """ | 53 """ |
54 if "\xe2" in s or "\xef" in s: | 54 if b"\xe2" in s or b"\xef" in s: |
55 for c in _ignore: | 55 for c in _ignore: |
56 s = s.replace(c, '') | 56 s = s.replace(c, b'') |
57 return s | 57 return s |
58 | 58 |
59 | 59 |
60 # encoding.environ is provided read-only, which may not be used to modify | 60 # encoding.environ is provided read-only, which may not be used to modify |
61 # the process environment | 61 # the process environment |
71 (k.encode(r'utf-8'), v.encode(r'utf-8')) | 71 (k.encode(r'utf-8'), v.encode(r'utf-8')) |
72 for k, v in os.environ.items() # re-exports | 72 for k, v in os.environ.items() # re-exports |
73 ) | 73 ) |
74 | 74 |
75 _encodingrewrites = { | 75 _encodingrewrites = { |
76 '646': 'ascii', | 76 b'646': b'ascii', |
77 'ANSI_X3.4-1968': 'ascii', | 77 b'ANSI_X3.4-1968': b'ascii', |
78 } | 78 } |
79 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2. | 79 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2. |
80 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3. | 80 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3. |
81 # https://bugs.python.org/issue13216 | 81 # https://bugs.python.org/issue13216 |
82 if pycompat.iswindows and not pycompat.ispy3: | 82 if pycompat.iswindows and not pycompat.ispy3: |
83 _encodingrewrites['cp65001'] = 'utf-8' | 83 _encodingrewrites[b'cp65001'] = b'utf-8' |
84 | 84 |
85 try: | 85 try: |
86 encoding = environ.get("HGENCODING") | 86 encoding = environ.get(b"HGENCODING") |
87 if not encoding: | 87 if not encoding: |
88 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii' | 88 encoding = locale.getpreferredencoding().encode('ascii') or b'ascii' |
89 encoding = _encodingrewrites.get(encoding, encoding) | 89 encoding = _encodingrewrites.get(encoding, encoding) |
90 except locale.Error: | 90 except locale.Error: |
91 encoding = 'ascii' | 91 encoding = b'ascii' |
92 encodingmode = environ.get("HGENCODINGMODE", "strict") | 92 encodingmode = environ.get(b"HGENCODINGMODE", b"strict") |
93 fallbackencoding = 'ISO-8859-1' | 93 fallbackencoding = b'ISO-8859-1' |
94 | 94 |
95 | 95 |
96 class localstr(bytes): | 96 class localstr(bytes): |
97 '''This class allows strings that are unmodified to be | 97 '''This class allows strings that are unmodified to be |
98 round-tripped to the local encoding and back''' | 98 round-tripped to the local encoding and back''' |
156 | 156 |
157 try: | 157 try: |
158 try: | 158 try: |
159 # make sure string is actually stored in UTF-8 | 159 # make sure string is actually stored in UTF-8 |
160 u = s.decode('UTF-8') | 160 u = s.decode('UTF-8') |
161 if encoding == 'UTF-8': | 161 if encoding == b'UTF-8': |
162 # fast path | 162 # fast path |
163 return s | 163 return s |
164 r = u.encode(_sysstr(encoding), r"replace") | 164 r = u.encode(_sysstr(encoding), r"replace") |
165 if u == r.decode(_sysstr(encoding)): | 165 if u == r.decode(_sysstr(encoding)): |
166 # r is a safe, non-lossy encoding of s | 166 # r is a safe, non-lossy encoding of s |
178 except UnicodeDecodeError: | 178 except UnicodeDecodeError: |
179 u = s.decode("utf-8", "replace") # last ditch | 179 u = s.decode("utf-8", "replace") # last ditch |
180 # can't round-trip | 180 # can't round-trip |
181 return u.encode(_sysstr(encoding), r"replace") | 181 return u.encode(_sysstr(encoding), r"replace") |
182 except LookupError as k: | 182 except LookupError as k: |
183 raise error.Abort(k, hint="please check your locale settings") | 183 raise error.Abort(k, hint=b"please check your locale settings") |
184 | 184 |
185 | 185 |
186 def fromlocal(s): | 186 def fromlocal(s): |
187 """ | 187 """ |
188 Convert a string from the local character encoding to UTF-8 | 188 Convert a string from the local character encoding to UTF-8 |
204 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) | 204 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
205 return u.encode("utf-8") | 205 return u.encode("utf-8") |
206 except UnicodeDecodeError as inst: | 206 except UnicodeDecodeError as inst: |
207 sub = s[max(0, inst.start - 10) : inst.start + 10] | 207 sub = s[max(0, inst.start - 10) : inst.start + 10] |
208 raise error.Abort( | 208 raise error.Abort( |
209 "decoding near '%s': %s!" % (sub, pycompat.bytestr(inst)) | 209 b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst)) |
210 ) | 210 ) |
211 except LookupError as k: | 211 except LookupError as k: |
212 raise error.Abort(k, hint="please check your locale settings") | 212 raise error.Abort(k, hint=b"please check your locale settings") |
213 | 213 |
214 | 214 |
215 def unitolocal(u): | 215 def unitolocal(u): |
216 """Convert a unicode string to a byte string of local encoding""" | 216 """Convert a unicode string to a byte string of local encoding""" |
217 return tolocal(u.encode('utf-8')) | 217 return tolocal(u.encode('utf-8')) |
264 else: | 264 else: |
265 getcwd = os.getcwd # re-exports | 265 getcwd = os.getcwd # re-exports |
266 | 266 |
267 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide. | 267 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide. |
268 _wide = _sysstr( | 268 _wide = _sysstr( |
269 environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" and "WFA" or "WF" | 269 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide" |
270 and b"WFA" | |
271 or b"WF" | |
270 ) | 272 ) |
271 | 273 |
272 | 274 |
273 def colwidth(s): | 275 def colwidth(s): |
274 "Find the column width of a string for display in the local encoding" | 276 b"Find the column width of a string for display in the local encoding" |
275 return ucolwidth(s.decode(_sysstr(encoding), r'replace')) | 277 return ucolwidth(s.decode(_sysstr(encoding), r'replace')) |
276 | 278 |
277 | 279 |
278 def ucolwidth(d): | 280 def ucolwidth(d): |
279 "Find the column width of a Unicode string for display" | 281 b"Find the column width of a Unicode string for display" |
280 eaw = getattr(unicodedata, 'east_asian_width', None) | 282 eaw = getattr(unicodedata, 'east_asian_width', None) |
281 if eaw is not None: | 283 if eaw is not None: |
282 return sum([eaw(c) in _wide and 2 or 1 for c in d]) | 284 return sum([eaw(c) in _wide and 2 or 1 for c in d]) |
283 return len(d) | 285 return len(d) |
284 | 286 |
290 t = s[start:x] | 292 t = s[start:x] |
291 if colwidth(t) == c: | 293 if colwidth(t) == c: |
292 return t | 294 return t |
293 | 295 |
294 | 296 |
295 def trim(s, width, ellipsis='', leftside=False): | 297 def trim(s, width, ellipsis=b'', leftside=False): |
296 """Trim string 's' to at most 'width' columns (including 'ellipsis'). | 298 """Trim string 's' to at most 'width' columns (including 'ellipsis'). |
297 | 299 |
298 If 'leftside' is True, left side of string 's' is trimmed. | 300 If 'leftside' is True, left side of string 's' is trimmed. |
299 'ellipsis' is always placed at trimmed side. | 301 'ellipsis' is always placed at trimmed side. |
300 | 302 |
388 return concat(usub.encode(_sysstr(encoding))) | 390 return concat(usub.encode(_sysstr(encoding))) |
389 return ellipsis # not enough room for multi-column characters | 391 return ellipsis # not enough room for multi-column characters |
390 | 392 |
391 | 393 |
392 def lower(s): | 394 def lower(s): |
393 "best-effort encoding-aware case-folding of local string s" | 395 b"best-effort encoding-aware case-folding of local string s" |
394 try: | 396 try: |
395 return asciilower(s) | 397 return asciilower(s) |
396 except UnicodeDecodeError: | 398 except UnicodeDecodeError: |
397 pass | 399 pass |
398 try: | 400 try: |
406 return s # preserve localstring | 408 return s # preserve localstring |
407 return lu.encode(_sysstr(encoding)) | 409 return lu.encode(_sysstr(encoding)) |
408 except UnicodeError: | 410 except UnicodeError: |
409 return s.lower() # we don't know how to fold this except in ASCII | 411 return s.lower() # we don't know how to fold this except in ASCII |
410 except LookupError as k: | 412 except LookupError as k: |
411 raise error.Abort(k, hint="please check your locale settings") | 413 raise error.Abort(k, hint=b"please check your locale settings") |
412 | 414 |
413 | 415 |
414 def upper(s): | 416 def upper(s): |
415 "best-effort encoding-aware case-folding of local string s" | 417 b"best-effort encoding-aware case-folding of local string s" |
416 try: | 418 try: |
417 return asciiupper(s) | 419 return asciiupper(s) |
418 except UnicodeDecodeError: | 420 except UnicodeDecodeError: |
419 return upperfallback(s) | 421 return upperfallback(s) |
420 | 422 |
431 return s # preserve localstring | 433 return s # preserve localstring |
432 return uu.encode(_sysstr(encoding)) | 434 return uu.encode(_sysstr(encoding)) |
433 except UnicodeError: | 435 except UnicodeError: |
434 return s.upper() # we don't know how to fold this except in ASCII | 436 return s.upper() # we don't know how to fold this except in ASCII |
435 except LookupError as k: | 437 except LookupError as k: |
436 raise error.Abort(k, hint="please check your locale settings") | 438 raise error.Abort(k, hint=b"please check your locale settings") |
437 | 439 |
438 | 440 |
439 class normcasespecs(object): | 441 class normcasespecs(object): |
440 '''what a platform's normcase does to ASCII strings | 442 '''what a platform's normcase does to ASCII strings |
441 | 443 |
573 # already verified that s is non-lossy in legacy encoding, which | 575 # already verified that s is non-lossy in legacy encoding, which |
574 # shouldn't contain characters in U+DCxx range | 576 # shouldn't contain characters in U+DCxx range |
575 return fromlocal(s) | 577 return fromlocal(s) |
576 elif isasciistr(s): | 578 elif isasciistr(s): |
577 return s | 579 return s |
578 if "\xed" not in s: | 580 if b"\xed" not in s: |
579 try: | 581 try: |
580 s.decode('utf-8', _utf8strict) | 582 s.decode('utf-8', _utf8strict) |
581 return s | 583 return s |
582 except UnicodeDecodeError: | 584 except UnicodeDecodeError: |
583 pass | 585 pass |
584 | 586 |
585 s = pycompat.bytestr(s) | 587 s = pycompat.bytestr(s) |
586 r = "" | 588 r = b"" |
587 pos = 0 | 589 pos = 0 |
588 l = len(s) | 590 l = len(s) |
589 while pos < l: | 591 while pos < l: |
590 try: | 592 try: |
591 c = getutf8char(s, pos) | 593 c = getutf8char(s, pos) |
592 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": | 594 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf": |
593 # have to re-escape existing U+DCxx characters | 595 # have to re-escape existing U+DCxx characters |
594 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) | 596 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) |
595 pos += 1 | 597 pos += 1 |
596 else: | 598 else: |
597 pos += len(c) | 599 pos += len(c) |
626 ''' | 628 ''' |
627 | 629 |
628 if isasciistr(s): | 630 if isasciistr(s): |
629 return s | 631 return s |
630 # fast path - look for uDxxx prefixes in s | 632 # fast path - look for uDxxx prefixes in s |
631 if "\xed" not in s: | 633 if b"\xed" not in s: |
632 return s | 634 return s |
633 | 635 |
634 # We could do this with the unicode type but some Python builds | 636 # We could do this with the unicode type but some Python builds |
635 # use UTF-16 internally (issue5031) which causes non-BMP code | 637 # use UTF-16 internally (issue5031) which causes non-BMP code |
636 # points to be escaped. Instead, we use our handy getutf8char | 638 # points to be escaped. Instead, we use our handy getutf8char |
637 # helper again to walk the string without "decoding" it. | 639 # helper again to walk the string without "decoding" it. |
638 | 640 |
639 s = pycompat.bytestr(s) | 641 s = pycompat.bytestr(s) |
640 r = "" | 642 r = b"" |
641 pos = 0 | 643 pos = 0 |
642 l = len(s) | 644 l = len(s) |
643 while pos < l: | 645 while pos < l: |
644 c = getutf8char(s, pos) | 646 c = getutf8char(s, pos) |
645 pos += len(c) | 647 pos += len(c) |
646 # unescape U+DCxx characters | 648 # unescape U+DCxx characters |
647 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": | 649 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf": |
648 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF) | 650 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF) |
649 r += c | 651 r += c |
650 return r | 652 return r |