comparison mercurial/encoding.py @ 43077:687b865b95ad

formatting: byteify all mercurial/ and hgext/ string literals

Done with

python3.7 contrib/byteify-strings.py -i $(hg files 'set:mercurial/**.py - mercurial/thirdparty/** + hgext/**.py - hgext/fsmonitor/pywatchman/** - mercurial/__init__.py')
black -l 80 -t py33 -S $(hg files 'set:**.py - mercurial/thirdparty/** - "contrib/python-zstandard/**" - hgext/fsmonitor/pywatchman/**')

# skip-blame mass-reformatting only

Differential Revision: https://phab.mercurial-scm.org/D6972
author Augie Fackler <augie@google.com>
date Sun, 06 Oct 2019 09:48:39 -0400
parents 2372284d9457
children c59eb1560c44
comparison
equal deleted inserted replaced
43076:2372284d9457 43077:687b865b95ad
34 # These unicode characters are ignored by HFS+ (Apple Technote 1150, 34 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
35 # "Unicode Subtleties"), so we need to ignore them in some places for 35 # "Unicode Subtleties"), so we need to ignore them in some places for
36 # sanity. 36 # sanity.
37 _ignore = [ 37 _ignore = [
38 unichr(int(x, 16)).encode("utf-8") 38 unichr(int(x, 16)).encode("utf-8")
39 for x in "200c 200d 200e 200f 202a 202b 202c 202d 202e " 39 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
40 "206a 206b 206c 206d 206e 206f feff".split() 40 b"206a 206b 206c 206d 206e 206f feff".split()
41 ] 41 ]
42 # verify the next function will work 42 # verify the next function will work
43 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore) 43 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
44 44
45 45
46 def hfsignoreclean(s): 46 def hfsignoreclean(s):
47 """Remove codepoints ignored by HFS+ from s. 47 """Remove codepoints ignored by HFS+ from s.
48 48
49 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8')) 49 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
50 '.hg' 50 '.hg'
51 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8')) 51 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
52 '.hg' 52 '.hg'
53 """ 53 """
54 if "\xe2" in s or "\xef" in s: 54 if b"\xe2" in s or b"\xef" in s:
55 for c in _ignore: 55 for c in _ignore:
56 s = s.replace(c, '') 56 s = s.replace(c, b'')
57 return s 57 return s
58 58
59 59
60 # encoding.environ is provided read-only, which may not be used to modify 60 # encoding.environ is provided read-only, which may not be used to modify
61 # the process environment 61 # the process environment
71 (k.encode(r'utf-8'), v.encode(r'utf-8')) 71 (k.encode(r'utf-8'), v.encode(r'utf-8'))
72 for k, v in os.environ.items() # re-exports 72 for k, v in os.environ.items() # re-exports
73 ) 73 )
74 74
75 _encodingrewrites = { 75 _encodingrewrites = {
76 '646': 'ascii', 76 b'646': b'ascii',
77 'ANSI_X3.4-1968': 'ascii', 77 b'ANSI_X3.4-1968': b'ascii',
78 } 78 }
79 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2. 79 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
80 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3. 80 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
81 # https://bugs.python.org/issue13216 81 # https://bugs.python.org/issue13216
82 if pycompat.iswindows and not pycompat.ispy3: 82 if pycompat.iswindows and not pycompat.ispy3:
83 _encodingrewrites['cp65001'] = 'utf-8' 83 _encodingrewrites[b'cp65001'] = b'utf-8'
84 84
85 try: 85 try:
86 encoding = environ.get("HGENCODING") 86 encoding = environ.get(b"HGENCODING")
87 if not encoding: 87 if not encoding:
88 encoding = locale.getpreferredencoding().encode('ascii') or 'ascii' 88 encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
89 encoding = _encodingrewrites.get(encoding, encoding) 89 encoding = _encodingrewrites.get(encoding, encoding)
90 except locale.Error: 90 except locale.Error:
91 encoding = 'ascii' 91 encoding = b'ascii'
92 encodingmode = environ.get("HGENCODINGMODE", "strict") 92 encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
93 fallbackencoding = 'ISO-8859-1' 93 fallbackencoding = b'ISO-8859-1'
94 94
95 95
96 class localstr(bytes): 96 class localstr(bytes):
97 '''This class allows strings that are unmodified to be 97 '''This class allows strings that are unmodified to be
98 round-tripped to the local encoding and back''' 98 round-tripped to the local encoding and back'''
156 156
157 try: 157 try:
158 try: 158 try:
159 # make sure string is actually stored in UTF-8 159 # make sure string is actually stored in UTF-8
160 u = s.decode('UTF-8') 160 u = s.decode('UTF-8')
161 if encoding == 'UTF-8': 161 if encoding == b'UTF-8':
162 # fast path 162 # fast path
163 return s 163 return s
164 r = u.encode(_sysstr(encoding), r"replace") 164 r = u.encode(_sysstr(encoding), r"replace")
165 if u == r.decode(_sysstr(encoding)): 165 if u == r.decode(_sysstr(encoding)):
166 # r is a safe, non-lossy encoding of s 166 # r is a safe, non-lossy encoding of s
178 except UnicodeDecodeError: 178 except UnicodeDecodeError:
179 u = s.decode("utf-8", "replace") # last ditch 179 u = s.decode("utf-8", "replace") # last ditch
180 # can't round-trip 180 # can't round-trip
181 return u.encode(_sysstr(encoding), r"replace") 181 return u.encode(_sysstr(encoding), r"replace")
182 except LookupError as k: 182 except LookupError as k:
183 raise error.Abort(k, hint="please check your locale settings") 183 raise error.Abort(k, hint=b"please check your locale settings")
184 184
185 185
186 def fromlocal(s): 186 def fromlocal(s):
187 """ 187 """
188 Convert a string from the local character encoding to UTF-8 188 Convert a string from the local character encoding to UTF-8
204 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) 204 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
205 return u.encode("utf-8") 205 return u.encode("utf-8")
206 except UnicodeDecodeError as inst: 206 except UnicodeDecodeError as inst:
207 sub = s[max(0, inst.start - 10) : inst.start + 10] 207 sub = s[max(0, inst.start - 10) : inst.start + 10]
208 raise error.Abort( 208 raise error.Abort(
209 "decoding near '%s': %s!" % (sub, pycompat.bytestr(inst)) 209 b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
210 ) 210 )
211 except LookupError as k: 211 except LookupError as k:
212 raise error.Abort(k, hint="please check your locale settings") 212 raise error.Abort(k, hint=b"please check your locale settings")
213 213
214 214
215 def unitolocal(u): 215 def unitolocal(u):
216 """Convert a unicode string to a byte string of local encoding""" 216 """Convert a unicode string to a byte string of local encoding"""
217 return tolocal(u.encode('utf-8')) 217 return tolocal(u.encode('utf-8'))
264 else: 264 else:
265 getcwd = os.getcwd # re-exports 265 getcwd = os.getcwd # re-exports
266 266
267 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide. 267 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
268 _wide = _sysstr( 268 _wide = _sysstr(
269 environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" and "WFA" or "WF" 269 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
270 and b"WFA"
271 or b"WF"
270 ) 272 )
271 273
272 274
273 def colwidth(s): 275 def colwidth(s):
274 "Find the column width of a string for display in the local encoding" 276 b"Find the column width of a string for display in the local encoding"
275 return ucolwidth(s.decode(_sysstr(encoding), r'replace')) 277 return ucolwidth(s.decode(_sysstr(encoding), r'replace'))
276 278
277 279
278 def ucolwidth(d): 280 def ucolwidth(d):
279 "Find the column width of a Unicode string for display" 281 b"Find the column width of a Unicode string for display"
280 eaw = getattr(unicodedata, 'east_asian_width', None) 282 eaw = getattr(unicodedata, 'east_asian_width', None)
281 if eaw is not None: 283 if eaw is not None:
282 return sum([eaw(c) in _wide and 2 or 1 for c in d]) 284 return sum([eaw(c) in _wide and 2 or 1 for c in d])
283 return len(d) 285 return len(d)
284 286
290 t = s[start:x] 292 t = s[start:x]
291 if colwidth(t) == c: 293 if colwidth(t) == c:
292 return t 294 return t
293 295
294 296
295 def trim(s, width, ellipsis='', leftside=False): 297 def trim(s, width, ellipsis=b'', leftside=False):
296 """Trim string 's' to at most 'width' columns (including 'ellipsis'). 298 """Trim string 's' to at most 'width' columns (including 'ellipsis').
297 299
298 If 'leftside' is True, left side of string 's' is trimmed. 300 If 'leftside' is True, left side of string 's' is trimmed.
299 'ellipsis' is always placed at trimmed side. 301 'ellipsis' is always placed at trimmed side.
300 302
388 return concat(usub.encode(_sysstr(encoding))) 390 return concat(usub.encode(_sysstr(encoding)))
389 return ellipsis # no enough room for multi-column characters 391 return ellipsis # no enough room for multi-column characters
390 392
391 393
392 def lower(s): 394 def lower(s):
393 "best-effort encoding-aware case-folding of local string s" 395 b"best-effort encoding-aware case-folding of local string s"
394 try: 396 try:
395 return asciilower(s) 397 return asciilower(s)
396 except UnicodeDecodeError: 398 except UnicodeDecodeError:
397 pass 399 pass
398 try: 400 try:
406 return s # preserve localstring 408 return s # preserve localstring
407 return lu.encode(_sysstr(encoding)) 409 return lu.encode(_sysstr(encoding))
408 except UnicodeError: 410 except UnicodeError:
409 return s.lower() # we don't know how to fold this except in ASCII 411 return s.lower() # we don't know how to fold this except in ASCII
410 except LookupError as k: 412 except LookupError as k:
411 raise error.Abort(k, hint="please check your locale settings") 413 raise error.Abort(k, hint=b"please check your locale settings")
412 414
413 415
414 def upper(s): 416 def upper(s):
415 "best-effort encoding-aware case-folding of local string s" 417 b"best-effort encoding-aware case-folding of local string s"
416 try: 418 try:
417 return asciiupper(s) 419 return asciiupper(s)
418 except UnicodeDecodeError: 420 except UnicodeDecodeError:
419 return upperfallback(s) 421 return upperfallback(s)
420 422
431 return s # preserve localstring 433 return s # preserve localstring
432 return uu.encode(_sysstr(encoding)) 434 return uu.encode(_sysstr(encoding))
433 except UnicodeError: 435 except UnicodeError:
434 return s.upper() # we don't know how to fold this except in ASCII 436 return s.upper() # we don't know how to fold this except in ASCII
435 except LookupError as k: 437 except LookupError as k:
436 raise error.Abort(k, hint="please check your locale settings") 438 raise error.Abort(k, hint=b"please check your locale settings")
437 439
438 440
439 class normcasespecs(object): 441 class normcasespecs(object):
440 '''what a platform's normcase does to ASCII strings 442 '''what a platform's normcase does to ASCII strings
441 443
573 # already verified that s is non-lossy in legacy encoding, which 575 # already verified that s is non-lossy in legacy encoding, which
574 # shouldn't contain characters in U+DCxx range 576 # shouldn't contain characters in U+DCxx range
575 return fromlocal(s) 577 return fromlocal(s)
576 elif isasciistr(s): 578 elif isasciistr(s):
577 return s 579 return s
578 if "\xed" not in s: 580 if b"\xed" not in s:
579 try: 581 try:
580 s.decode('utf-8', _utf8strict) 582 s.decode('utf-8', _utf8strict)
581 return s 583 return s
582 except UnicodeDecodeError: 584 except UnicodeDecodeError:
583 pass 585 pass
584 586
585 s = pycompat.bytestr(s) 587 s = pycompat.bytestr(s)
586 r = "" 588 r = b""
587 pos = 0 589 pos = 0
588 l = len(s) 590 l = len(s)
589 while pos < l: 591 while pos < l:
590 try: 592 try:
591 c = getutf8char(s, pos) 593 c = getutf8char(s, pos)
592 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": 594 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
593 # have to re-escape existing U+DCxx characters 595 # have to re-escape existing U+DCxx characters
594 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) 596 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
595 pos += 1 597 pos += 1
596 else: 598 else:
597 pos += len(c) 599 pos += len(c)
626 ''' 628 '''
627 629
628 if isasciistr(s): 630 if isasciistr(s):
629 return s 631 return s
630 # fast path - look for uDxxx prefixes in s 632 # fast path - look for uDxxx prefixes in s
631 if "\xed" not in s: 633 if b"\xed" not in s:
632 return s 634 return s
633 635
634 # We could do this with the unicode type but some Python builds 636 # We could do this with the unicode type but some Python builds
635 # use UTF-16 internally (issue5031) which causes non-BMP code 637 # use UTF-16 internally (issue5031) which causes non-BMP code
636 # points to be escaped. Instead, we use our handy getutf8char 638 # points to be escaped. Instead, we use our handy getutf8char
637 # helper again to walk the string without "decoding" it. 639 # helper again to walk the string without "decoding" it.
638 640
639 s = pycompat.bytestr(s) 641 s = pycompat.bytestr(s)
640 r = "" 642 r = b""
641 pos = 0 643 pos = 0
642 l = len(s) 644 l = len(s)
643 while pos < l: 645 while pos < l:
644 c = getutf8char(s, pos) 646 c = getutf8char(s, pos)
645 pos += len(c) 647 pos += len(c)
646 # unescape U+DCxx characters 648 # unescape U+DCxx characters
647 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": 649 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
648 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF) 650 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
649 r += c 651 r += c
650 return r 652 return r