comparison mercurial/encoding.py @ 43544:2ade00f3b03b

encoding: add comment-based type hints for pytype. Differential Revision: https://phab.mercurial-scm.org/D7275
author Augie Fackler <augie@google.com>
date Wed, 06 Nov 2019 14:48:34 -0500
parents 5f2a8dabb0d8
children 313e3a279828
comparison
equal deleted inserted replaced
43543:daade078f1f0 43544:2ade00f3b03b
17 policy, 17 policy,
18 pycompat, 18 pycompat,
19 ) 19 )
20 20
21 from .pure import charencode as charencodepure 21 from .pure import charencode as charencodepure
22
23 if not globals(): # hide this from non-pytype users
24 from typing import (
25 Any,
26 Callable,
27 List,
28 Text,
29 Type,
30 TypeVar,
31 Union,
32 )
33
34 # keep pyflakes happy
35 for t in (Any, Callable, List, Text, Type, Union):
36 assert t
37
38 _Tlocalstr = TypeVar('_Tlocalstr', bound=localstr)
22 39
23 charencode = policy.importmod(r'charencode') 40 charencode = policy.importmod(r'charencode')
24 41
25 isasciistr = charencode.isasciistr 42 isasciistr = charencode.isasciistr
26 asciilower = charencode.asciilower 43 asciilower = charencode.asciilower
43 # verify the next function will work 60 # verify the next function will work
44 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore) 61 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
45 62
46 63
47 def hfsignoreclean(s): 64 def hfsignoreclean(s):
65 # type: (bytes) -> bytes
48 """Remove codepoints ignored by HFS+ from s. 66 """Remove codepoints ignored by HFS+ from s.
49 67
50 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8')) 68 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
51 '.hg' 69 '.hg'
52 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8')) 70 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
97 class localstr(bytes): 115 class localstr(bytes):
98 '''This class allows strings that are unmodified to be 116 '''This class allows strings that are unmodified to be
99 round-tripped to the local encoding and back''' 117 round-tripped to the local encoding and back'''
100 118
101 def __new__(cls, u, l): 119 def __new__(cls, u, l):
120 # type: (Type[_Tlocalstr], Text, bytes) -> _Tlocalstr
102 s = bytes.__new__(cls, l) 121 s = bytes.__new__(cls, l)
103 s._utf8 = u 122 s._utf8 = u
104 return s 123 return s
105 124
106 def __hash__(self): 125 def __hash__(self):
117 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0} 136 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
118 """ 137 """
119 138
120 139
121 def tolocal(s): 140 def tolocal(s):
141 # type: (Text) -> bytes
122 """ 142 """
123 Convert a string from internal UTF-8 to local encoding 143 Convert a string from internal UTF-8 to local encoding
124 144
125 All internal strings should be UTF-8 but some repos before the 145 All internal strings should be UTF-8 but some repos before the
126 implementation of locale support may contain latin1 or possibly 146 implementation of locale support may contain latin1 or possibly
183 except LookupError as k: 203 except LookupError as k:
184 raise error.Abort(k, hint=b"please check your locale settings") 204 raise error.Abort(k, hint=b"please check your locale settings")
185 205
186 206
187 def fromlocal(s): 207 def fromlocal(s):
208 # type: (bytes) -> Text
188 """ 209 """
189 Convert a string from the local character encoding to UTF-8 210 Convert a string from the local character encoding to UTF-8
190 211
191 We attempt to decode strings using the encoding mode set by 212 We attempt to decode strings using the encoding mode set by
192 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown 213 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
212 except LookupError as k: 233 except LookupError as k:
213 raise error.Abort(k, hint=b"please check your locale settings") 234 raise error.Abort(k, hint=b"please check your locale settings")
214 235
215 236
216 def unitolocal(u): 237 def unitolocal(u):
238 # type: (Text) -> bytes
217 """Convert a unicode string to a byte string of local encoding""" 239 """Convert a unicode string to a byte string of local encoding"""
218 return tolocal(u.encode('utf-8')) 240 return tolocal(u.encode('utf-8'))
219 241
220 242
221 def unifromlocal(s): 243 def unifromlocal(s):
244 # type: (bytes) -> Text
222 """Convert a byte string of local encoding to a unicode string""" 245 """Convert a byte string of local encoding to a unicode string"""
223 return fromlocal(s).decode('utf-8') 246 return fromlocal(s).decode('utf-8')
224 247
225 248
226 def unimethod(bytesfunc): 249 def unimethod(bytesfunc):
250 # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
227 """Create a proxy method that forwards __unicode__() and __str__() of 251 """Create a proxy method that forwards __unicode__() and __str__() of
228 Python 3 to __bytes__()""" 252 Python 3 to __bytes__()"""
229 253
230 def unifunc(obj): 254 def unifunc(obj):
231 return unifromlocal(bytesfunc(obj)) 255 return unifromlocal(bytesfunc(obj))
279 or b"WF" 303 or b"WF"
280 ) 304 )
281 305
282 306
283 def colwidth(s): 307 def colwidth(s):
308 # type: (bytes) -> int
284 b"Find the column width of a string for display in the local encoding" 309 b"Find the column width of a string for display in the local encoding"
285 return ucolwidth(s.decode(_sysstr(encoding), r'replace')) 310 return ucolwidth(s.decode(_sysstr(encoding), r'replace'))
286 311
287 312
288 def ucolwidth(d): 313 def ucolwidth(d):
314 # type: (Text) -> int
289 b"Find the column width of a Unicode string for display" 315 b"Find the column width of a Unicode string for display"
290 eaw = getattr(unicodedata, 'east_asian_width', None) 316 eaw = getattr(unicodedata, 'east_asian_width', None)
291 if eaw is not None: 317 if eaw is not None:
292 return sum([eaw(c) in _wide and 2 or 1 for c in d]) 318 return sum([eaw(c) in _wide and 2 or 1 for c in d])
293 return len(d) 319 return len(d)
294 320
295 321
296 def getcols(s, start, c): 322 def getcols(s, start, c):
323 # type: (bytes, int, int) -> bytes
297 '''Use colwidth to find a c-column substring of s starting at byte 324 '''Use colwidth to find a c-column substring of s starting at byte
298 index start''' 325 index start'''
299 for x in pycompat.xrange(start + c, len(s)): 326 for x in pycompat.xrange(start + c, len(s)):
300 t = s[start:x] 327 t = s[start:x]
301 if colwidth(t) == c: 328 if colwidth(t) == c:
302 return t 329 return t
303 330
304 331
305 def trim(s, width, ellipsis=b'', leftside=False): 332 def trim(s, width, ellipsis=b'', leftside=False):
333 # type: (bytes, int, bytes, bool) -> bytes
306 """Trim string 's' to at most 'width' columns (including 'ellipsis'). 334 """Trim string 's' to at most 'width' columns (including 'ellipsis').
307 335
308 If 'leftside' is True, left side of string 's' is trimmed. 336 If 'leftside' is True, left side of string 's' is trimmed.
309 'ellipsis' is always placed at trimmed side. 337 'ellipsis' is always placed at trimmed side.
310 338
398 return concat(usub.encode(_sysstr(encoding))) 426 return concat(usub.encode(_sysstr(encoding)))
399 return ellipsis # not enough room for multi-column characters 427 return ellipsis # not enough room for multi-column characters
400 428
401 429
402 def lower(s): 430 def lower(s):
431 # type: (bytes) -> bytes
403 b"best-effort encoding-aware case-folding of local string s" 432 b"best-effort encoding-aware case-folding of local string s"
404 try: 433 try:
405 return asciilower(s) 434 return asciilower(s)
406 except UnicodeDecodeError: 435 except UnicodeDecodeError:
407 pass 436 pass
420 except LookupError as k: 449 except LookupError as k:
421 raise error.Abort(k, hint=b"please check your locale settings") 450 raise error.Abort(k, hint=b"please check your locale settings")
422 451
423 452
424 def upper(s): 453 def upper(s):
454 # type: (bytes) -> bytes
425 b"best-effort encoding-aware case-folding of local string s" 455 b"best-effort encoding-aware case-folding of local string s"
426 try: 456 try:
427 return asciiupper(s) 457 return asciiupper(s)
428 except UnicodeDecodeError: 458 except UnicodeDecodeError:
429 return upperfallback(s) 459 return upperfallback(s)
430 460
431 461
432 def upperfallback(s): 462 def upperfallback(s):
463 # type: (Any) -> Any
433 try: 464 try:
434 if isinstance(s, localstr): 465 if isinstance(s, localstr):
435 u = s._utf8.decode("utf-8") 466 u = s._utf8.decode("utf-8")
436 else: 467 else:
437 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) 468 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
462 upper = 1 493 upper = 1
463 other = 0 494 other = 0
464 495
465 496
466 def jsonescape(s, paranoid=False): 497 def jsonescape(s, paranoid=False):
498 # type: (Any, Any) -> Any
467 '''returns a string suitable for JSON 499 '''returns a string suitable for JSON
468 500
469 JSON is problematic for us because it doesn't support non-Unicode 501 JSON is problematic for us because it doesn't support non-Unicode
470 bytes. To deal with this, we take the following approach: 502 bytes. To deal with this, we take the following approach:
471 503
525 557
526 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] 558 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
527 559
528 560
529 def getutf8char(s, pos): 561 def getutf8char(s, pos):
562 # type: (Any, Any) -> Any
530 '''get the next full utf-8 character in the given string, starting at pos 563 '''get the next full utf-8 character in the given string, starting at pos
531 564
532 Raises a UnicodeError if the given location does not start a valid 565 Raises a UnicodeError if the given location does not start a valid
533 utf-8 character. 566 utf-8 character.
534 ''' 567 '''
543 c.decode("utf-8", _utf8strict) 576 c.decode("utf-8", _utf8strict)
544 return c 577 return c
545 578
546 579
547 def toutf8b(s): 580 def toutf8b(s):
581 # type: (Any) -> Any
548 '''convert a local, possibly-binary string into UTF-8b 582 '''convert a local, possibly-binary string into UTF-8b
549 583
550 This is intended as a generic method to preserve data when working 584 This is intended as a generic method to preserve data when working
551 with schemes like JSON and XML that have no provision for 585 with schemes like JSON and XML that have no provision for
552 arbitrary byte strings. As Mercurial often doesn't know 586 arbitrary byte strings. As Mercurial often doesn't know
611 r += c 645 r += c
612 return r 646 return r
613 647
614 648
615 def fromutf8b(s): 649 def fromutf8b(s):
650 # type: (Text) -> bytes
616 '''Given a UTF-8b string, return a local, possibly-binary string. 651 '''Given a UTF-8b string, return a local, possibly-binary string.
617 652
618 return the original binary string. This 653 return the original binary string. This
619 is a round-trip process for strings like filenames, but metadata 654 is a round-trip process for strings like filenames, but metadata
620 that was passed through tolocal will remain in UTF-8. 655 that was passed through tolocal will remain in UTF-8.