unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in "200c 200d 200e 200f 202a 202b 202c 202d 202e "
    "206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
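
# Illustration (not from the module itself): the assert above holds because
# every codepoint listed in _ignore encodes to a UTF-8 sequence whose lead
# byte is 0xe2 or 0xef, which is also what lets hfsignoreclean() below
# short-circuit on a cheap substring test.
#
#   >>> u'\u200c'.encode('utf-8')   # ZERO WIDTH NON-JOINER
#   '\xe2\x80\x8c'
#   >>> u'\ufeff'.encode('utf-8')   # ZERO WIDTH NO-BREAK SPACE (BOM)
#   '\xef\xbb\xbf'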


def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    """
    if "\xe2" in s or "\xef" in s:
        for c in _ignore:
            s = s.replace(c, '')
    return s


# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = dict(
        (k.encode(r'utf-8'), v.encode(r'utf-8'))
        for k, v in os.environ.items()  # re-exports
    )

_encodingrewrites = {
    '646': 'ascii',
    'ANSI_X3.4-1968': 'ascii',
}
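
# For illustration: the table above maps codeset names that some platforms
# report for the C/POSIX locale onto the canonical spelling 'ascii',
# presumably so later comparisons against the encoding name behave the same
# everywhere.  Names that are not listed pass through unchanged:
#
#   >>> _encodingrewrites.get('ANSI_X3.4-1968', 'ANSI_X3.4-1968')
#   'ascii'
#   >>> _encodingrewrites.get('UTF-8', 'UTF-8')
#   'UTF-8'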
except locale.Error:
    encoding = 'ascii'
encodingmode = environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'


class localstr(bytes):
    '''This class allows strings that are unmodified to be
    round-tripped to the local encoding and back'''

    def __new__(cls, u, l):
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
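
# For illustration (assuming a latin-1 locale): a localstr carries both the
# local-encoding bytes it behaves as and the original UTF-8 bytes, so a value
# converted by tolocal() can later be restored exactly.  Hypothetical session:
#
#   >>> s = localstr(u'caf\xe9'.encode('utf-8'), u'caf\xe9'.encode('latin-1'))
#   >>> s                  # behaves as the local-encoding bytes
#   'caf\xe9'
#   >>> s._utf8            # the lossless UTF-8 form rides along
#   'caf\xc3\xa9'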


class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """


def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            return localstr(u.encode('UTF-8'), r)
        except UnicodeDecodeError:
            u = s.decode("utf-8", "replace")  # last ditch
            # can't round-trip
            return u.encode(_sysstr(encoding), r"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
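
# For illustration (assuming a latin-1 locale): tolocal() returns a
# safelocalstr when the local encoding can hold the UTF-8 input losslessly,
# and otherwise a localstr that caches the original UTF-8 bytes next to the
# lossy local form, so the original can still be recovered later.
#
#   >>> tolocal(u'caf\xe9'.encode('utf-8'))    # representable: safelocalstr
#   'caf\xe9'
#   >>> tolocal(u'\u3042'.encode('utf-8'))     # not representable: localstr
#   '?'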


def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            "decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")


def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding"""
    return tolocal(u.encode('utf-8'))


def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string"""
    return fromlocal(s).decode('utf-8')


def unimethod(bytesfunc):
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        return unifromlocal(bytesfunc(obj))

    return unifunc
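
# For illustration (hypothetical class, not part of this module): a class
# that already knows how to render itself as bytes can reuse that logic for
# its text representation on Python 3:
#
#   class example(object):
#       def __bytes__(self):
#           return b'an example'
#       __str__ = unimethod(__bytes__)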


# converter functions between native str and byte string. use these if the
# character encoding is not known (e.g. exception messages) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strmethod = pycompat.identity

if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    environ = dict(
        (tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
        for k, v in os.environ.items()  # re-exports
    )

if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
_wide = _sysstr(
    environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" and "WFA" or "WF"
)


def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    return ucolwidth(s.decode(_sysstr(encoding), r'replace'))


def ucolwidth(d):
    "Find the column width of a Unicode string for display"
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        return sum([eaw(c) in _wide and 2 or 1 for c in d])
    return len(d)
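
# For illustration: _wide is "WF" by default, so only characters whose
# east_asian_width() is Wide ('W') or Fullwidth ('F') count as two columns;
# setting HGENCODINGAMBIGUOUS=wide adds Ambiguous ('A') characters as well.
#
#   >>> ucolwidth(u'ab')          # two narrow characters
#   2
#   >>> ucolwidth(u'\u3042b')     # HIRAGANA LETTER A is 'W': 2 + 1
#   3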


def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start'''
    for x in pycompat.xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t
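
# For illustration: with pure ASCII input every byte is one column, so the
# scan succeeds on the first candidate slice it tries.
#
#   >>> getcols(b'abcdef', 1, 3)
#   'bcd'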


def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.

    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # not enough room even for the ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # not enough room even for the ellipsis
        return ellipsis[: width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # not enough room for multi-column characters
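
# Worked example (illustrative; ASCII input, so each character is one
# column): one column is reserved for the ellipsis, and the loop keeps
# shortening the string until what remains fits in the rest.
#
#   >>> trim(b'abcdef', 4, ellipsis=b'+')
#   'abc+'
#   >>> trim(b'abcdef', 4, ellipsis=b'+', leftside=True)
#   '+def'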


def lower(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        return asciilower(s)
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")


def upper(s):
    "best-effort encoding-aware case-folding of local string s"
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)


def upperfallback(s):
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")


class normcasespecs(object):
    '''what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h.'''

    lower = -1
    upper = 1
    other = 0
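
# For illustration (hypothetical platform module, not part of this file): a
# platform whose normcase() lowercases ASCII paths would advertise that as
#
#   normcase = lambda path: path.lower()
#   normcasespec = normcasespecs.lower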


def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)


# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]


def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not l:  # ascii
        return s[pos : pos + 1]

    c = s[pos : pos + l]
    # validate with attempted decode
    c.decode("utf-8", _utf8strict)
    return c
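
# For illustration: the high nibble of the lead byte indexes _utf8len above.
# Nibbles 0x0-0x7 (ASCII) give 0, 0xc-0xd give 2 bytes, 0xe gives 3 and 0xf
# gives 4; stray continuation bytes (0x8-0xb) give 1, so the validating
# decode in getutf8char() raises for them.
#
#   >>> getutf8char(b'caf\xc3\xa9', 3)   # 0xc3 >> 4 == 0xc, a 2-byte sequence
#   '\xc3\xa9'
#   >>> getutf8char(b'caf\xc3\xa9', 0)   # plain ASCII
#   'c'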


def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
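
# For illustration: bytes that do not form valid UTF-8 are smuggled into the
# result one byte at a time as U+DCxx codepoints, which is what lets
# fromutf8b() below undo the transformation exactly.
#
#   >>> toutf8b(b'caf\xe9')               # the lone 0xe9 is not valid UTF-8
#   'caf\xed\xb3\xa9'
#   >>> fromutf8b(toutf8b(b'caf\xe9')) == b'caf\xe9'
#   True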


def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This