mercurial/encoding.py
changeset 48892 fa2b1a46d92e
parent 48875 6000f5b25c9b
child 48946 642e31cb55f0
equal deleted inserted replaced
48891:4eae533354ae 48892:fa2b1a46d92e
    44 asciiupper = charencode.asciiupper
    44 asciiupper = charencode.asciiupper
    45 _jsonescapeu8fast = charencode.jsonescapeu8fast
    45 _jsonescapeu8fast = charencode.jsonescapeu8fast
    46 
    46 
    47 _sysstr = pycompat.sysstr
    47 _sysstr = pycompat.sysstr
    48 
    48 
    49 if pycompat.ispy3:
    49 unichr = chr
    50     unichr = chr
       
    51 
    50 
    52 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
    51 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
    53 # "Unicode Subtleties"), so we need to ignore them in some places for
    52 # "Unicode Subtleties"), so we need to ignore them in some places for
    54 # sanity.
    53 # sanity.
    55 _ignore = [
    54 _ignore = [
    76     return s
    75     return s
    77 
    76 
    78 
    77 
    79 # encoding.environ is provided read-only, which may not be used to modify
    78 # encoding.environ is provided read-only, which may not be used to modify
    80 # the process environment
    79 # the process environment
    81 _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
    80 _nativeenviron = os.supports_bytes_environ
    82 if not pycompat.ispy3:
    81 if _nativeenviron:
    83     environ = os.environ  # re-exports
       
    84 elif _nativeenviron:
       
    85     environ = os.environb  # re-exports
    82     environ = os.environb  # re-exports
    86 else:
    83 else:
    87     # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    84     # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    88     # and recreate it once encoding is settled
    85     # and recreate it once encoding is settled
    89     environ = {
    86     environ = {
    96     b'ANSI_X3.4-1968': b'ascii',
    93     b'ANSI_X3.4-1968': b'ascii',
    97 }
    94 }
    98 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
    95 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
    99 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
    96 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
   100 # https://bugs.python.org/issue13216
    97 # https://bugs.python.org/issue13216
   101 if pycompat.iswindows and not pycompat.ispy3:
    98 if pycompat.iswindows:
   102     _encodingrewrites[b'cp65001'] = b'utf-8'
    99     _encodingrewrites[b'cp65001'] = b'utf-8'
   103 
   100 
   104 try:
   101 try:
   105     encoding = environ.get(b"HGENCODING")
   102     encoding = environ.get(b"HGENCODING")
   106     if not encoding:
   103     if not encoding:
   268 
   265 
   269 
   266 
   270 # converter functions between native str and byte string. use these if the
   267 # converter functions between native str and byte string. use these if the
   271 # character encoding is not aware (e.g. exception message) or is known to
   268 # character encoding is not aware (e.g. exception message) or is known to
   272 # be locale dependent (e.g. date formatting.)
   269 # be locale dependent (e.g. date formatting.)
   273 if pycompat.ispy3:
   270 strtolocal = unitolocal
   274     strtolocal = unitolocal
   271 strfromlocal = unifromlocal
   275     strfromlocal = unifromlocal
   272 strmethod = unimethod
   276     strmethod = unimethod
       
   277 else:
       
   278 
       
   279     def strtolocal(s):
       
   280         # type: (str) -> bytes
       
   281         return s  # pytype: disable=bad-return-type
       
   282 
       
   283     def strfromlocal(s):
       
   284         # type: (bytes) -> str
       
   285         return s  # pytype: disable=bad-return-type
       
   286 
       
   287     strmethod = pycompat.identity
       
   288 
   273 
   289 
   274 
   290 def lower(s):
   275 def lower(s):
   291     # type: (bytes) -> bytes
   276     # type: (bytes) -> bytes
   292     """best-effort encoding-aware case-folding of local string s"""
   277     """best-effort encoding-aware case-folding of local string s"""
   342 
   327 
   343 
   328 
   344 if not _nativeenviron:
   329 if not _nativeenviron:
   345     # now encoding and helper functions are available, recreate the environ
   330     # now encoding and helper functions are available, recreate the environ
   346     # dict to be exported to other modules
   331     # dict to be exported to other modules
   347     if pycompat.iswindows and pycompat.ispy3:
   332     if pycompat.iswindows:
   348 
   333 
   349         class WindowsEnviron(dict):
   334         class WindowsEnviron(dict):
   350             """`os.environ` normalizes environment variables to uppercase on windows"""
   335             """`os.environ` normalizes environment variables to uppercase on windows"""
   351 
   336 
   352             def get(self, key, default=None):
   337             def get(self, key, default=None):
   358         environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
   343         environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
   359 
   344 
   360 
   345 
   361 DRIVE_RE = re.compile(b'^[a-z]:')
   346 DRIVE_RE = re.compile(b'^[a-z]:')
   362 
   347 
   363 if pycompat.ispy3:
   348 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
   364     # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
   349 # returns bytes.
   365     # returns bytes.
   350 if pycompat.iswindows:
   366     if pycompat.iswindows:
   351     # Python 3 on Windows issues a DeprecationWarning about using the bytes
   367         # Python 3 on Windows issues a DeprecationWarning about using the bytes
   352     # API when os.getcwdb() is called.
   368         # API when os.getcwdb() is called.
   353     #
   369         #
   354     # Additionally, py3.8+ uppercases the drive letter when calling
   370         # Additionally, py3.8+ uppercases the drive letter when calling
   355     # os.path.realpath(), which is used on ``repo.root``.  Since those
   371         # os.path.realpath(), which is used on ``repo.root``.  Since those
   356     # strings are compared in various places as simple strings, also call
   372         # strings are compared in various places as simple strings, also call
   357     # realpath here.  See https://bugs.python.org/issue40368
   373         # realpath here.  See https://bugs.python.org/issue40368
   358     #
   374         #
   359     # However this is not reliable, so let's explicitly make this drive
   375         # However this is not reliable, so let's explicitly make this drive
   360     # letter upper case.
   376         # letter upper case.
   361     #
   377         #
   362     # note: we should consider dropping realpath here since it seems to
   378         # note: we should consider dropping realpath here since it seems to
   363     # change the semantics of `getcwd`.
   379         # change the semantics of `getcwd`.
   364 
   380 
   365     def getcwd():
   381         def getcwd():
   366         cwd = os.getcwd()  # re-exports
   382             cwd = os.getcwd()  # re-exports
   367         cwd = os.path.realpath(cwd)
   383             cwd = os.path.realpath(cwd)
   368         cwd = strtolocal(cwd)
   384             cwd = strtolocal(cwd)
   369         if DRIVE_RE.match(cwd):
   385             if DRIVE_RE.match(cwd):
   370             cwd = cwd[0:1].upper() + cwd[1:]
   386                 cwd = cwd[0:1].upper() + cwd[1:]
   371         return cwd
   387             return cwd
   372 
   388 
   373 
   389     else:
       
   390         getcwd = os.getcwdb  # re-exports
       
   391 else:
   374 else:
   392     getcwd = os.getcwd  # re-exports
   375     getcwd = os.getcwdb  # re-exports
   393 
   376 
   394 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
   377 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
   395 _wide = _sysstr(
   378 _wide = _sysstr(
   396     environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
   379     environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
   397     and b"WFA"
   380     and b"WFA"
   598     return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
   581     return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
   599 
   582 
   600 
   583 
   601 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
   584 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
   602 # bytes are mapped to that range.
   585 # bytes are mapped to that range.
   603 if pycompat.ispy3:
   586 _utf8strict = r'surrogatepass'
   604     _utf8strict = r'surrogatepass'
       
   605 else:
       
   606     _utf8strict = r'strict'
       
   607 
   587 
   608 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
   588 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
   609 
   589 
   610 
   590 
   611 def getutf8char(s, pos):
   591 def getutf8char(s, pos):