mercurial/encoding.py
changeset 43076 2372284d9457
parent 41836 25694a78e4a4
child 43077 687b865b95ad
equal deleted inserted replaced
43075:57875cf423c9 43076:2372284d9457
    15     error,
    15     error,
    16     policy,
    16     policy,
    17     pycompat,
    17     pycompat,
    18 )
    18 )
    19 
    19 
    20 from .pure import (
    20 from .pure import charencode as charencodepure
    21     charencode as charencodepure,
       
    22 )
       
    23 
    21 
    24 charencode = policy.importmod(r'charencode')
    22 charencode = policy.importmod(r'charencode')
    25 
    23 
    26 isasciistr = charencode.isasciistr
    24 isasciistr = charencode.isasciistr
    27 asciilower = charencode.asciilower
    25 asciilower = charencode.asciilower
    34     unichr = chr
    32     unichr = chr
    35 
    33 
    36 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
    34 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
    37 # "Unicode Subtleties"), so we need to ignore them in some places for
    35 # "Unicode Subtleties"), so we need to ignore them in some places for
    38 # sanity.
    36 # sanity.
    39 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in
    37 _ignore = [
    40            "200c 200d 200e 200f 202a 202b 202c 202d 202e "
    38     unichr(int(x, 16)).encode("utf-8")
    41            "206a 206b 206c 206d 206e 206f feff".split()]
    39     for x in "200c 200d 200e 200f 202a 202b 202c 202d 202e "
       
    40     "206a 206b 206c 206d 206e 206f feff".split()
       
    41 ]
    42 # verify the next function will work
    42 # verify the next function will work
    43 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
    43 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
       
    44 
    44 
    45 
    45 def hfsignoreclean(s):
    46 def hfsignoreclean(s):
    46     """Remove codepoints ignored by HFS+ from s.
    47     """Remove codepoints ignored by HFS+ from s.
    47 
    48 
    48     >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    49     >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    53     if "\xe2" in s or "\xef" in s:
    54     if "\xe2" in s or "\xef" in s:
    54         for c in _ignore:
    55         for c in _ignore:
    55             s = s.replace(c, '')
    56             s = s.replace(c, '')
    56     return s
    57     return s
    57 
    58 
       
    59 
    58 # encoding.environ is provided read-only, which may not be used to modify
    60 # encoding.environ is provided read-only, which may not be used to modify
    59 # the process environment
    61 # the process environment
    60 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
    62 _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
    61 if not pycompat.ispy3:
    63 if not pycompat.ispy3:
    62     environ = os.environ  # re-exports
    64     environ = os.environ  # re-exports
    63 elif _nativeenviron:
    65 elif _nativeenviron:
    64     environ = os.environb  # re-exports
    66     environ = os.environb  # re-exports
    65 else:
    67 else:
    66     # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    68     # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    67     # and recreate it once encoding is settled
    69     # and recreate it once encoding is settled
    68     environ = dict((k.encode(r'utf-8'), v.encode(r'utf-8'))
    70     environ = dict(
    69                    for k, v in os.environ.items())  # re-exports
    71         (k.encode(r'utf-8'), v.encode(r'utf-8'))
       
    72         for k, v in os.environ.items()  # re-exports
       
    73     )
    70 
    74 
    71 _encodingrewrites = {
    75 _encodingrewrites = {
    72     '646': 'ascii',
    76     '646': 'ascii',
    73     'ANSI_X3.4-1968': 'ascii',
    77     'ANSI_X3.4-1968': 'ascii',
    74 }
    78 }
    86 except locale.Error:
    90 except locale.Error:
    87     encoding = 'ascii'
    91     encoding = 'ascii'
    88 encodingmode = environ.get("HGENCODINGMODE", "strict")
    92 encodingmode = environ.get("HGENCODINGMODE", "strict")
    89 fallbackencoding = 'ISO-8859-1'
    93 fallbackencoding = 'ISO-8859-1'
    90 
    94 
       
    95 
    91 class localstr(bytes):
    96 class localstr(bytes):
    92     '''This class allows strings that are unmodified to be
    97     '''This class allows strings that are unmodified to be
    93     round-tripped to the local encoding and back'''
    98     round-tripped to the local encoding and back'''
       
    99 
    94     def __new__(cls, u, l):
   100     def __new__(cls, u, l):
    95         s = bytes.__new__(cls, l)
   101         s = bytes.__new__(cls, l)
    96         s._utf8 = u
   102         s._utf8 = u
    97         return s
   103         return s
       
   104 
    98     def __hash__(self):
   105     def __hash__(self):
    99         return hash(self._utf8) # avoid collisions in local string space
   106         return hash(self._utf8)  # avoid collisions in local string space
       
   107 
   100 
   108 
   101 class safelocalstr(bytes):
   109 class safelocalstr(bytes):
   102     """Tagged string denoting it was previously an internal UTF-8 string,
   110     """Tagged string denoting it was previously an internal UTF-8 string,
   103     and can be converted back to UTF-8 losslessly
   111     and can be converted back to UTF-8 losslessly
   104 
   112 
   105     >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
   113     >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
   106     >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
   114     >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
   107     >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
   115     >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
   108     >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
   116     >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
   109     """
   117     """
       
   118 
   110 
   119 
   111 def tolocal(s):
   120 def tolocal(s):
   112     """
   121     """
   113     Convert a string from internal UTF-8 to local encoding
   122     Convert a string from internal UTF-8 to local encoding
   114 
   123 
   165                 if u == r.decode(_sysstr(encoding)):
   174                 if u == r.decode(_sysstr(encoding)):
   166                     # r is a safe, non-lossy encoding of s
   175                     # r is a safe, non-lossy encoding of s
   167                     return safelocalstr(r)
   176                     return safelocalstr(r)
   168                 return localstr(u.encode('UTF-8'), r)
   177                 return localstr(u.encode('UTF-8'), r)
   169             except UnicodeDecodeError:
   178             except UnicodeDecodeError:
   170                 u = s.decode("utf-8", "replace") # last ditch
   179                 u = s.decode("utf-8", "replace")  # last ditch
   171                 # can't round-trip
   180                 # can't round-trip
   172                 return u.encode(_sysstr(encoding), r"replace")
   181                 return u.encode(_sysstr(encoding), r"replace")
   173     except LookupError as k:
   182     except LookupError as k:
   174         raise error.Abort(k, hint="please check your locale settings")
   183         raise error.Abort(k, hint="please check your locale settings")
       
   184 
   175 
   185 
   176 def fromlocal(s):
   186 def fromlocal(s):
   177     """
   187     """
   178     Convert a string from the local character encoding to UTF-8
   188     Convert a string from the local character encoding to UTF-8
   179 
   189 
   192 
   202 
   193     try:
   203     try:
   194         u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
   204         u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
   195         return u.encode("utf-8")
   205         return u.encode("utf-8")
   196     except UnicodeDecodeError as inst:
   206     except UnicodeDecodeError as inst:
   197         sub = s[max(0, inst.start - 10):inst.start + 10]
   207         sub = s[max(0, inst.start - 10) : inst.start + 10]
   198         raise error.Abort("decoding near '%s': %s!"
   208         raise error.Abort(
   199                           % (sub, pycompat.bytestr(inst)))
   209             "decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
       
   210         )
   200     except LookupError as k:
   211     except LookupError as k:
   201         raise error.Abort(k, hint="please check your locale settings")
   212         raise error.Abort(k, hint="please check your locale settings")
       
   213 
   202 
   214 
   203 def unitolocal(u):
   215 def unitolocal(u):
   204     """Convert a unicode string to a byte string of local encoding"""
   216     """Convert a unicode string to a byte string of local encoding"""
   205     return tolocal(u.encode('utf-8'))
   217     return tolocal(u.encode('utf-8'))
   206 
   218 
       
   219 
   207 def unifromlocal(s):
   220 def unifromlocal(s):
   208     """Convert a byte string of local encoding to a unicode string"""
   221     """Convert a byte string of local encoding to a unicode string"""
   209     return fromlocal(s).decode('utf-8')
   222     return fromlocal(s).decode('utf-8')
   210 
   223 
       
   224 
   211 def unimethod(bytesfunc):
   225 def unimethod(bytesfunc):
   212     """Create a proxy method that forwards __unicode__() and __str__() of
   226     """Create a proxy method that forwards __unicode__() and __str__() of
   213     Python 3 to __bytes__()"""
   227     Python 3 to __bytes__()"""
       
   228 
   214     def unifunc(obj):
   229     def unifunc(obj):
   215         return unifromlocal(bytesfunc(obj))
   230         return unifromlocal(bytesfunc(obj))
       
   231 
   216     return unifunc
   232     return unifunc
       
   233 
   217 
   234 
   218 # converter functions between native str and byte string. use these if the
   235 # converter functions between native str and byte string. use these if the
   219 # character encoding is not aware (e.g. exception message) or is known to
   236 # character encoding is not aware (e.g. exception message) or is known to
   220 # be locale dependent (e.g. date formatting.)
   237 # be locale dependent (e.g. date formatting.)
   221 if pycompat.ispy3:
   238 if pycompat.ispy3:
   228     strmethod = pycompat.identity
   245     strmethod = pycompat.identity
   229 
   246 
   230 if not _nativeenviron:
   247 if not _nativeenviron:
   231     # now encoding and helper functions are available, recreate the environ
   248     # now encoding and helper functions are available, recreate the environ
   232     # dict to be exported to other modules
   249     # dict to be exported to other modules
   233     environ = dict((tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
   250     environ = dict(
   234                    for k, v in os.environ.items())  # re-exports
   251         (tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
       
   252         for k, v in os.environ.items()  # re-exports
       
   253     )
   235 
   254 
   236 if pycompat.ispy3:
   255 if pycompat.ispy3:
   237     # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
   256     # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
   238     # returns bytes.
   257     # returns bytes.
   239     if pycompat.iswindows:
   258     if pycompat.iswindows:
   244         getcwd = os.getcwdb  # re-exports
   263         getcwd = os.getcwdb  # re-exports
   245 else:
   264 else:
   246     getcwd = os.getcwd  # re-exports
   265     getcwd = os.getcwd  # re-exports
   247 
   266 
   248 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
   267 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
   249 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
   268 _wide = _sysstr(
   250                 and "WFA" or "WF")
   269     environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" and "WFA" or "WF"
       
   270 )
       
   271 
   251 
   272 
   252 def colwidth(s):
   273 def colwidth(s):
   253     "Find the column width of a string for display in the local encoding"
   274     "Find the column width of a string for display in the local encoding"
   254     return ucolwidth(s.decode(_sysstr(encoding), r'replace'))
   275     return ucolwidth(s.decode(_sysstr(encoding), r'replace'))
       
   276 
   255 
   277 
   256 def ucolwidth(d):
   278 def ucolwidth(d):
   257     "Find the column width of a Unicode string for display"
   279     "Find the column width of a Unicode string for display"
   258     eaw = getattr(unicodedata, 'east_asian_width', None)
   280     eaw = getattr(unicodedata, 'east_asian_width', None)
   259     if eaw is not None:
   281     if eaw is not None:
   260         return sum([eaw(c) in _wide and 2 or 1 for c in d])
   282         return sum([eaw(c) in _wide and 2 or 1 for c in d])
   261     return len(d)
   283     return len(d)
       
   284 
   262 
   285 
   263 def getcols(s, start, c):
   286 def getcols(s, start, c):
   264     '''Use colwidth to find a c-column substring of s starting at byte
   287     '''Use colwidth to find a c-column substring of s starting at byte
   265     index start'''
   288     index start'''
   266     for x in pycompat.xrange(start + c, len(s)):
   289     for x in pycompat.xrange(start + c, len(s)):
   267         t = s[start:x]
   290         t = s[start:x]
   268         if colwidth(t) == c:
   291         if colwidth(t) == c:
   269             return t
   292             return t
       
   293 
   270 
   294 
   271 def trim(s, width, ellipsis='', leftside=False):
   295 def trim(s, width, ellipsis='', leftside=False):
   272     """Trim string 's' to at most 'width' columns (including 'ellipsis').
   296     """Trim string 's' to at most 'width' columns (including 'ellipsis').
   273 
   297 
   274     If 'leftside' is True, left side of string 's' is trimmed.
   298     If 'leftside' is True, left side of string 's' is trimmed.
   334     +
   358     +
   335     """
   359     """
   336     try:
   360     try:
   337         u = s.decode(_sysstr(encoding))
   361         u = s.decode(_sysstr(encoding))
   338     except UnicodeDecodeError:
   362     except UnicodeDecodeError:
   339         if len(s) <= width: # trimming is not needed
   363         if len(s) <= width:  # trimming is not needed
   340             return s
   364             return s
   341         width -= len(ellipsis)
   365         width -= len(ellipsis)
   342         if width <= 0: # no enough room even for ellipsis
   366         if width <= 0:  # no enough room even for ellipsis
   343             return ellipsis[:width + len(ellipsis)]
   367             return ellipsis[: width + len(ellipsis)]
   344         if leftside:
   368         if leftside:
   345             return ellipsis + s[-width:]
   369             return ellipsis + s[-width:]
   346         return s[:width] + ellipsis
   370         return s[:width] + ellipsis
   347 
   371 
   348     if ucolwidth(u) <= width: # trimming is not needed
   372     if ucolwidth(u) <= width:  # trimming is not needed
   349         return s
   373         return s
   350 
   374 
   351     width -= len(ellipsis)
   375     width -= len(ellipsis)
   352     if width <= 0: # no enough room even for ellipsis
   376     if width <= 0:  # no enough room even for ellipsis
   353         return ellipsis[:width + len(ellipsis)]
   377         return ellipsis[: width + len(ellipsis)]
   354 
   378 
   355     if leftside:
   379     if leftside:
   356         uslice = lambda i: u[i:]
   380         uslice = lambda i: u[i:]
   357         concat = lambda s: ellipsis + s
   381         concat = lambda s: ellipsis + s
   358     else:
   382     else:
   360         concat = lambda s: s + ellipsis
   384         concat = lambda s: s + ellipsis
   361     for i in pycompat.xrange(1, len(u)):
   385     for i in pycompat.xrange(1, len(u)):
   362         usub = uslice(i)
   386         usub = uslice(i)
   363         if ucolwidth(usub) <= width:
   387         if ucolwidth(usub) <= width:
   364             return concat(usub.encode(_sysstr(encoding)))
   388             return concat(usub.encode(_sysstr(encoding)))
   365     return ellipsis # no enough room for multi-column characters
   389     return ellipsis  # no enough room for multi-column characters
       
   390 
   366 
   391 
   367 def lower(s):
   392 def lower(s):
   368     "best-effort encoding-aware case-folding of local string s"
   393     "best-effort encoding-aware case-folding of local string s"
   369     try:
   394     try:
   370         return asciilower(s)
   395         return asciilower(s)
   376         else:
   401         else:
   377             u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
   402             u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
   378 
   403 
   379         lu = u.lower()
   404         lu = u.lower()
   380         if u == lu:
   405         if u == lu:
   381             return s # preserve localstring
   406             return s  # preserve localstring
   382         return lu.encode(_sysstr(encoding))
   407         return lu.encode(_sysstr(encoding))
   383     except UnicodeError:
   408     except UnicodeError:
   384         return s.lower() # we don't know how to fold this except in ASCII
   409         return s.lower()  # we don't know how to fold this except in ASCII
   385     except LookupError as k:
   410     except LookupError as k:
   386         raise error.Abort(k, hint="please check your locale settings")
   411         raise error.Abort(k, hint="please check your locale settings")
       
   412 
   387 
   413 
   388 def upper(s):
   414 def upper(s):
   389     "best-effort encoding-aware case-folding of local string s"
   415     "best-effort encoding-aware case-folding of local string s"
   390     try:
   416     try:
   391         return asciiupper(s)
   417         return asciiupper(s)
   392     except UnicodeDecodeError:
   418     except UnicodeDecodeError:
   393         return upperfallback(s)
   419         return upperfallback(s)
       
   420 
   394 
   421 
   395 def upperfallback(s):
   422 def upperfallback(s):
   396     try:
   423     try:
   397         if isinstance(s, localstr):
   424         if isinstance(s, localstr):
   398             u = s._utf8.decode("utf-8")
   425             u = s._utf8.decode("utf-8")
   399         else:
   426         else:
   400             u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
   427             u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
   401 
   428 
   402         uu = u.upper()
   429         uu = u.upper()
   403         if u == uu:
   430         if u == uu:
   404             return s # preserve localstring
   431             return s  # preserve localstring
   405         return uu.encode(_sysstr(encoding))
   432         return uu.encode(_sysstr(encoding))
   406     except UnicodeError:
   433     except UnicodeError:
   407         return s.upper() # we don't know how to fold this except in ASCII
   434         return s.upper()  # we don't know how to fold this except in ASCII
   408     except LookupError as k:
   435     except LookupError as k:
   409         raise error.Abort(k, hint="please check your locale settings")
   436         raise error.Abort(k, hint="please check your locale settings")
       
   437 
   410 
   438 
   411 class normcasespecs(object):
   439 class normcasespecs(object):
   412     '''what a platform's normcase does to ASCII strings
   440     '''what a platform's normcase does to ASCII strings
   413 
   441 
   414     This is specified per platform, and should be consistent with what normcase
   442     This is specified per platform, and should be consistent with what normcase
   417     lower: normcase lowercases ASCII strings
   445     lower: normcase lowercases ASCII strings
   418     upper: normcase uppercases ASCII strings
   446     upper: normcase uppercases ASCII strings
   419     other: the fallback function should always be called
   447     other: the fallback function should always be called
   420 
   448 
   421     This should be kept in sync with normcase_spec in util.h.'''
   449     This should be kept in sync with normcase_spec in util.h.'''
       
   450 
   422     lower = -1
   451     lower = -1
   423     upper = 1
   452     upper = 1
   424     other = 0
   453     other = 0
       
   454 
   425 
   455 
   426 def jsonescape(s, paranoid=False):
   456 def jsonescape(s, paranoid=False):
   427     '''returns a string suitable for JSON
   457     '''returns a string suitable for JSON
   428 
   458 
   429     JSON is problematic for us because it doesn't support non-Unicode
   459     JSON is problematic for us because it doesn't support non-Unicode
   473         return _jsonescapeu8fast(u8chars, paranoid)
   503         return _jsonescapeu8fast(u8chars, paranoid)
   474     except ValueError:
   504     except ValueError:
   475         pass
   505         pass
   476     return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
   506     return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
   477 
   507 
       
   508 
   478 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
   509 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
   479 # bytes are mapped to that range.
   510 # bytes are mapped to that range.
   480 if pycompat.ispy3:
   511 if pycompat.ispy3:
   481     _utf8strict = r'surrogatepass'
   512     _utf8strict = r'surrogatepass'
   482 else:
   513 else:
   483     _utf8strict = r'strict'
   514     _utf8strict = r'strict'
   484 
   515 
   485 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
   516 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
   486 
   517 
       
   518 
   487 def getutf8char(s, pos):
   519 def getutf8char(s, pos):
   488     '''get the next full utf-8 character in the given string, starting at pos
   520     '''get the next full utf-8 character in the given string, starting at pos
   489 
   521 
   490     Raises a UnicodeError if the given location does not start a valid
   522     Raises a UnicodeError if the given location does not start a valid
   491     utf-8 character.
   523     utf-8 character.
   492     '''
   524     '''
   493 
   525 
   494     # find how many bytes to attempt decoding from first nibble
   526     # find how many bytes to attempt decoding from first nibble
   495     l = _utf8len[ord(s[pos:pos + 1]) >> 4]
   527     l = _utf8len[ord(s[pos : pos + 1]) >> 4]
   496     if not l: # ascii
   528     if not l:  # ascii
   497         return s[pos:pos + 1]
   529         return s[pos : pos + 1]
   498 
   530 
   499     c = s[pos:pos + l]
   531     c = s[pos : pos + l]
   500     # validate with attempted decode
   532     # validate with attempted decode
   501     c.decode("utf-8", _utf8strict)
   533     c.decode("utf-8", _utf8strict)
   502     return c
   534     return c
       
   535 
   503 
   536 
   504 def toutf8b(s):
   537 def toutf8b(s):
   505     '''convert a local, possibly-binary string into UTF-8b
   538     '''convert a local, possibly-binary string into UTF-8b
   506 
   539 
   507     This is intended as a generic method to preserve data when working
   540     This is intended as a generic method to preserve data when working
   556     while pos < l:
   589     while pos < l:
   557         try:
   590         try:
   558             c = getutf8char(s, pos)
   591             c = getutf8char(s, pos)
   559             if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
   592             if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
   560                 # have to re-escape existing U+DCxx characters
   593                 # have to re-escape existing U+DCxx characters
   561                 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
   594                 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
   562                 pos += 1
   595                 pos += 1
   563             else:
   596             else:
   564                 pos += len(c)
   597                 pos += len(c)
   565         except UnicodeDecodeError:
   598         except UnicodeDecodeError:
   566             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
   599             c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
   567             pos += 1
   600             pos += 1
   568         r += c
   601         r += c
   569     return r
   602     return r
       
   603 
   570 
   604 
   571 def fromutf8b(s):
   605 def fromutf8b(s):
   572     '''Given a UTF-8b string, return a local, possibly-binary string.
   606     '''Given a UTF-8b string, return a local, possibly-binary string.
   573 
   607 
   574     return the original binary string. This
   608     return the original binary string. This
   609     while pos < l:
   643     while pos < l:
   610         c = getutf8char(s, pos)
   644         c = getutf8char(s, pos)
   611         pos += len(c)
   645         pos += len(c)
   612         # unescape U+DCxx characters
   646         # unescape U+DCxx characters
   613         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
   647         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
   614             c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
   648             c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
   615         r += c
   649         r += c
   616     return r
   650     return r