mercurial/encoding.py
changeset 7948 de377b1a9a84
child 8225 46293a0c7e9f
equal deleted inserted replaced
7947:a454eeb1b827 7948:de377b1a9a84
       
     1 """
       
     2 encoding.py - character transcoding support for Mercurial
       
     3 
       
     4  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
       
     5 
       
     6 This software may be used and distributed according to the terms of
       
     7 the GNU General Public License version 2, incorporated herein by
       
     8 reference.
       
     9 """
       
    10 
       
    11 import sys, unicodedata, locale, os, error
       
    12 
       
    13 _encodingfixup = {'646': 'ascii', 'ANSI_X3.4-1968': 'ascii'}
       
    14 
       
    15 try:
       
    16     encoding = os.environ.get("HGENCODING")
       
    17     if sys.platform == 'darwin' and not encoding:
       
    18         # On darwin, getpreferredencoding ignores the locale environment and
       
    19         # always returns mac-roman. We override this if the environment is
       
    20         # not C (has been customized by the user).
       
    21         locale.setlocale(locale.LC_CTYPE, '')
       
    22         encoding = locale.getlocale()[1]
       
    23     if not encoding:
       
    24         encoding = locale.getpreferredencoding() or 'ascii'
       
    25         encoding = _encodingfixup.get(encoding, encoding)
       
    26 except locale.Error:
       
    27     encoding = 'ascii'
       
    28 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
       
    29 fallbackencoding = 'ISO-8859-1'
       
    30 
       
    31 def tolocal(s):
       
    32     """
       
    33     Convert a string from internal UTF-8 to local encoding
       
    34 
       
    35     All internal strings should be UTF-8 but some repos before the
       
    36     implementation of locale support may contain latin1 or possibly
       
    37     other character sets. We attempt to decode everything strictly
       
    38     using UTF-8, then Latin-1, and failing that, we use UTF-8 and
       
    39     replace unknown characters.
       
    40     """
       
    41     for e in ('UTF-8', fallbackencoding):
       
    42         try:
       
    43             u = s.decode(e) # attempt strict decoding
       
    44             return u.encode(encoding, "replace")
       
    45         except LookupError, k:
       
    46             raise error.Abort("%s, please check your locale settings" % k)
       
    47         except UnicodeDecodeError:
       
    48             pass
       
    49     u = s.decode("utf-8", "replace") # last ditch
       
    50     return u.encode(encoding, "replace")
       
    51 
       
    52 def fromlocal(s):
       
    53     """
       
    54     Convert a string from the local character encoding to UTF-8
       
    55 
       
    56     We attempt to decode strings using the encoding mode set by
       
    57     HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
       
    58     characters will cause an error message. Other modes include
       
    59     'replace', which replaces unknown characters with a special
       
    60     Unicode character, and 'ignore', which drops the character.
       
    61     """
       
    62     try:
       
    63         return s.decode(encoding, encodingmode).encode("utf-8")
       
    64     except UnicodeDecodeError, inst:
       
    65         sub = s[max(0, inst.start-10):inst.start+10]
       
    66         raise error.Abort("decoding near '%s': %s!" % (sub, inst))
       
    67     except LookupError, k:
       
    68         raise error.Abort("%s, please check your locale settings" % k)
       
    69 
       
    70 def colwidth(s):
       
    71     "Find the column width of a UTF-8 string for display"
       
    72     d = s.decode(encoding, 'replace')
       
    73     if hasattr(unicodedata, 'east_asian_width'):
       
    74         w = unicodedata.east_asian_width
       
    75         return sum([w(c) in 'WF' and 2 or 1 for c in d])
       
    76     return len(d)
       
    77