Mercurial > public > mercurial-scm > hg-stable
comparison mercurial/encoding.py @ 13046:7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
This allows UTF-8 strings to losslessly round-trip through Mercurial
author | Matt Mackall <mpm@selenic.com> |
---|---|
date | Wed, 24 Nov 2010 15:38:52 -0600 |
parents | eddc20306ab6 |
children | 120eccaaa522 |
comparison
equal
deleted
inserted
replaced
13045:1b1cbc246377 | 13046:7cc4263e07a9 |
---|---|
46 except locale.Error: | 46 except locale.Error: |
47 encoding = 'ascii' | 47 encoding = 'ascii' |
48 encodingmode = os.environ.get("HGENCODINGMODE", "strict") | 48 encodingmode = os.environ.get("HGENCODINGMODE", "strict") |
49 fallbackencoding = 'ISO-8859-1' | 49 fallbackencoding = 'ISO-8859-1' |
50 | 50 |
51 class localstr(str): | |
52 '''This class allows strings that are unmodified to be | |
53 round-tripped to the local encoding and back''' | |
54 def __new__(cls, u, l): | |
55 s = str.__new__(cls, l) | |
56 s._utf8 = u | |
57 return s | |
58 def __hash__(self): | |
59 return hash(self._utf8) # avoid collisions in local string space | |
60 | |
51 def tolocal(s): | 61 def tolocal(s): |
52 """ | 62 """ |
53 Convert a string from internal UTF-8 to local encoding | 63 Convert a string from internal UTF-8 to local encoding |
54 | 64 |
55 All internal strings should be UTF-8 but some repos before the | 65 All internal strings should be UTF-8 but some repos before the |
56 implementation of locale support may contain latin1 or possibly | 66 implementation of locale support may contain latin1 or possibly |
57 other character sets. We attempt to decode everything strictly | 67 other character sets. We attempt to decode everything strictly |
58 using UTF-8, then Latin-1, and failing that, we use UTF-8 and | 68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and |
59 replace unknown characters. | 69 replace unknown characters. |
70 | |
71 The localstr class is used to cache the known UTF-8 encoding of | |
72 strings next to their local representation to allow lossless | |
73 round-trip conversion back to UTF-8. | |
74 | |
75 >>> u = 'foo: \\xc3\\xa4' # utf-8 | |
76 >>> l = tolocal(u) | |
77 >>> l | |
78 'foo: ?' | |
79 >>> fromlocal(l) | |
80 'foo: \\xc3\\xa4' | |
81 >>> u2 = 'foo: \\xc3\\xa1' | |
82 >>> d = { l: 1, tolocal(u2): 2 } | |
83 >>> d # no collision | |
84 {'foo: ?': 1, 'foo: ?': 2} | |
85 >>> 'foo: ?' in d | |
86 False | |
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback | |
88 >>> l = tolocal(l1) | |
89 >>> l | |
90 'foo: ?' | |
91 >>> fromlocal(l) # magically in utf-8 | |
92 'foo: \\xc3\\xa4' | |
60 """ | 93 """ |
94 | |
61 for e in ('UTF-8', fallbackencoding): | 95 for e in ('UTF-8', fallbackencoding): |
62 try: | 96 try: |
63 u = s.decode(e) # attempt strict decoding | 97 u = s.decode(e) # attempt strict decoding |
64 return u.encode(encoding, "replace") | 98 if u == 'UTF-8': |
99 return localstr(s, u.encode(encoding, "replace")) | |
100 else: | |
101 return localstr(u.encode('UTF-8'), | |
102 u.encode(encoding, "replace")) | |
65 except LookupError, k: | 103 except LookupError, k: |
66 raise error.Abort("%s, please check your locale settings" % k) | 104 raise error.Abort("%s, please check your locale settings" % k) |
67 except UnicodeDecodeError: | 105 except UnicodeDecodeError: |
68 pass | 106 pass |
69 u = s.decode("utf-8", "replace") # last ditch | 107 u = s.decode("utf-8", "replace") # last ditch |
70 return u.encode(encoding, "replace") | 108 return u.encode(encoding, "replace") # can't round-trip |
71 | 109 |
72 def fromlocal(s): | 110 def fromlocal(s): |
73 """ | 111 """ |
74 Convert a string from the local character encoding to UTF-8 | 112 Convert a string from the local character encoding to UTF-8 |
75 | 113 |
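76 We attempt to decode strings using the encoding mode set by | 114 We attempt to decode strings using the encoding mode set by |
77 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown | 115 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown |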
78 characters will cause an error message. Other modes include | 116 characters will cause an error message. Other modes include |
79 'replace', which replaces unknown characters with a special | 117 'replace', which replaces unknown characters with a special |
80 Unicode character, and 'ignore', which drops the character. | 118 Unicode character, and 'ignore', which drops the character. |
81 """ | 119 """ |
120 | |
121 # can we do a lossless round-trip? | |
122 if isinstance(s, localstr): | |
123 return s._utf8 | |
124 | |
82 try: | 125 try: |
83 return s.decode(encoding, encodingmode).encode("utf-8") | 126 return s.decode(encoding, encodingmode).encode("utf-8") |
84 except UnicodeDecodeError, inst: | 127 except UnicodeDecodeError, inst: |
85 sub = s[max(0, inst.start - 10):inst.start + 10] | 128 sub = s[max(0, inst.start - 10):inst.start + 10] |
86 raise error.Abort("decoding near '%s': %s!" % (sub, inst)) | 129 raise error.Abort("decoding near '%s': %s!" % (sub, inst)) |