Mercurial > public > mercurial-scm > hg-stable
comparison mercurial/util.py @ 3770:f96c158ea3a3
Add functions for transcoding and manipulating multibyte strings
author | Matt Mackall <mpm@selenic.com> |
---|---|
date | Sun, 03 Dec 2006 16:16:33 -0600 |
parents | 96095d9ff1f8 |
children | 1427949b8f80 |
comparison
equal
deleted
inserted
replaced
3769:96095d9ff1f8 | 3770:f96c158ea3a3 |
---|---|
16 from demandload import * | 16 from demandload import * |
17 demandload(globals(), "cStringIO errno getpass popen2 re shutil sys tempfile") | 17 demandload(globals(), "cStringIO errno getpass popen2 re shutil sys tempfile") |
18 demandload(globals(), "os threading time calendar ConfigParser locale") | 18 demandload(globals(), "os threading time calendar ConfigParser locale") |
19 | 19 |
20 _encoding = os.environ.get("HGENCODING") or locale.getpreferredencoding() | 20 _encoding = os.environ.get("HGENCODING") or locale.getpreferredencoding() |
21 _encodingmode = os.environ.get("HGENCODINGMODE", "strict") | |
22 | |
def tolocal(s):
    """
    Transcode an internally stored string to the local encoding

    Internal strings are expected to be UTF-8, but repositories that
    predate locale support may hold Latin-1 or other character sets.
    Strict UTF-8 decoding is tried first, then strict Latin-1; if
    neither succeeds the string is decoded as UTF-8 with unknown
    characters replaced. Characters that the local encoding cannot
    represent are replaced on output.
    """
    for codec in ("utf-8", "latin1"):
        try:
            # decode() is strict by default; encoding is always lossy-safe
            return s.decode(codec).encode(_encoding, "replace")
        except UnicodeDecodeError:
            continue
    # last ditch: lossy UTF-8 decoding
    fallback = s.decode("utf-8", "replace")
    return fallback.encode(_encoding, "replace")
41 | |
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.

    Raises Abort, quoting the text surrounding the offending
    position, when the string cannot be decoded.
    """
    try:
        return s.decode(_encoding, _encodingmode).encode("utf-8")
    except UnicodeDecodeError:
        # fetched via exc_info() so the handler parses on any Python version
        inst = sys.exc_info()[1]
        # show up to 10 positions of context on either side of the failure
        sub = s[max(0, inst.start-10):inst.start+10]
        raise Abort("decoding near '%s': %s!\n" % (sub, inst))
57 | |
def locallen(s):
    """
    Count the characters (rather than bytes) in a local string

    Bytes that cannot be decoded in the local encoding are replaced
    instead of raising an error.
    """
    chars = s.decode(_encoding, "replace")
    return len(chars)
61 | |
def localsub(s, a, b=None):
    """
    Slice a local string by character position instead of by byte

    The string is decoded from the local encoding, sliced, and then
    re-encoded. When both a and b are given the result is characters
    a up to (but not including) b; when only a is given the result
    is the first a characters.

    Decoding and encoding use the mode read from HGENCODINGMODE.
    Raises Abort, quoting the text surrounding the offending
    position, when the string cannot be decoded.
    """
    try:
        u = s.decode(_encoding, _encodingmode)
        if b is not None:
            u = u[a:b]
        else:
            u = u[:a]
        return u.encode(_encoding, _encodingmode)
    except UnicodeDecodeError:
        # fetched via exc_info() so the handler parses on any Python version
        inst = sys.exc_info()[1]
        # must be a slice: indexing with a tuple raises TypeError and
        # would hide the decoding error we are trying to report
        sub = s[max(0, inst.start-10):inst.start+10]
        raise Abort("decoding near '%s': %s!\n" % (sub, inst))
21 | 73 |
22 # used by parsedate | 74 # used by parsedate |
23 defaultdateformats = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M', | 75 defaultdateformats = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M', |
24 '%a %b %d %H:%M:%S %Y') | 76 '%a %b %d %H:%M:%S %Y') |
25 | 77 |