Mercurial > public > mercurial-scm > hg-stable
comparison mercurial/util.py @ 11297:d320e70442a5
replace Python standard textwrap by MBCS sensitive one for i18n text
Mercurial has a problem with text wrapping/filling in MBCS encoding
environments, because Python's standard 'textwrap' module cannot
handle such text correctly: it may split the byte sequence of a single
character across two lines.
According to the Unicode specification, "East Asian Width" classifies
characters into:
W(ide), N(arrow), F(ull-width), H(alf-width), A(mbiguous)
W/N/F/H can always be recognized as 2/1/2/1 bytes in a byte sequence,
but 'A' cannot. The size of 'A' depends on the language in which it is used.
The Unicode specification says:
"If the context (= language) cannot be established reliably, they
should be treated as narrow characters by default",
but many class 'A' characters are full-width, at least in a Japanese
environment.
So this patch always treats class 'A' characters as full-width, to
keep wrapping safe.
This patch focuses only on MBCS safety, not on wrapping that strictly
follows the writing/printing rules of each language.
The MBCS-sensitive textwrap class was originally implemented
by ITO Nobuaki <daydream.trippers@gmail.com>.
author | FUJIWARA Katsunori <foozy@lares.dti.ne.jp> |
---|---|
date | Sun, 06 Jun 2010 17:20:10 +0900 |
parents | 94b7b3a1ae1b |
children | c37f35d7f2f5 |
comparison
equal
deleted
inserted
replaced
11296:0054a328b98f | 11297:d320e70442a5 |
---|---|
14 """ | 14 """ |
15 | 15 |
16 from i18n import _ | 16 from i18n import _ |
17 import error, osutil, encoding | 17 import error, osutil, encoding |
18 import cStringIO, errno, re, shutil, sys, tempfile, traceback | 18 import cStringIO, errno, re, shutil, sys, tempfile, traceback |
19 import os, stat, time, calendar, textwrap, signal | 19 import os, stat, time, calendar, textwrap, unicodedata, signal |
20 import imp | 20 import imp |
21 | 21 |
22 # Python compatibility | 22 # Python compatibility |
23 | 23 |
24 def sha1(s): | 24 def sha1(s): |
1255 | 1255 |
1256 def uirepr(s): | 1256 def uirepr(s): |
1257 # Avoid double backslash in Windows path repr() | 1257 # Avoid double backslash in Windows path repr() |
1258 return repr(s).replace('\\\\', '\\') | 1258 return repr(s).replace('\\\\', '\\') |
1259 | 1259 |
1260 def wrap(line, hangindent, width=None): | 1260 #### naming convention of below implementation follows 'textwrap' module |
1261 | |
1262 class MBTextWrapper(textwrap.TextWrapper): | |
1263 def __init__(self, **kwargs): | |
1264 textwrap.TextWrapper.__init__(self, **kwargs) | |
1265 | |
1266 def _cutdown(self, str, space_left): | |
1267 l = 0 | |
1268 ucstr = unicode(str, encoding.encoding) | |
1269 w = unicodedata.east_asian_width | |
1270 for i in xrange(len(ucstr)): | |
1271 l += w(ucstr[i]) in 'WFA' and 2 or 1 | |
1272 if space_left < l: | |
1273 return (ucstr[:i].encode(encoding.encoding), | |
1274 ucstr[i:].encode(encoding.encoding)) | |
1275 return str, '' | |
1276 | |
1277 # ---------------------------------------- | |
1278 # overriding of base class | |
1279 | |
1280 def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): | |
1281 space_left = max(width - cur_len, 1) | |
1282 | |
1283 if self.break_long_words: | |
1284 cut, res = self._cutdown(reversed_chunks[-1], space_left) | |
1285 cur_line.append(cut) | |
1286 reversed_chunks[-1] = res | |
1287 elif not cur_line: | |
1288 cur_line.append(reversed_chunks.pop()) | |
1289 | |
1290 #### naming convention of above implementation follows 'textwrap' module | |
1291 | |
1292 def wrap(line, width=None, initindent='', hangindent=''): | |
1261 if width is None: | 1293 if width is None: |
1262 width = termwidth() - 2 | 1294 width = termwidth() - 2 |
1263 if width <= hangindent: | 1295 maxindent = max(len(hangindent), len(initindent)) |
1296 if width <= maxindent: | |
1264 # adjust for weird terminal size | 1297 # adjust for weird terminal size |
1265 width = max(78, hangindent + 1) | 1298 width = max(78, maxindent + 1) |
1266 padding = '\n' + ' ' * hangindent | 1299 wrapper = MBTextWrapper(width=width, |
1267 # To avoid corrupting multi-byte characters in line, we must wrap | 1300 initial_indent=initindent, |
1268 # a Unicode string instead of a bytestring. | 1301 subsequent_indent=hangindent) |
1269 try: | 1302 return wrapper.fill(line) |
1270 u = line.decode(encoding.encoding) | |
1271 w = padding.join(textwrap.wrap(u, width=width - hangindent)) | |
1272 return w.encode(encoding.encoding) | |
1273 except UnicodeDecodeError: | |
1274 return padding.join(textwrap.wrap(line, width=width - hangindent)) | |
1275 | 1303 |
1276 def iterlines(iterator): | 1304 def iterlines(iterator): |
1277 for chunk in iterator: | 1305 for chunk in iterator: |
1278 for line in chunk.splitlines(): | 1306 for line in chunk.splitlines(): |
1279 yield line | 1307 yield line |