|
1 # stringutil.py - utility for generic string formatting, parsing, etc. |
|
2 # |
|
3 # Copyright 2005 K. Thananchayan <thananck@yahoo.com> |
|
4 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com> |
|
5 # Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com> |
|
6 # |
|
7 # This software may be used and distributed according to the terms of the |
|
8 # GNU General Public License version 2 or any later version. |
|
9 |
|
10 from __future__ import absolute_import |
|
11 |
|
12 import codecs |
|
13 import re as remod |
|
14 import textwrap |
|
15 |
|
16 from ..i18n import _ |
|
17 |
|
18 from .. import ( |
|
19 encoding, |
|
20 error, |
|
21 pycompat, |
|
22 ) |
|
23 |
|
# Lookup table used by escapedata(): every single byte maps to its
# ``\xNN`` spelling by default, with friendlier spellings for backslash,
# carriage return and newline.
_DATA_ESCAPE_MAP = dict(
    (pycompat.bytechr(code), br'\x%02x' % code) for code in range(256))
_DATA_ESCAPE_MAP[b'\\'] = b'\\\\'
_DATA_ESCAPE_MAP[b'\r'] = br'\r'
_DATA_ESCAPE_MAP[b'\n'] = br'\n'

# Bytes that escapedata() rewrites: control characters other than tab
# (\x09), the backslash itself, DEL and everything above it.
_DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]')
|
31 |
|
def escapedata(s):
    """Return ``s`` with every byte matched by _DATA_ESCAPE_RE replaced by
    its entry in _DATA_ESCAPE_MAP.

    A ``bytearray`` argument is converted to ``bytes`` first.
    """
    if isinstance(s, bytearray):
        s = bytes(s)

    def _escapeone(found):
        # each match is exactly one byte, which is a key of the table
        return _DATA_ESCAPE_MAP[found.group(0)]

    return _DATA_ESCAPE_RE.sub(_escapeone, s)
|
37 |
|
def binary(s):
    """return true if a string is binary data

    A string counts as binary when it is non-empty and contains a NUL
    character; empty or None input is never binary.
    """
    if not s:
        return False
    return '\0' in s
|
41 |
|
def stringmatcher(pattern, casesensitive=True):
    """
    accepts a string, possibly starting with 're:' or 'literal:' prefix.
    returns the matcher name, pattern, and matcher function.
    missing or unknown prefixes are treated as literal matches.

    the returned pattern has the recognized prefix stripped. for 're' the
    matcher is the compiled regex's search method; for 'literal' it is an
    equality test (compared through encoding.lower() on both sides when
    casesensitive is false). an invalid regular expression raises
    error.ParseError.

    helper for tests:
    >>> def test(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
    >>> def itest(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])

    exact matching (no prefix):
    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
    ('literal', 'abcdefg', [False, False, True])

    regex matching ('re:' prefix)
    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
    ('re', 'a.+b', [False, False, True])

    force exact matches ('literal:' prefix)
    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
    ('literal', 're:foobar', [False, True])

    unknown prefixes are ignored and treated as literals
    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
    ('literal', 'foo:bar', [False, False, True])

    case insensitive regex matches
    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
    ('re', 'A.+b', [False, False, True])

    case insensitive literal matches
    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
    ('literal', 'ABCDEFG', [False, False, True])
    """
    if pattern.startswith('re:'):
        pattern = pattern[3:]
        flags = 0 if casesensitive else remod.I
        try:
            regex = remod.compile(pattern, flags)
        except remod.error as e:
            raise error.ParseError(_('invalid regular expression: %s')
                                   % e)
        return 're', pattern, regex.search

    if pattern.startswith('literal:'):
        pattern = pattern[8:]

    if casesensitive:
        # Compare with ``==`` instead of handing out the bound
        # ``pattern.__eq__``: the dunder returns NotImplemented (which is
        # truthy) when the operand is of a different type, so such inputs
        # would be reported as matches.
        match = lambda s: pattern == s
    else:
        ipat = encoding.lower(pattern)
        match = lambda s: ipat == encoding.lower(s)
    return 'literal', pattern, match
|
100 |
|
def shortuser(user):
    """Return a short representation of a user name or email address.

    Drops everything from the first '@', then everything up to and
    including the first '<', then everything from the first space, and
    finally everything from the first '.'.
    """
    user = user.partition('@')[0]
    _before, bracket, inside = user.partition('<')
    if bracket:
        user = inside
    user = user.partition(' ')[0]
    return user.partition('.')[0]
|
116 |
|
def emailuser(user):
    """Return the user portion of an email address.

    Keeps what precedes the first '@' and, if that still contains a '<',
    only what follows the first '<'.
    """
    local = user.partition('@')[0]
    _before, bracket, inside = local.partition('<')
    return inside if bracket else local
|
126 |
|
def email(author):
    '''get email of author.

    Returns the text after the first '<' (or from the start when there is
    none) up to the first '>' (or to the end when there is none).
    '''
    start = author.find('<') + 1
    end = author.find('>')
    if end < 0:
        return author[start:]
    return author[start:end]
|
133 |
|
def ellipsis(text, maxlength=400):
    """Trim string to at most maxlength (default: 400) columns in display.

    Delegates to encoding.trim(), passing '...' as the ellipsis marker.
    """
    marker = '...'
    return encoding.trim(text, maxlength, ellipsis=marker)
|
137 |
|
def escapestr(s):
    """Return ``s`` with backslash escapes applied by codecs.escape_encode."""
    # call underlying function of s.encode('string_escape') directly for
    # Python 3 compatibility
    escaped, _consumed = codecs.escape_encode(s)
    return escaped
|
142 |
|
def unescapestr(s):
    """Return ``s`` with backslash escapes decoded by codecs.escape_decode."""
    decoded, _consumed = codecs.escape_decode(s)
    return decoded
|
145 |
|
def forcebytestr(obj):
    """Portably format an arbitrary object (e.g. exception) into a byte
    string.

    pycompat.bytestr() is tried first; if it raises UnicodeEncodeError,
    str(obj) is passed through encoding.strtolocal() before conversion.
    """
    try:
        formatted = pycompat.bytestr(obj)
    except UnicodeEncodeError:
        # non-ascii string, may be lossy
        localized = encoding.strtolocal(str(obj))
        formatted = pycompat.bytestr(localized)
    return formatted
|
154 |
|
def uirepr(s):
    """Return the byte repr of ``s`` with doubled backslashes collapsed."""
    quoted = pycompat.byterepr(pycompat.bytestr(s))
    # Avoid double backslash in Windows path repr()
    return quoted.replace(b'\\\\', b'\\')
|
158 |
|
# delay import of textwrap
#
# The wrapper class is only built on the first call; that call then rebinds
# the module-level name ``_MBTextWrapper`` to the class itself, so later
# calls instantiate the class directly.
def _MBTextWrapper(**kwargs):
    class tw(textwrap.TextWrapper):
        """
        Extend TextWrapper for width-awareness.

        Neither number of 'bytes' in any encoding nor 'characters' is
        appropriate to calculate terminal columns for specified string.

        Original TextWrapper implementation uses built-in 'len()' directly,
        so overriding is needed to use width information of each characters.

        In addition, characters classified into 'ambiguous' width are
        treated as wide in East Asian area, but as narrow in other.

        This requires user decision to determine width of such characters.
        """
        def _cutdown(self, ucstr, space_left):
            """Split ``ucstr`` in two so the head fits in ``space_left``.

            Widths are accumulated per character with encoding.ucolwidth().
            Returns ``(head, tail)``; when the whole string fits, the tail
            is ''. The head is empty when the first character alone is
            wider than ``space_left``.
            """
            used = 0
            colwidth = encoding.ucolwidth
            for i, char in enumerate(ucstr):
                used += colwidth(char)
                if space_left < used:
                    return (ucstr[:i], ucstr[i:])
            return ucstr, ''

        # overriding of base class
        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
            """Deal with a chunk too wide to fit on any line.

            With break_long_words set, as much of the chunk as fits in the
            remaining space is moved onto ``cur_line`` and the rest is left
            on the stack. Otherwise the whole chunk is placed on the line,
            but only if the line is still empty.
            """
            # always leave room for at least one column
            space_left = max(width - cur_len, 1)

            if self.break_long_words:
                cut, res = self._cutdown(reversed_chunks[-1], space_left)
                cur_line.append(cut)
                reversed_chunks[-1] = res
            elif not cur_line:
                cur_line.append(reversed_chunks.pop())

        # this overriding code is imported from TextWrapper of Python 2.6
        # to calculate columns of string by 'encoding.ucolwidth()'
        def _wrap_chunks(self, chunks):
            """Group ``chunks`` into a list of indented lines.

            ``chunks`` is reversed in place and consumed. Raises ValueError
            when self.width is not positive.
            """
            colwidth = encoding.ucolwidth

            lines = []
            if self.width <= 0:
                raise ValueError("invalid width %r (must be > 0)" % self.width)

            # Arrange in reverse order so items can be efficiently popped
            # from a stack of chunks.
            chunks.reverse()

            while chunks:

                # Start the list of chunks that will make up the current line.
                # cur_len is just the length of all the chunks in cur_line.
                cur_line = []
                cur_len = 0

                # Figure out which static string will prefix this line.
                if lines:
                    indent = self.subsequent_indent
                else:
                    indent = self.initial_indent

                # Maximum width for this line.
                width = self.width - len(indent)

                # First chunk on line is whitespace -- drop it, unless this
                # is the very beginning of the text (i.e. no lines started yet).
                if self.drop_whitespace and chunks[-1].strip() == r'' and lines:
                    del chunks[-1]

                while chunks:
                    l = colwidth(chunks[-1])

                    # Can at least squeeze this chunk onto the current line.
                    if cur_len + l <= width:
                        cur_line.append(chunks.pop())
                        cur_len += l

                    # Nope, this line is full.
                    else:
                        break

                # The current line is full, and the next chunk is too big to
                # fit on *any* line (not just this one).
                if chunks and colwidth(chunks[-1]) > width:
                    self._handle_long_word(chunks, cur_line, cur_len, width)

                # If the last chunk on this line is all whitespace, drop it.
                if (self.drop_whitespace and
                    cur_line and cur_line[-1].strip() == r''):
                    del cur_line[-1]

                # Convert current line back to a string and store it in list
                # of all lines (return value).
                if cur_line:
                    lines.append(indent + r''.join(cur_line))

            return lines

    # Replace this factory with the class so it is only built once.
    global _MBTextWrapper
    _MBTextWrapper = tw
    return tw(**kwargs)
|
262 |
|
def wrap(line, width, initindent='', hangindent=''):
    """Wrap ``line`` to ``width`` columns and return the re-encoded result.

    ``initindent`` prefixes the first output line and ``hangindent`` the
    following ones. All three strings are decoded with encoding.encoding
    (error mode encoding.encodingmode), filled by _MBTextWrapper, and the
    result is encoded back with encoding.encoding.
    """
    widest = max(len(hangindent), len(initindent))
    if width <= widest:
        # adjust for weird terminal size
        width = max(78, widest + 1)

    codec = pycompat.sysstr(encoding.encoding)
    errmode = pycompat.sysstr(encoding.encodingmode)
    uline, uinit, uhang = [
        text.decode(codec, errmode)
        for text in (line, initindent, hangindent)
    ]

    wrapper = _MBTextWrapper(width=width,
                             initial_indent=uinit,
                             subsequent_indent=uhang)
    return wrapper.fill(uline).encode(codec)
|
278 |
|
279 _booleans = {'1': True, 'yes': True, 'true': True, 'on': True, 'always': True, |
|
280 '0': False, 'no': False, 'false': False, 'off': False, |
|
281 'never': False} |
|
282 |
|
283 def parsebool(s): |
|
284 """Parse s into a boolean. |
|
285 |
|
286 If s is not a valid boolean, returns None. |
|
287 """ |
|
288 return _booleans.get(s.lower(), None) |