Mercurial > public > mercurial-scm > hg
comparison mercurial/utils/stringutil.py @ 37083:f99d64e8a4e4
stringutil: move generic string helpers to new module
Per https://phab.mercurial-scm.org/D2903#46738
URL and file path functions are left in place since they are big enough to
warrant separate modules.
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Thu, 22 Mar 2018 21:19:31 +0900 |
parents | mercurial/util.py@1a1d1c44b570 |
children | f8e1f48de118 |
comparison
equal
deleted
inserted
replaced
37082:1a1d1c44b570 | 37083:f99d64e8a4e4 |
---|---|
1 # stringutil.py - utility for generic string formatting, parsing, etc. | |
2 # | |
3 # Copyright 2005 K. Thananchayan <thananck@yahoo.com> | |
4 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com> | |
5 # Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com> | |
6 # | |
7 # This software may be used and distributed according to the terms of the | |
8 # GNU General Public License version 2 or any later version. | |
9 | |
10 from __future__ import absolute_import | |
11 | |
12 import codecs | |
13 import re as remod | |
14 import textwrap | |
15 | |
16 from ..i18n import _ | |
17 | |
18 from .. import ( | |
19 encoding, | |
20 error, | |
21 pycompat, | |
22 ) | |
23 | |
# Replacement table used by escapedata(): every byte value maps to its
# ``\xNN`` hex form, except that backslash, CR and LF get the short escapes
# ``\\``, ``\r`` and ``\n``.
_DATA_ESCAPE_MAP = {
    pycompat.bytechr(byte): br'\x%02x' % byte for byte in range(256)
}
_DATA_ESCAPE_MAP[b'\\'] = b'\\\\'
_DATA_ESCAPE_MAP[b'\r'] = br'\r'
_DATA_ESCAPE_MAP[b'\n'] = br'\n'

# Bytes that get escaped: 0x00-0x08, 0x0a-0x1f, backslash and 0x7f-0xff.
# Tab (0x09) and the remaining printable ASCII range are not matched.
_DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]')
31 | |
def escapedata(s):
    """Return ``s`` with every byte matched by _DATA_ESCAPE_RE replaced by
    its entry in _DATA_ESCAPE_MAP.

    A ``bytearray`` argument is converted to ``bytes`` before substitution.
    """
    data = bytes(s) if isinstance(s, bytearray) else s

    def _escape(found):
        # each match is a single byte; look up its escaped spelling
        return _DATA_ESCAPE_MAP[found.group(0)]

    return _DATA_ESCAPE_RE.sub(_escape, data)
37 | |
def binary(s):
    """Tell whether ``s`` is binary data, i.e. contains a NUL character.

    Empty or otherwise falsy input is reported as not binary.
    """
    if not s:
        return False
    return '\0' in s
41 | |
def stringmatcher(pattern, casesensitive=True):
    """
    accepts a string, possibly starting with 're:' or 'literal:' prefix.
    returns the matcher name, pattern, and matcher function.
    missing or unknown prefixes are treated as literal matches.

    The returned tuple is ``(kind, pattern, matcher)`` where ``kind`` is
    're' or 'literal', ``pattern`` has any recognized prefix stripped, and
    ``matcher`` is a one-argument callable. When ``casesensitive`` is false,
    regexes are compiled with the ignore-case flag and literals are compared
    after lowercasing both sides with ``encoding.lower()``.

    Raises ``error.ParseError`` if a 're:' pattern fails to compile.

    helper for tests:
    >>> def test(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
    >>> def itest(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])

    exact matching (no prefix):
    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
    ('literal', 'abcdefg', [False, False, True])

    regex matching ('re:' prefix)
    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
    ('re', 'a.+b', [False, False, True])

    force exact matches ('literal:' prefix)
    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
    ('literal', 're:foobar', [False, True])

    unknown prefixes are ignored and treated as literals
    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
    ('literal', 'foo:bar', [False, False, True])

    case insensitive regex matches
    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
    ('re', 'A.+b', [False, False, True])

    case insensitive literal matches
    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
    ('literal', 'ABCDEFG', [False, False, True])
    """
    if pattern.startswith('re:'):
        pattern = pattern[3:]
        flags = 0
        if not casesensitive:
            flags = remod.I
        try:
            regex = remod.compile(pattern, flags)
        except remod.error as e:
            # Convert the exception with forcebytestr() before formatting:
            # a bytes message cannot take an exception object through %s,
            # which would mask the ParseError with a TypeError.
            raise error.ParseError(_('invalid regular expression: %s')
                                   % forcebytestr(e))
        return 're', pattern, regex.search
    elif pattern.startswith('literal:'):
        pattern = pattern[8:]

    # unknown or missing prefix: the whole pattern is compared literally
    match = pattern.__eq__

    if not casesensitive:
        ipat = encoding.lower(pattern)
        match = lambda s: ipat == encoding.lower(s)
    return 'literal', pattern, match
100 | |
def shortuser(user):
    """Return a short representation of a user name or email address.

    Applied in order: drop everything from the first '@', drop everything
    up to and including the first '<', then cut at the first space and at
    the first '.'.
    """
    user = user.partition('@')[0]
    before, bracket, after = user.partition('<')
    if bracket:
        user = after
    user = user.partition(' ')[0]
    return user.partition('.')[0]
116 | |
def emailuser(user):
    """Return the user portion of an email address.

    Everything from the first '@' onwards is dropped, then everything up to
    and including the first '<' in what remains.
    """
    local = user.partition('@')[0]
    before, bracket, after = local.partition('<')
    return after if bracket else local
126 | |
def email(author):
    '''Extract the email part of an author string.

    Returns the text after the first '<' (or from the start if there is
    none) up to the first '>' (or to the end if there is none).
    '''
    start = author.find('<') + 1
    end = author.find('>')
    if end < 0:
        return author[start:]
    return author[start:end]
133 | |
def ellipsis(text, maxlength=400):
    """Shorten text to at most maxlength display columns (400 by default).

    Delegates to ``encoding.trim()`` with '...' as the ellipsis marker.
    """
    trimmed = encoding.trim(text, maxlength, ellipsis='...')
    return trimmed
137 | |
def escapestr(s):
    """Return ``s`` with backslash escapes applied.

    Uses the function underlying s.encode('string_escape') directly, for
    Python 3 compatibility.
    """
    escaped, consumed = codecs.escape_encode(s)
    return escaped
142 | |
def unescapestr(s):
    """Return ``s`` with backslash escape sequences decoded."""
    decoded, consumed = codecs.escape_decode(s)
    return decoded
145 | |
def forcebytestr(obj):
    """Portably turn an arbitrary object (e.g. an exception) into a byte
    string.

    ``pycompat.bytestr(obj)`` is tried first; if it raises
    UnicodeEncodeError the object is stringified with ``str()`` and passed
    through ``encoding.strtolocal()``, which may be lossy.
    """
    try:
        converted = pycompat.bytestr(obj)
    except UnicodeEncodeError:
        # non-ascii string, may be lossy
        localized = encoding.strtolocal(str(obj))
        converted = pycompat.bytestr(localized)
    return converted
154 | |
def uirepr(s):
    """Return the byte repr of ``s`` with doubled backslashes collapsed.

    This avoids the double backslashes that repr() would otherwise show in
    Windows paths.
    """
    quoted = pycompat.byterepr(pycompat.bytestr(s))
    return quoted.replace(b'\\\\', b'\\')
158 | |
# The wrapper class is only defined the first time it is needed.
def _MBTextWrapper(**kwargs):
    """Return a display-width-aware TextWrapper built from ``kwargs``.

    On the first call the subclass is defined and the module-level name
    ``_MBTextWrapper`` is rebound to it, so later calls instantiate the
    class directly.
    """
    class tw(textwrap.TextWrapper):
        """
        TextWrapper variant that measures text in terminal columns.

        Neither the number of 'bytes' in any encoding nor the number of
        'characters' is appropriate for calculating how many terminal
        columns a string occupies.

        The stock TextWrapper uses the built-in 'len()' directly, so its
        methods are overridden here to use the width of each character.

        In addition, characters classified as 'ambiguous' width are
        treated as wide in East Asian areas, but as narrow elsewhere.

        This requires a user decision to determine the width of such
        characters.
        """
        def _cutdown(self, ucstr, space_left):
            # split ucstr just before the first character whose cumulative
            # column width exceeds space_left
            colwidth = encoding.ucolwidth
            used = 0
            for pos, char in enumerate(ucstr):
                used += colwidth(char)
                if used > space_left:
                    return (ucstr[:pos], ucstr[pos:])
            return ucstr, ''

        # overriding of base class
        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
            if self.break_long_words:
                # always leave room for at least one column
                space_left = max(width - cur_len, 1)
                head, tail = self._cutdown(reversed_chunks[-1], space_left)
                cur_line.append(head)
                reversed_chunks[-1] = tail
            elif not cur_line:
                cur_line.append(reversed_chunks.pop())

        # this overriding code is imported from TextWrapper of Python 2.6
        # to calculate columns of string by 'encoding.ucolwidth()'
        def _wrap_chunks(self, chunks):
            colwidth = encoding.ucolwidth

            wrapped = []
            if self.width <= 0:
                raise ValueError("invalid width %r (must be > 0)" % self.width)

            # Reverse the chunks so they can be popped cheaply off the end,
            # stack-style.
            chunks.reverse()

            while chunks:

                # Chunks collected for the line being built, and the
                # number of columns they occupy so far.
                current = []
                used = 0

                # Pick the prefix for this line: the first line gets
                # initial_indent, every later one subsequent_indent.
                if wrapped:
                    indent = self.subsequent_indent
                else:
                    indent = self.initial_indent

                # Columns available on this line.
                width = self.width - len(indent)

                # A leading whitespace chunk is dropped, except at the very
                # beginning of the text (i.e. before any line was emitted).
                if (wrapped and self.drop_whitespace
                    and chunks[-1].strip() == r''):
                    del chunks[-1]

                while chunks:
                    chunkwidth = colwidth(chunks[-1])

                    # Stop as soon as the next chunk no longer fits.
                    if used + chunkwidth > width:
                        break

                    # Otherwise it fits: move it onto the current line.
                    current.append(chunks.pop())
                    used += chunkwidth

                # The line is full and the next chunk is too wide to fit on
                # *any* line (not just this one).
                if chunks and colwidth(chunks[-1]) > width:
                    self._handle_long_word(chunks, current, used, width)

                # Drop a trailing all-whitespace chunk.
                if (self.drop_whitespace and
                    current and current[-1].strip() == r''):
                    del current[-1]

                # Join the collected chunks into one string and record it
                # in the list of result lines.
                if current:
                    wrapped.append(indent + r''.join(current))

            return wrapped

    global _MBTextWrapper
    _MBTextWrapper = tw
    return tw(**kwargs)
262 | |
def wrap(line, width, initindent='', hangindent=''):
    """Wrap ``line`` to ``width`` columns and return the encoded result.

    ``initindent`` prefixes the first output line and ``hangindent`` the
    following ones. All three inputs are decoded with ``encoding.encoding``
    and ``encoding.encodingmode`` before wrapping, and the filled text is
    encoded back with ``encoding.encoding``.
    """
    maxindent = max(len(hangindent), len(initindent))
    if width <= maxindent:
        # adjust for weird terminal size
        width = max(78, maxindent + 1)

    codec = pycompat.sysstr(encoding.encoding)
    errmode = pycompat.sysstr(encoding.encodingmode)
    uline = line.decode(codec, errmode)
    uinit = initindent.decode(codec, errmode)
    uhang = hangindent.decode(codec, errmode)

    wrapper = _MBTextWrapper(width=width,
                             initial_indent=uinit,
                             subsequent_indent=uhang)
    filled = wrapper.fill(uline)
    return filled.encode(codec)
278 | |
279 _booleans = {'1': True, 'yes': True, 'true': True, 'on': True, 'always': True, | |
280 '0': False, 'no': False, 'false': False, 'off': False, | |
281 'never': False} | |
282 | |
283 def parsebool(s): | |
284 """Parse s into a boolean. | |
285 | |
286 If s is not a valid boolean, returns None. | |
287 """ | |
288 return _booleans.get(s.lower(), None) |