comparison mercurial/utils/stringutil.py @ 37083:f99d64e8a4e4

stringutil: move generic string helpers to new module. Per https://phab.mercurial-scm.org/D2903#46738, URL and file path functions are left since they are big enough to make separate modules.
author Yuya Nishihara <yuya@tcha.org>
date Thu, 22 Mar 2018 21:19:31 +0900
parents mercurial/util.py@1a1d1c44b570
children f8e1f48de118
comparison
equal deleted inserted replaced
37082:1a1d1c44b570 37083:f99d64e8a4e4
1 # stringutil.py - utility for generic string formatting, parsing, etc.
2 #
3 # Copyright 2005 K. Thananchayan <thananck@yahoo.com>
4 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
5 # Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com>
6 #
7 # This software may be used and distributed according to the terms of the
8 # GNU General Public License version 2 or any later version.
9
10 from __future__ import absolute_import
11
12 import codecs
13 import re as remod
14 import textwrap
15
16 from ..i18n import _
17
18 from .. import (
19 encoding,
20 error,
21 pycompat,
22 )
23
# Lookup table used by escapedata(): every possible byte value maps to its
# ``\xNN`` spelling, then backslash, CR and LF get their conventional
# short escapes instead.
_DATA_ESCAPE_MAP = dict((pycompat.bytechr(code), br'\x%02x' % code)
                        for code in range(256))
_DATA_ESCAPE_MAP[b'\\'] = b'\\\\'
_DATA_ESCAPE_MAP[b'\r'] = br'\r'
_DATA_ESCAPE_MAP[b'\n'] = br'\n'
# Matches the bytes that need escaping: control characters other than
# tab (0x09), the backslash itself, DEL (0x7f) and every byte above it.
_DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]')
31
def escapedata(s):
    """Return ``s`` with every byte matched by _DATA_ESCAPE_RE replaced by
    its entry in _DATA_ESCAPE_MAP.

    A ``bytearray`` argument is converted to ``bytes`` before substitution.
    """
    data = bytes(s) if isinstance(s, bytearray) else s

    def _escape(found):
        # each match is a single byte; look up its escaped spelling
        return _DATA_ESCAPE_MAP[found.group(0)]

    return _DATA_ESCAPE_RE.sub(_escape, data)
37
def binary(s):
    """Return True if ``s`` is binary data, i.e. it contains a NUL.

    Empty input and ``None`` are never considered binary.
    """
    if not s:
        return False
    # Search for a NUL of the same type as ``s``.  Testing a text literal
    # against a bytes-like object raises TypeError on Python 3 instead of
    # answering the question.
    nul = u'\0' if isinstance(s, type(u'')) else b'\0'
    return nul in s
41
def stringmatcher(pattern, casesensitive=True):
    """
    accepts a string, possibly starting with 're:' or 'literal:' prefix.
    returns the matcher name, pattern, and matcher function.
    missing or unknown prefixes are treated as literal matches.

    The returned tuple is ``(kind, pattern, matcher)`` where ``kind`` is
    're' or 'literal', ``pattern`` has the recognised prefix stripped, and
    ``matcher`` is a one-argument callable.  For 're' it is the compiled
    regex's ``search``; for 'literal' it is an equality test (done on
    ``encoding.lower()`` of both sides when ``casesensitive`` is false).

    Raises error.ParseError if a 're:' pattern is not a valid regular
    expression.

    helper for tests:
    >>> def test(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
    >>> def itest(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])

    exact matching (no prefix):
    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
    ('literal', 'abcdefg', [False, False, True])

    regex matching ('re:' prefix)
    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
    ('re', 'a.+b', [False, False, True])

    force exact matches ('literal:' prefix)
    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
    ('literal', 're:foobar', [False, True])

    unknown prefixes are ignored and treated as literals
    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
    ('literal', 'foo:bar', [False, False, True])

    case insensitive regex matches
    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
    ('re', 'A.+b', [False, False, True])

    case insensitive literal matches
    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
    ('literal', 'ABCDEFG', [False, False, True])
    """
    if pattern.startswith('re:'):
        pattern = pattern[3:]
        flags = 0
        if not casesensitive:
            flags = remod.I
        try:
            regex = remod.compile(pattern, flags)
        except remod.error as e:
            # Render the exception through forcebytestr(), this module's
            # portable exception-to-byte-string helper, rather than
            # interpolating the exception object directly.
            # NOTE(review): assumes _() yields a byte string here -- confirm.
            raise error.ParseError(_('invalid regular expression: %s')
                                   % forcebytestr(e))
        return 're', pattern, regex.search
    elif pattern.startswith('literal:'):
        pattern = pattern[8:]

    if casesensitive:
        # Use ``==`` instead of handing out the bound ``pattern.__eq__``:
        # the raw method returns NotImplemented (which is truthy) when the
        # other operand has a different type, so it would report a match.
        match = lambda s: pattern == s
    else:
        ipat = encoding.lower(pattern)
        match = lambda s: ipat == encoding.lower(s)
    return 'literal', pattern, match
100
def shortuser(user):
    """Return a short representation of a user name or email address.

    Keeps what precedes the first '@', then what follows the first '<'
    (if any), then cuts at the first space and finally at the first '.'.
    """
    user = user.partition('@')[0]
    _before, bracket, rest = user.partition('<')
    if bracket:
        user = rest
    # trim at the first space, then at the first dot of what remains
    for sep in (' ', '.'):
        user = user.partition(sep)[0]
    return user
116
def emailuser(user):
    """Return the user portion of an email address.

    That is the text before the first '@', with anything up to and
    including the first '<' removed.
    """
    local = user.partition('@')[0]
    marker = local.find('<')
    return local if marker < 0 else local[marker + 1:]
126
def email(author):
    """Get the email of an author.

    Returns the text between the first '<' and the first '>'.  A missing
    '<' means "from the start", a missing '>' means "to the end".
    """
    # find() yields -1 when '<' is absent, so +1 conveniently gives 0
    start = author.find('<') + 1
    end = author.find('>')
    if end < 0:
        return author[start:]
    return author[start:end]
133
def ellipsis(text, maxlength=400):
    """Trim ``text`` to at most ``maxlength`` display columns (400 by
    default), delegating to ``encoding.trim`` with '...' as the ellipsis
    marker."""
    trimmed = encoding.trim(text, maxlength, ellipsis='...')
    return trimmed
137
def escapestr(s):
    """Return ``s`` with special bytes backslash-escaped."""
    # s.encode('string_escape') is unavailable on Python 3, so call the
    # codec function behind it directly; it returns (result, consumed).
    escaped, _consumed = codecs.escape_encode(s)
    return escaped
142
def unescapestr(s):
    """Return ``s`` with backslash escapes decoded (inverse of escapestr)."""
    # escape_decode returns (result, consumed); only the result is wanted
    decoded, _consumed = codecs.escape_decode(s)
    return decoded
145
def forcebytestr(obj):
    """Portably format an arbitrary object (e.g. exception) into a byte
    string.

    ``pycompat.bytestr(obj)`` is tried first; if that raises
    UnicodeEncodeError the object is rendered with ``str()`` and passed
    through ``encoding.strtolocal()`` before conversion.
    """
    try:
        converted = pycompat.bytestr(obj)
    except UnicodeEncodeError:
        # non-ascii string, may be lossy
        localized = encoding.strtolocal(str(obj))
        converted = pycompat.bytestr(localized)
    return converted
154
def uirepr(s):
    """Return the byte repr of ``s`` with doubled backslashes collapsed
    to single ones."""
    quoted = pycompat.byterepr(pycompat.bytestr(s))
    # Avoid double backslash in Windows path repr()
    return quoted.replace(b'\\\\', b'\\')
158
159 # delay import of textwrap
def _MBTextWrapper(**kwargs):
    """Create a display-width-aware TextWrapper.

    The subclass is only defined on the first call.  That call rebinds the
    module-level name ``_MBTextWrapper`` to the class itself, so later calls
    instantiate the class directly.  ``kwargs`` are passed to the
    ``textwrap.TextWrapper`` constructor.
    """
    class tw(textwrap.TextWrapper):
        """
        Extend TextWrapper for width-awareness.

        Neither number of 'bytes' in any encoding nor 'characters' is
        appropriate to calculate terminal columns for specified string.

        Original TextWrapper implementation uses built-in 'len()' directly,
        so overriding is needed to use width information of each characters.

        In addition, characters classified into 'ambiguous' width are
        treated as wide in East Asian area, but as narrow in other.

        This requires a user decision to determine width of such characters.
        """
        def _cutdown(self, ucstr, space_left):
            """Split ``ucstr`` so the first part fits in ``space_left``
            columns; returns ``(head, tail)``."""
            used = 0
            colwidth = encoding.ucolwidth
            # enumerate() instead of indexing over xrange(len(...)): same
            # traversal, without relying on the Python-2-only builtin.
            for i, char in enumerate(ucstr):
                used += colwidth(char)
                if space_left < used:
                    return (ucstr[:i], ucstr[i:])
            return ucstr, ''

        # overriding of base class
        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
            """Deal with a chunk that is too wide to fit on any line."""
            space_left = max(width - cur_len, 1)

            if self.break_long_words:
                chunk = reversed_chunks[-1]
                cut, res = self._cutdown(chunk, space_left)
                if not cut and not cur_line:
                    # The very first character is already wider than the
                    # whole line (e.g. a double-width character on a
                    # one-column line).  Without consuming anything the
                    # caller would see the same chunk forever, so place one
                    # character and let the line overflow.
                    cut, res = chunk[:1], chunk[1:]
                cur_line.append(cut)
                reversed_chunks[-1] = res
            elif not cur_line:
                # unbreakable word on an empty line: emit it as-is
                cur_line.append(reversed_chunks.pop())

        # this overriding code is imported from TextWrapper of Python 2.6
        # to calculate columns of string by 'encoding.ucolwidth()'
        def _wrap_chunks(self, chunks):
            """Assemble ``chunks`` into lines no wider than ``self.width``
            columns and return the list of lines."""
            colwidth = encoding.ucolwidth

            lines = []
            if self.width <= 0:
                raise ValueError("invalid width %r (must be > 0)" % self.width)

            # Arrange in reverse order so items can be efficiently popped
            # from a stack of chunks.
            chunks.reverse()

            while chunks:

                # Start the list of chunks that will make up the current line.
                # cur_len is just the length of all the chunks in cur_line.
                cur_line = []
                cur_len = 0

                # Figure out which static string will prefix this line.
                if lines:
                    indent = self.subsequent_indent
                else:
                    indent = self.initial_indent

                # Maximum width for this line.
                width = self.width - len(indent)

                # First chunk on line is whitespace -- drop it, unless this
                # is the very beginning of the text (i.e. no lines started
                # yet).
                if self.drop_whitespace and chunks[-1].strip() == r'' and lines:
                    del chunks[-1]

                while chunks:
                    l = colwidth(chunks[-1])

                    # Can at least squeeze this chunk onto the current line.
                    if cur_len + l <= width:
                        cur_line.append(chunks.pop())
                        cur_len += l

                    # Nope, this line is full.
                    else:
                        break

                # The current line is full, and the next chunk is too big to
                # fit on *any* line (not just this one).
                if chunks and colwidth(chunks[-1]) > width:
                    self._handle_long_word(chunks, cur_line, cur_len, width)

                # If the last chunk on this line is all whitespace, drop it.
                if (self.drop_whitespace and
                    cur_line and cur_line[-1].strip() == r''):
                    del cur_line[-1]

                # Convert current line back to a string and store it in list
                # of all lines (return value).
                if cur_line:
                    lines.append(indent + r''.join(cur_line))

            return lines

    # Replace this factory with the class so it is only built once.
    global _MBTextWrapper
    _MBTextWrapper = tw
    return tw(**kwargs)
262
def wrap(line, width, initindent='', hangindent=''):
    """Wrap ``line`` to ``width`` display columns.

    ``initindent`` prefixes the first output line and ``hangindent`` the
    following ones.  All three strings are decoded with
    ``encoding.encoding`` / ``encoding.encodingmode``, filled with
    _MBTextWrapper, and the result is encoded back with
    ``encoding.encoding``.  If ``width`` does not exceed the longer indent
    it is replaced by ``max(78, longest indent + 1)``.
    """
    widest_indent = max(len(hangindent), len(initindent))
    if width <= widest_indent:
        # adjust for weird terminal size
        width = max(78, widest_indent + 1)

    codec = pycompat.sysstr(encoding.encoding)
    errors = pycompat.sysstr(encoding.encodingmode)
    utext, uinit, uhang = [piece.decode(codec, errors)
                           for piece in (line, initindent, hangindent)]

    wrapper = _MBTextWrapper(width=width,
                             initial_indent=uinit,
                             subsequent_indent=uhang)
    # NOTE(review): the result is encoded with the default error handling,
    # not with encoding.encodingmode -- confirm that is intended.
    return wrapper.fill(utext).encode(codec)
278
# Spellings accepted by parsebool(), keyed in lower case.
_booleans = {}
_booleans.update(dict.fromkeys(('1', 'yes', 'true', 'on', 'always'), True))
_booleans.update(dict.fromkeys(('0', 'no', 'false', 'off', 'never'), False))

def parsebool(s):
    """Parse s into a boolean.

    The lookup is case-insensitive.  If s is not a valid boolean, returns
    None.
    """
    key = s.lower()
    return _booleans.get(key)