comparison mercurial/thirdparty/tomli/_parser.py @ 50758:2c34c9b61a4f

thirdparty: vendor tomli. The next commit will introduce a .toml file to abstract configitems away from Python. Python 3.11 has a read-only TOML library (`tomllib`), which gives us a way out of vendoring eventually. For now, we vendor the backport, specifically version 1.2.3, which is still compatible with Python 3.6.
author Raphaël Gomès <rgomes@octobus.net>
date Mon, 23 Jan 2023 17:11:42 +0100
parents
children
comparison
equal deleted inserted replaced
50757:b584dae08774 50758:2c34c9b61a4f
1 import string
2 from types import MappingProxyType
3 from typing import Any, BinaryIO, Dict, FrozenSet, Iterable, NamedTuple, Optional, Tuple
4 import warnings
5
6 from ._re import (
7 RE_DATETIME,
8 RE_LOCALTIME,
9 RE_NUMBER,
10 match_to_datetime,
11 match_to_localtime,
12 match_to_number,
13 )
14 from ._types import Key, ParseFloat, Pos
15
# All ASCII control characters: code points 0-31 plus DEL (127).
ASCII_CTRL = frozenset(map(chr, range(32))) | frozenset({chr(127)})

# Neither of these sets include quotation mark or backslash. They are
# currently handled as separate cases in the parser functions.
# Tab is allowed in single-line strings; tab and newline in multiline ones.
ILLEGAL_BASIC_STR_CHARS = ASCII_CTRL - {"\t"}
ILLEGAL_MULTILINE_BASIC_STR_CHARS = ASCII_CTRL - {"\t", "\n"}

# Literal strings and comments share the restrictions of basic strings.
ILLEGAL_LITERAL_STR_CHARS = ILLEGAL_BASIC_STR_CHARS
ILLEGAL_MULTILINE_LITERAL_STR_CHARS = ILLEGAL_MULTILINE_BASIC_STR_CHARS

ILLEGAL_COMMENT_CHARS = ILLEGAL_BASIC_STR_CHARS

# Whitespace skipped within a line, and the variant that also spans lines.
TOML_WS = frozenset({" ", "\t"})
TOML_WS_AND_NEWLINE = TOML_WS | {"\n"}
# Characters allowed in an unquoted key; a key may also start with a quote.
BARE_KEY_CHARS = frozenset(string.ascii_letters + string.digits + "-_")
KEY_INITIAL_CHARS = BARE_KEY_CHARS | {'"', "'"}
HEXDIGIT_CHARS = frozenset(string.hexdigits)

# Read-only mapping from a two-character escape sequence (backslash + id)
# to the character it stands for.
BASIC_STR_ESCAPE_REPLACEMENTS = MappingProxyType(
    {
        "\\b": "\b",  # backspace (U+0008)
        "\\t": "\t",  # tab (U+0009)
        "\\n": "\n",  # linefeed (U+000A)
        "\\f": "\f",  # form feed (U+000C)
        "\\r": "\r",  # carriage return (U+000D)
        '\\"': '"',  # quote (U+0022)
        "\\\\": "\\",  # backslash (U+005C)
    }
)
45
46
class TOMLDecodeError(ValueError):
    """An error raised if a document is not valid TOML.

    It derives from ``ValueError``, so handlers written for ``ValueError``
    also catch it.
    """
49
50
def load(fp: BinaryIO, *, parse_float: ParseFloat = float) -> Dict[str, Any]:
    """Parse TOML from a binary file object.

    The whole file is read, decoded with ``bytes.decode()`` defaults and
    handed to `loads`. If ``fp.read()`` returns an object without a
    ``decode`` method (a text-mode file), a `DeprecationWarning` is issued
    and the content is passed to `loads` unchanged.
    """
    content = fp.read()
    try:
        text = content.decode()
    except AttributeError:
        # No `.decode`: the file object was opened in text mode.
        message = (
            "Text file object support is deprecated in favor of binary file objects."
            ' Use `open("foo.toml", "rb")` to open the file in binary mode.'
        )
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        text = content  # type: ignore[assignment]
    return loads(text, parse_float=parse_float)
65
66
def loads(s: str, *, parse_float: ParseFloat = float) -> Dict[str, Any]:  # noqa: C901
    """Parse TOML from a string.

    Returns the parsed document as a dict. Errors are raised through
    `suffixed_err`, i.e. as `TOMLDecodeError`.
    """
    # The spec allows converting "\r\n" to "\n", even in string
    # literals. Let's do so to simplify parsing.
    src = s.replace("\r\n", "\n")
    pos = 0
    out = Output(NestedDict(), Flags())
    # Key of the table that bare key/value pairs currently belong to.
    header: Key = ()

    # Each iteration consumes one statement
    # (typically one line of the TOML source).
    while True:
        # 1. Skip line leading whitespace
        pos = skip_chars(src, pos, TOML_WS)

        # 2. Dispatch on the first significant character. It is one of:
        #      - end of file
        #      - end of line
        #      - comment
        #      - key/value pair
        #      - append dict to list (and move to its namespace)
        #      - create dict (and move to its namespace)
        #    Trailing whitespace is skipped when applicable.
        char = src[pos : pos + 1]
        if not char:
            break  # end of file
        if char == "\n":
            pos += 1
            continue
        if char in KEY_INITIAL_CHARS:
            pos = key_value_rule(src, pos, out, header, parse_float)
            pos = skip_chars(src, pos, TOML_WS)
        elif char == "[":
            # "[[" opens an array-of-tables item, a single "[" a table.
            if src.startswith("[[", pos):
                pos, header = create_list_rule(src, pos, out)
            else:
                pos, header = create_dict_rule(src, pos, out)
            pos = skip_chars(src, pos, TOML_WS)
        elif char != "#":
            raise suffixed_err(src, pos, "Invalid statement")

        # 3. Skip comment
        pos = skip_comment(src, pos)

        # 4. Expect end of line or end of file
        char = src[pos : pos + 1]
        if not char:
            break
        if char != "\n":
            raise suffixed_err(
                src, pos, "Expected newline or end of document after a statement"
            )
        pos += 1

    return out.data.dict
129
130
class Flags:
    """Flags that map to parsed keys/namespaces.

    Flags are kept in a tree mirroring the key hierarchy. Every node is a
    dict with three entries: ``"flags"`` (apply to that key only),
    ``"recursive_flags"`` (apply to that key and everything below it) and
    ``"nested"`` (child nodes keyed by key part).
    """

    # Marks an immutable namespace (inline array or inline table).
    FROZEN = 0
    # Marks a nest that has been explicitly created and can no longer
    # be opened using the "[table]" syntax.
    EXPLICIT_NEST = 1

    def __init__(self) -> None:
        self._flags: Dict[str, dict] = {}

    @staticmethod
    def _blank_node() -> dict:
        """Return a fresh node carrying no flags and no children."""
        return {"flags": set(), "recursive_flags": set(), "nested": {}}

    def unset_all(self, key: Key) -> None:
        """Drop the node for `key`, including everything nested under it.

        Does nothing when an ancestor of `key` has no node.
        """
        level = self._flags
        for part in key[:-1]:
            if part not in level:
                return
            level = level[part]["nested"]
        level.pop(key[-1], None)

    def set_for_relative_key(self, head_key: Key, rel_key: Key, flag: int) -> None:
        """Add the non-recursive `flag` to every prefix of `head_key + rel_key`
        that is longer than `head_key`.

        Nodes missing along the way are created; nodes on the `head_key`
        path itself are not flagged.
        """
        level = self._flags
        for part in head_key:
            level = level.setdefault(part, self._blank_node())["nested"]
        for part in rel_key:
            node = level.setdefault(part, self._blank_node())
            node["flags"].add(flag)
            level = node["nested"]

    def set(self, key: Key, flag: int, *, recursive: bool) -> None:  # noqa: A003
        """Add `flag` to `key`, creating missing nodes on the way.

        With ``recursive=True`` the flag also applies to all descendants.
        """
        parents, stem = key[:-1], key[-1]
        level = self._flags
        for part in parents:
            level = level.setdefault(part, self._blank_node())["nested"]
        node = level.setdefault(stem, self._blank_node())
        bucket = "recursive_flags" if recursive else "flags"
        node[bucket].add(flag)

    def is_(self, key: Key, flag: int) -> bool:
        """Return True if `flag` applies to `key`.

        That is the case when the flag is set on `key` itself, recursively
        or not, or set recursively on any of its ancestors.
        """
        if not key:
            return False  # document root has no flags
        level = self._flags
        for part in key[:-1]:
            if part not in level:
                return False
            node = level[part]
            if flag in node["recursive_flags"]:
                return True
            level = node["nested"]
        leaf = level.get(key[-1])
        if leaf is None:
            return False
        return flag in leaf["flags"] or flag in leaf["recursive_flags"]
191
192
class NestedDict:
    """The document under construction, addressed by tuple keys."""

    def __init__(self) -> None:
        # The parsed content of the TOML document
        self.dict: Dict[str, Any] = {}

    def get_or_create_nest(
        self,
        key: Key,
        *,
        access_lists: bool = True,
    ) -> dict:
        """Return the dict found at `key`, creating missing dicts on the way.

        With `access_lists` true, a list met along the path is entered
        through its last item.

        Raises:
            KeyError: if a value along the path is not a dict (after the
                optional list step).
        """
        cont: Any = self.dict
        for k in key:
            if k not in cont:
                cont[k] = {}
            cont = cont[k]
            if access_lists and isinstance(cont, list):
                cont = cont[-1]
            if not isinstance(cont, dict):
                raise KeyError("There is no nest behind this key")
        return cont

    def append_nest_to_list(self, key: Key) -> None:
        """Append an empty dict to the list at `key`, creating the list if
        the key does not exist yet.

        Raises:
            KeyError: if `key` exists but does not hold a list, or if its
                parent path cannot be resolved to a dict.
        """
        cont = self.get_or_create_nest(key[:-1])
        last_key = key[-1]
        if last_key in cont:
            list_ = cont[last_key]
            # Check the type explicitly rather than relying on a missing
            # `.append`: other objects may offer `append` too and must not
            # be mutated as if they were an array of tables.
            if not isinstance(list_, list):
                raise KeyError("An object other than list found behind this key")
            list_.append({})
        else:
            cont[last_key] = [{}]
226
227
class Output(NamedTuple):
    """Pair of objects the statement rules write into while parsing."""

    # The document content built so far.
    data: NestedDict
    # Flags recorded for the keys/namespaces seen so far.
    flags: Flags
231
232
def skip_chars(src: str, pos: Pos, chars: Iterable[str]) -> Pos:
    """Return the first index at or after `pos` whose character is not in
    `chars`, or the index just past the end when only such characters
    remain.
    """
    try:
        current = src[pos]
        while current in chars:
            pos += 1
            current = src[pos]
    except IndexError:
        # Ran off the end of the source: nothing left to skip.
        pass
    return pos
240
241
def skip_until(
    src: str,
    pos: Pos,
    expect: str,
    *,
    error_on: FrozenSet[str],
    error_on_eof: bool,
) -> Pos:
    """Return the index of the next occurrence of `expect` at or after `pos`.

    When `expect` is not found, the end of `src` is returned instead,
    unless `error_on_eof` is true, in which case an error is raised. An
    error is also raised, positioned at the first offender, when any
    character of `error_on` occurs in the skipped span.
    """
    new_pos = src.find(expect, pos)
    if new_pos == -1:
        new_pos = len(src)
        if error_on_eof:
            raise suffixed_err(src, new_pos, f"Expected {expect!r}") from None

    skipped = src[pos:new_pos]
    if not error_on.isdisjoint(skipped):
        # Locate the first illegal character so the error points at it.
        offset = next(i for i, char in enumerate(skipped) if char in error_on)
        bad_pos = pos + offset
        raise suffixed_err(src, bad_pos, f"Found invalid character {src[bad_pos]!r}")
    return new_pos
262
263
def skip_comment(src: str, pos: Pos) -> Pos:
    """Skip a comment starting at `pos`, if there is one.

    Returns `pos` unchanged when the character there is not "#".
    Otherwise returns the index of the newline ending the comment, or the
    end of `src` when the comment runs to the end of the document.
    """
    if not src.startswith("#", pos):
        return pos
    return skip_until(
        src, pos + 1, "\n", error_on=ILLEGAL_COMMENT_CHARS, error_on_eof=False
    )
274
275
def skip_comments_and_array_ws(src: str, pos: Pos) -> Pos:
    """Skip any mix of whitespace, newlines and comments from `pos` on.

    Repeats "skip whitespace, then skip one comment" until a round makes
    no progress, and returns the resulting position.
    """
    previous = None
    while pos != previous:
        previous = pos
        pos = skip_chars(src, pos, TOML_WS_AND_NEWLINE)
        pos = skip_comment(src, pos)
    return pos
283
284
def create_dict_rule(src: str, pos: Pos, out: Output) -> Tuple[Pos, Key]:
    """Handle a "[table]" header whose "[" is at `pos`.

    Marks the key as explicitly created, makes sure the table exists in
    `out.data` and returns the position after "]" together with the key.
    Raises (via `suffixed_err`) if the table was already declared or is
    frozen, if the key is occupied by a non-table value, or if the closing
    "]" is missing.
    """
    # Step over "[" and any whitespace before the key.
    pos = skip_chars(src, pos + 1, TOML_WS)
    pos, key = parse_key(src, pos)

    flags = out.flags
    if any(flags.is_(key, flag) for flag in (Flags.EXPLICIT_NEST, Flags.FROZEN)):
        raise suffixed_err(src, pos, f"Can not declare {key} twice")
    flags.set(key, Flags.EXPLICIT_NEST, recursive=False)
    try:
        out.data.get_or_create_nest(key)
    except KeyError:
        raise suffixed_err(src, pos, "Can not overwrite a value") from None

    if src[pos : pos + 1] != "]":
        raise suffixed_err(src, pos, 'Expected "]" at the end of a table declaration')
    return pos + 1, key
301
302
def create_list_rule(src: str, pos: Pos, out: Output) -> Tuple[Pos, Key]:
    """Handle a "[[array]]" header whose "[[" starts at `pos`.

    Appends a new empty table to the list at the parsed key and returns
    the position after "]]" together with the key. Raises (via
    `suffixed_err`) if the namespace is frozen, if the key holds something
    other than a list, or if the closing "]]" is missing.
    """
    # Step over "[[" and any whitespace before the key.
    pos = skip_chars(src, pos + 2, TOML_WS)
    pos, key = parse_key(src, pos)

    flags = out.flags
    if flags.is_(key, Flags.FROZEN):
        raise suffixed_err(src, pos, f"Can not mutate immutable namespace {key}")
    # Free the namespace now that it points to another empty list item...
    flags.unset_all(key)
    # ...but this key precisely is still prohibited from table declaration
    flags.set(key, Flags.EXPLICIT_NEST, recursive=False)
    try:
        out.data.append_nest_to_list(key)
    except KeyError:
        raise suffixed_err(src, pos, "Can not overwrite a value") from None

    if src[pos : pos + 2] != "]]":
        raise suffixed_err(src, pos, 'Expected "]]" at the end of an array declaration')
    return pos + 2, key
322
323
def key_value_rule(
    src: str, pos: Pos, out: Output, header: Key, parse_float: ParseFloat
) -> Pos:
    """Parse a "key = value" statement at `pos` and store it under `header`.

    Returns the position after the value. Raises (via `suffixed_err`) if
    the target namespace is frozen or if the assignment would overwrite
    an existing value.
    """
    pos, key, value = parse_key_value_pair(src, pos, parse_float)
    key_stem = key[-1]
    abs_key = header + key
    abs_key_parent = abs_key[:-1]
    flags = out.flags

    if flags.is_(abs_key_parent, Flags.FROZEN):
        raise suffixed_err(
            src, pos, f"Can not mutate immutable namespace {abs_key_parent}"
        )
    # Containers in the relative path can't be opened with the table syntax after this
    flags.set_for_relative_key(header, key, Flags.EXPLICIT_NEST)
    try:
        nest = out.data.get_or_create_nest(abs_key_parent)
    except KeyError:
        raise suffixed_err(src, pos, "Can not overwrite a value") from None
    if key_stem in nest:
        raise suffixed_err(src, pos, "Can not overwrite a value")
    # Mark inline table and array namespaces recursively immutable
    if isinstance(value, (dict, list)):
        flags.set(abs_key, Flags.FROZEN, recursive=True)
    nest[key_stem] = value
    return pos
348
349
def parse_key_value_pair(
    src: str, pos: Pos, parse_float: ParseFloat
) -> Tuple[Pos, Key, Any]:
    """Parse "key = value" starting at `pos`.

    Returns the position after the value, the key as a tuple of parts and
    the parsed value. Raises (via `suffixed_err`) when the key is not
    followed by "=".
    """
    pos, key = parse_key(src, pos)
    if not src.startswith("=", pos):
        raise suffixed_err(src, pos, 'Expected "=" after a key in a key/value pair')
    # Step over "=" and the whitespace in front of the value.
    pos = skip_chars(src, pos + 1, TOML_WS)
    pos, value = parse_value(src, pos, parse_float)
    return pos, key, value
364
365
def parse_key(src: str, pos: Pos) -> Tuple[Pos, Key]:
    """Parse a possibly dotted key starting at `pos`.

    Returns the position after the key, with trailing whitespace already
    skipped, and the key as a tuple with one string per dot-separated part.
    """
    pos, first_part = parse_key_part(src, pos)
    parts = [first_part]
    pos = skip_chars(src, pos, TOML_WS)
    # Every "." announces another part; whitespace around it is allowed.
    while src.startswith(".", pos):
        pos = skip_chars(src, pos + 1, TOML_WS)
        pos, part = parse_key_part(src, pos)
        parts.append(part)
        pos = skip_chars(src, pos, TOML_WS)
    return pos, tuple(parts)
382
383
def parse_key_part(src: str, pos: Pos) -> Tuple[Pos, str]:
    """Parse one part of a key: a bare key, a literal string or a
    single-line basic string.

    Returns the position after the part and its text. Raises (via
    `suffixed_err`) when `pos` is at the end of `src` or the character
    there cannot start a key part.
    """
    char = src[pos : pos + 1]  # "" at end of document
    if char == "'":
        return parse_literal_str(src, pos)
    if char == '"':
        return parse_one_line_basic_str(src, pos)
    if char in BARE_KEY_CHARS:
        end = skip_chars(src, pos, BARE_KEY_CHARS)
        return end, src[pos:end]
    raise suffixed_err(src, pos, "Invalid initial character for a key part")
398
399
def parse_one_line_basic_str(src: str, pos: Pos) -> Tuple[Pos, str]:
    """Parse a single-line basic string whose opening quote is at `pos`."""
    # Start parsing right after the opening quote.
    return parse_basic_str(src, pos + 1, multiline=False)
403
404
def parse_array(src: str, pos: Pos, parse_float: ParseFloat) -> Tuple[Pos, list]:
    """Parse an array whose opening "[" is at `pos`.

    Whitespace, newlines and comments are allowed between items, and a
    trailing comma before "]" is accepted. Returns the position after the
    closing "]" and the list of values. Raises (via `suffixed_err`) when
    an item is followed by neither "," nor "]".
    """
    items: list = []
    pos = skip_comments_and_array_ws(src, pos + 1)
    # The loop condition covers both the empty array and a trailing comma.
    while not src.startswith("]", pos):
        pos, item = parse_value(src, pos, parse_float)
        items.append(item)
        pos = skip_comments_and_array_ws(src, pos)

        separator = src[pos : pos + 1]
        if separator == "]":
            break
        if separator != ",":
            raise suffixed_err(src, pos, "Unclosed array")
        pos = skip_comments_and_array_ws(src, pos + 1)
    return pos + 1, items
427
428
def parse_inline_table(src: str, pos: Pos, parse_float: ParseFloat) -> Tuple[Pos, dict]:
    """Parse an inline table whose opening "{" is at `pos`.

    Returns the position after the closing "}" and the resulting dict.
    Raises (via `suffixed_err`) on duplicate keys, on attempts to extend a
    nested inline table or array, on overwriting a value, and when a pair
    is followed by neither "," nor "}".
    """
    table = NestedDict()
    # Tracks nested inline tables/arrays, which later pairs must not extend.
    table_flags = Flags()

    pos = skip_chars(src, pos + 1, TOML_WS)
    if src.startswith("}", pos):
        return pos + 1, table.dict
    while True:
        pos, key, value = parse_key_value_pair(src, pos, parse_float)
        key_parent, key_stem = key[:-1], key[-1]
        if table_flags.is_(key, Flags.FROZEN):
            raise suffixed_err(src, pos, f"Can not mutate immutable namespace {key}")
        try:
            nest = table.get_or_create_nest(key_parent, access_lists=False)
        except KeyError:
            raise suffixed_err(src, pos, "Can not overwrite a value") from None
        if key_stem in nest:
            raise suffixed_err(src, pos, f"Duplicate inline table key {key_stem!r}")
        nest[key_stem] = value

        pos = skip_chars(src, pos, TOML_WS)
        terminator = src[pos : pos + 1]
        if terminator == "}":
            return pos + 1, table.dict
        if terminator != ",":
            raise suffixed_err(src, pos, "Unclosed inline table")
        if isinstance(value, (dict, list)):
            table_flags.set(key, Flags.FROZEN, recursive=True)
        # Step over "," and the whitespace before the next pair.
        pos = skip_chars(src, pos + 1, TOML_WS)
459
460
def parse_basic_str_escape(  # noqa: C901
    src: str, pos: Pos, *, multiline: bool = False
) -> Tuple[Pos, str]:
    """Parse the escape sequence whose backslash is at `pos`.

    Returns the position after the sequence and the text it stands for.
    In multiline mode a backslash followed by a space, tab or newline is a
    line continuation: it yields "" and consumes the rest of the line plus
    all following whitespace and newlines. Raises (via `suffixed_err`) for
    unknown escapes, for a backslash at the very end of `src`, and for a
    continuation followed by non-whitespace on the same line.
    """
    after = pos + 2
    escape_id = src[pos:after]
    if multiline and escape_id in ("\\ ", "\\\t", "\\\n"):
        # Skip whitespace until next non-whitespace character or end of
        # the doc. Error if non-whitespace is found before newline.
        if escape_id != "\\\n":
            after = skip_chars(src, after, TOML_WS)
            if after >= len(src):
                return after, ""
            if src[after] != "\n":
                raise suffixed_err(src, after, 'Unescaped "\\" in a string')
            after += 1
        return skip_chars(src, after, TOML_WS_AND_NEWLINE), ""

    # "\uXXXX" and "\UXXXXXXXX" take 4 and 8 hex digits respectively.
    hex_len = {"\\u": 4, "\\U": 8}.get(escape_id)
    if hex_len is not None:
        return parse_hex_char(src, after, hex_len)

    replacement = BASIC_STR_ESCAPE_REPLACEMENTS.get(escape_id)
    if replacement is not None:
        return after, replacement
    if len(escape_id) != 2:
        # The backslash was the last character of the document.
        raise suffixed_err(src, after, "Unterminated string") from None
    raise suffixed_err(src, after, 'Unescaped "\\" in a string') from None
490
491
def parse_basic_str_escape_multiline(src: str, pos: Pos) -> Tuple[Pos, str]:
    """Parse an escape sequence at `pos` using multiline string rules."""
    result = parse_basic_str_escape(src, pos, multiline=True)
    return result
494
495
def parse_hex_char(src: str, pos: Pos, hex_len: int) -> Tuple[Pos, str]:
    """Parse `hex_len` hexadecimal digits at `pos` into one character.

    Returns the position after the digits and the character. Raises (via
    `suffixed_err`) when fewer than `hex_len` characters remain, when one
    of them is not a hex digit, or when the code point is not a Unicode
    scalar value.
    """
    end = pos + hex_len
    digits = src[pos:end]
    if len(digits) != hex_len or not all(d in HEXDIGIT_CHARS for d in digits):
        raise suffixed_err(src, pos, "Invalid hex value")
    codepoint = int(digits, 16)
    if not is_unicode_scalar_value(codepoint):
        raise suffixed_err(src, end, "Escaped character is not a Unicode scalar value")
    return end, chr(codepoint)
505
506
def parse_literal_str(src: str, pos: Pos) -> Tuple[Pos, str]:
    """Parse a single-line literal string whose opening apostrophe is at
    `pos`.

    Returns the position after the closing apostrophe and the raw text
    between the apostrophes.
    """
    content_start = pos + 1  # Skip starting apostrophe
    content_end = skip_until(
        src, content_start, "'", error_on=ILLEGAL_LITERAL_STR_CHARS, error_on_eof=True
    )
    # Skip ending apostrophe
    return content_end + 1, src[content_start:content_end]
514
515
def parse_multiline_str(src: str, pos: Pos, *, literal: bool) -> Tuple[Pos, str]:
    """Parse a multiline string whose three opening delimiters start at `pos`.

    `literal` selects between '''...''' (no escape processing) and
    \"\"\"...\"\"\" (basic string). A newline directly after the opening
    delimiter is dropped. Returns the position after the string and its
    content.
    """
    pos += 3
    if src.startswith("\n", pos):
        pos += 1

    if literal:
        delim = "'"
        end_pos = skip_until(
            src,
            pos,
            "'''",
            error_on=ILLEGAL_MULTILINE_LITERAL_STR_CHARS,
            error_on_eof=True,
        )
        result = src[pos:end_pos]
        pos = end_pos + 3
    else:
        delim = '"'
        pos, result = parse_basic_str(src, pos, multiline=True)

    # Add at maximum two extra apostrophes/quotes if the end sequence
    # is 4 or 5 chars long instead of just 3.
    extra = 0
    while extra < 2 and src.startswith(delim, pos + extra):
        extra += 1
    return pos + extra, result + delim * extra
545
546
def parse_basic_str(src: str, pos: Pos, *, multiline: bool) -> Tuple[Pos, str]:
    """Parse the body of a basic string starting at `pos`, i.e. just
    after the opening delimiter.

    Returns the position after the closing delimiter (one quote, or three
    when `multiline`) and the content with escapes resolved. Raises (via
    `suffixed_err`) when the document ends before the closing delimiter or
    an illegal character is met.
    """
    if multiline:
        error_on = ILLEGAL_MULTILINE_BASIC_STR_CHARS
        parse_escapes = parse_basic_str_escape_multiline
        closing_len = 3
    else:
        error_on = ILLEGAL_BASIC_STR_CHARS
        parse_escapes = parse_basic_str_escape
        closing_len = 1

    src_len = len(src)
    pieces: list = []
    # Start of the run of plain characters not yet copied into `pieces`.
    run_start = pos
    while True:
        if pos >= src_len:
            raise suffixed_err(src, pos, "Unterminated string") from None
        char = src[pos]
        if char == '"':
            if not multiline or src.startswith('"""', pos):
                pieces.append(src[run_start:pos])
                return pos + closing_len, "".join(pieces)
            # A lone quote inside a multiline string is ordinary content.
            pos += 1
        elif char == "\\":
            pieces.append(src[run_start:pos])
            pos, parsed_escape = parse_escapes(src, pos)
            pieces.append(parsed_escape)
            run_start = pos
        elif char in error_on:
            raise suffixed_err(src, pos, f"Illegal character {char!r}")
        else:
            pos += 1
577
578
def parse_value(  # noqa: C901
    src: str, pos: Pos, parse_float: ParseFloat
) -> Tuple[Pos, Any]:
    """Parse the value starting at `pos`.

    Returns the position after the value and the parsed object. Raises
    (via `suffixed_err`) when nothing at `pos` is a valid value or when a
    date/datetime matches the pattern but cannot be constructed.
    """
    char = src[pos : pos + 1]  # "" at end of document

    # Basic strings
    if char == '"':
        if src.startswith('"""', pos):
            return parse_multiline_str(src, pos, literal=False)
        return parse_one_line_basic_str(src, pos)

    # Literal strings
    if char == "'":
        if src.startswith("'''", pos):
            return parse_multiline_str(src, pos, literal=True)
        return parse_literal_str(src, pos)

    # Booleans
    if src.startswith("true", pos):
        return pos + 4, True
    if src.startswith("false", pos):
        return pos + 5, False

    # Dates and times
    datetime_match = RE_DATETIME.match(src, pos)
    if datetime_match:
        try:
            datetime_obj = match_to_datetime(datetime_match)
        except ValueError as e:
            raise suffixed_err(src, pos, "Invalid date or datetime") from e
        return datetime_match.end(), datetime_obj
    localtime_match = RE_LOCALTIME.match(src, pos)
    if localtime_match:
        return localtime_match.end(), match_to_localtime(localtime_match)

    # Integers and "normal" floats.
    # The regex will greedily match any type starting with a decimal
    # char, so needs to be located after handling of dates and times.
    number_match = RE_NUMBER.match(src, pos)
    if number_match:
        return number_match.end(), match_to_number(number_match, parse_float)

    # Arrays
    if char == "[":
        return parse_array(src, pos, parse_float)

    # Inline tables
    if char == "{":
        return parse_inline_table(src, pos, parse_float)

    # Special floats, without and then with an explicit sign
    if src.startswith(("inf", "nan"), pos):
        return pos + 3, parse_float(src[pos : pos + 3])
    if src.startswith(("-inf", "+inf", "-nan", "+nan"), pos):
        return pos + 4, parse_float(src[pos : pos + 4])

    raise suffixed_err(src, pos, "Invalid value")
643
644
def suffixed_err(src: str, pos: Pos, msg: str) -> TOMLDecodeError:
    """Return a `TOMLDecodeError` where error message is suffixed with
    coordinates in source.

    The suffix is "end of document" when `pos` is at or past the end of
    `src`, otherwise a 1-based "line L, column C". The exception is
    returned, not raised.
    """
    if pos >= len(src):
        location = "end of document"
    else:
        line = src.count("\n", 0, pos) + 1
        # `rfind` yields -1 when there is no earlier newline, which makes
        # the column `pos + 1` on the first line.
        column = pos - src.rfind("\n", 0, pos)
        location = f"line {line}, column {column}"
    return TOMLDecodeError(f"{msg} (at {location})")
660
661
def is_unicode_scalar_value(codepoint: int) -> bool:
    """Return True if `codepoint` lies in 0..0xD7FF or 0xE000..0x10FFFF.

    The excluded gap 0xD800..0xDFFF is the surrogate range.
    """
    below_surrogates = 0 <= codepoint <= 0xD7FF
    above_surrogates = 0xE000 <= codepoint <= 0x10FFFF
    return below_surrogates or above_surrogates