diff -r fcc6bd11444b -r aeb551a3bb8a mercurial/utils/cborutil.py --- a/mercurial/utils/cborutil.py Tue Aug 28 15:22:06 2018 -0700 +++ b/mercurial/utils/cborutil.py Tue Aug 28 15:02:48 2018 -0700 @@ -8,6 +8,7 @@ from __future__ import absolute_import import struct +import sys from ..thirdparty.cbor.cbor2 import ( decoder as decodermod, @@ -35,11 +36,16 @@ SUBTYPE_MASK = 0b00011111 +SUBTYPE_FALSE = 20 +SUBTYPE_TRUE = 21 +SUBTYPE_NULL = 22 SUBTYPE_HALF_FLOAT = 25 SUBTYPE_SINGLE_FLOAT = 26 SUBTYPE_DOUBLE_FLOAT = 27 SUBTYPE_INDEFINITE = 31 +SEMANTIC_TAG_FINITE_SET = 258 + # Indefinite types begin with their major type ORd with information value 31. BEGIN_INDEFINITE_BYTESTRING = struct.pack( r'>B', MAJOR_TYPE_BYTESTRING << 5 | SUBTYPE_INDEFINITE) @@ -146,7 +152,7 @@ def streamencodeset(s): # https://www.iana.org/assignments/cbor-tags/cbor-tags.xhtml defines # semantic tag 258 for finite sets. - yield encodelength(MAJOR_TYPE_SEMANTIC, 258) + yield encodelength(MAJOR_TYPE_SEMANTIC, SEMANTIC_TAG_FINITE_SET) for chunk in streamencodearray(sorted(s, key=_mixedtypesortkey)): yield chunk @@ -260,3 +266,710 @@ len(chunk), length)) yield chunk + +class CBORDecodeError(Exception): + """Represents an error decoding CBOR.""" + +if sys.version_info.major >= 3: + def _elementtointeger(b, i): + return b[i] +else: + def _elementtointeger(b, i): + return ord(b[i]) + +STRUCT_BIG_UBYTE = struct.Struct(r'>B') +STRUCT_BIG_USHORT = struct.Struct('>H') +STRUCT_BIG_ULONG = struct.Struct('>L') +STRUCT_BIG_ULONGLONG = struct.Struct('>Q') + +SPECIAL_NONE = 0 +SPECIAL_START_INDEFINITE_BYTESTRING = 1 +SPECIAL_START_ARRAY = 2 +SPECIAL_START_MAP = 3 +SPECIAL_START_SET = 4 +SPECIAL_INDEFINITE_BREAK = 5 + +def decodeitem(b, offset=0): + """Decode a new CBOR value from a buffer at offset. + + This function attempts to decode up to one complete CBOR value + from ``b`` starting at offset ``offset``. + + The beginning of a collection (such as an array, map, set, or + indefinite length bytestring) counts as a single value. For these + special cases, a state flag will indicate that a special value was seen. + + When called, the function either returns a decoded value or gives + a hint as to how many more bytes are needed to do so. By calling + the function repeatedly given a stream of bytes, the caller can + build up the original values. + + Returns a tuple with the following elements: + + * Bool indicating whether a complete value was decoded. + * A decoded value if first value is True otherwise None + * Integer number of bytes. If positive, the number of bytes + read. If negative, the number of bytes we need to read to + decode this value or the next chunk in this value. + * One of the ``SPECIAL_*`` constants indicating special treatment + for this value. ``SPECIAL_NONE`` means this is a fully decoded + simple value (such as an integer or bool). + """ + + initial = _elementtointeger(b, offset) + offset += 1 + + majortype = initial >> 5 + subtype = initial & SUBTYPE_MASK + + if majortype == MAJOR_TYPE_UINT: + complete, value, readcount = decodeuint(subtype, b, offset) + + if complete: + return True, value, readcount + 1, SPECIAL_NONE + else: + return False, None, readcount, SPECIAL_NONE + + elif majortype == MAJOR_TYPE_NEGINT: + # Negative integers are the same as UINT except inverted minus 1. + complete, value, readcount = decodeuint(subtype, b, offset) + + if complete: + return True, -value - 1, readcount + 1, SPECIAL_NONE + else: + return False, None, readcount, SPECIAL_NONE + + elif majortype == MAJOR_TYPE_BYTESTRING: + # Beginning of bytestrings are treated as uints in order to + # decode their length, which may be indefinite. + complete, size, readcount = decodeuint(subtype, b, offset, + allowindefinite=True) + + # We don't know the size of the bytestring. It must be a definitive + # length since the indefinite subtype would be encoded in the initial + # byte. + if not complete: + return False, None, readcount, SPECIAL_NONE + + # We know the length of the bytestring. + if size is not None: + # And the data is available in the buffer. + if offset + readcount + size <= len(b): + value = b[offset + readcount:offset + readcount + size] + return True, value, readcount + size + 1, SPECIAL_NONE + + # And we need more data in order to return the bytestring. + else: + wanted = len(b) - offset - readcount - size + return False, None, wanted, SPECIAL_NONE + + # It is an indefinite length bytestring. + else: + return True, None, 1, SPECIAL_START_INDEFINITE_BYTESTRING + + elif majortype == MAJOR_TYPE_STRING: + raise CBORDecodeError('string major type not supported') + + elif majortype == MAJOR_TYPE_ARRAY: + # Beginning of arrays are treated as uints in order to decode their + # length. We don't allow indefinite length arrays. + complete, size, readcount = decodeuint(subtype, b, offset) + + if complete: + return True, size, readcount + 1, SPECIAL_START_ARRAY + else: + return False, None, readcount, SPECIAL_NONE + + elif majortype == MAJOR_TYPE_MAP: + # Beginning of maps are treated as uints in order to decode their + # number of elements. We don't allow indefinite length arrays. + complete, size, readcount = decodeuint(subtype, b, offset) + + if complete: + return True, size, readcount + 1, SPECIAL_START_MAP + else: + return False, None, readcount, SPECIAL_NONE + + elif majortype == MAJOR_TYPE_SEMANTIC: + # Semantic tag value is read the same as a uint. + complete, tagvalue, readcount = decodeuint(subtype, b, offset) + + if not complete: + return False, None, readcount, SPECIAL_NONE + + # This behavior here is a little wonky. The main type being "decorated" + # by this semantic tag follows. A more robust parser would probably emit + # a special flag indicating this as a semantic tag and let the caller + # deal with the types that follow. But since we don't support many + # semantic tags, it is easier to deal with the special cases here and + # hide complexity from the caller. If we add support for more semantic + # tags, we should probably move semantic tag handling into the caller. + if tagvalue == SEMANTIC_TAG_FINITE_SET: + if offset + readcount >= len(b): + return False, None, -1, SPECIAL_NONE + + complete, size, readcount2, special = decodeitem(b, + offset + readcount) + + if not complete: + return False, None, readcount2, SPECIAL_NONE + + if special != SPECIAL_START_ARRAY: + raise CBORDecodeError('expected array after finite set ' + 'semantic tag') + + return True, size, readcount + readcount2 + 1, SPECIAL_START_SET + + else: + raise CBORDecodeError('semantic tag %d not allowed' % tagvalue) + + elif majortype == MAJOR_TYPE_SPECIAL: + # Only specific values for the information field are allowed. + if subtype == SUBTYPE_FALSE: + return True, False, 1, SPECIAL_NONE + elif subtype == SUBTYPE_TRUE: + return True, True, 1, SPECIAL_NONE + elif subtype == SUBTYPE_NULL: + return True, None, 1, SPECIAL_NONE + elif subtype == SUBTYPE_INDEFINITE: + return True, None, 1, SPECIAL_INDEFINITE_BREAK + # If value is 24, subtype is in next byte. + else: + raise CBORDecodeError('special type %d not allowed' % subtype) + else: + assert False + +def decodeuint(subtype, b, offset=0, allowindefinite=False): + """Decode an unsigned integer. + + ``subtype`` is the lower 5 bits from the initial byte CBOR item + "header." ``b`` is a buffer containing bytes. ``offset`` points to + the index of the first byte after the byte that ``subtype`` was + derived from. + + ``allowindefinite`` allows the special indefinite length value + indicator. + + Returns a 3-tuple of (successful, value, count). + + The first element is a bool indicating if decoding completed. The 2nd + is the decoded integer value or None if not fully decoded or the subtype + is 31 and ``allowindefinite`` is True. The 3rd value is the count of bytes. + If positive, it is the number of additional bytes decoded. If negative, + it is the number of additional bytes needed to decode this value. + """ + + # Small values are inline. + if subtype < 24: + return True, subtype, 0 + # Indefinite length specifier. + elif subtype == 31: + if allowindefinite: + return True, None, 0 + else: + raise CBORDecodeError('indefinite length uint not allowed here') + elif subtype >= 28: + raise CBORDecodeError('unsupported subtype on integer type: %d' % + subtype) + + if subtype == 24: + s = STRUCT_BIG_UBYTE + elif subtype == 25: + s = STRUCT_BIG_USHORT + elif subtype == 26: + s = STRUCT_BIG_ULONG + elif subtype == 27: + s = STRUCT_BIG_ULONGLONG + else: + raise CBORDecodeError('bounds condition checking violation') + + if len(b) - offset >= s.size: + return True, s.unpack_from(b, offset)[0], s.size + else: + return False, None, len(b) - offset - s.size + +class bytestringchunk(bytes): + """Represents a chunk/segment in an indefinite length bytestring. + + This behaves like a ``bytes`` but in addition has the ``isfirst`` + and ``islast`` attributes indicating whether this chunk is the first + or last in an indefinite length bytestring. + """ + + def __new__(cls, v, first=False, last=False): + self = bytes.__new__(cls, v) + self.isfirst = first + self.islast = last + + return self + +class sansiodecoder(object): + """A CBOR decoder that doesn't perform its own I/O. + + To use, construct an instance and feed it segments containing + CBOR-encoded bytes via ``decode()``. The return value from ``decode()`` + indicates whether a fully-decoded value is available, how many bytes + were consumed, and offers a hint as to how many bytes should be fed + in next time to decode the next value. + + The decoder assumes it will decode N discrete CBOR values, not just + a single value. i.e. if the bytestream contains uints packed one after + the other, the decoder will decode them all, rather than just the initial + one. + + When ``decode()`` indicates a value is available, call ``getavailable()`` + to return all fully decoded values. + + ``decode()`` can partially decode input. It is up to the caller to keep + track of what data was consumed and to pass unconsumed data in on the + next invocation. + + The decoder decodes atomically at the *item* level. See ``decodeitem()``. + If an *item* cannot be fully decoded, the decoder won't record it as + partially consumed. Instead, the caller will be instructed to pass in + the initial bytes of this item on the next invocation. This does result + in some redundant parsing. But the overhead should be minimal. + + This decoder only supports a subset of CBOR as required by Mercurial. + It lacks support for: + + * Indefinite length arrays + * Indefinite length maps + * Use of indefinite length bytestrings as keys or values within + arrays, maps, or sets. + * Nested arrays, maps, or sets within sets + * Any semantic tag that isn't a mathematical finite set + * Floating point numbers + * Undefined special value + + CBOR types are decoded to Python types as follows: + + uint -> int + negint -> int + bytestring -> bytes + map -> dict + array -> list + True -> bool + False -> bool + null -> None + indefinite length bytestring chunk -> [bytestringchunk] + + The only non-obvious mapping here is an indefinite length bytestring + to the ``bytestringchunk`` type. This is to facilitate streaming + indefinite length bytestrings out of the decoder and to differentiate + a regular bytestring from an indefinite length bytestring. + """ + + _STATE_NONE = 0 + _STATE_WANT_MAP_KEY = 1 + _STATE_WANT_MAP_VALUE = 2 + _STATE_WANT_ARRAY_VALUE = 3 + _STATE_WANT_SET_VALUE = 4 + _STATE_WANT_BYTESTRING_CHUNK_FIRST = 5 + _STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT = 6 + + def __init__(self): + # TODO add support for limiting size of bytestrings + # TODO add support for limiting number of keys / values in collections + # TODO add support for limiting size of buffered partial values + + self.decodedbytecount = 0 + + self._state = self._STATE_NONE + + # Stack of active nested collections. Each entry is a dict describing + # the collection. + self._collectionstack = [] + + # Fully decoded key to use for the current map. + self._currentmapkey = None + + # Fully decoded values available for retrieval. + self._decodedvalues = [] + + @property + def inprogress(self): + """Whether the decoder has partially decoded a value.""" + return self._state != self._STATE_NONE + + def decode(self, b, offset=0): + """Attempt to decode bytes from an input buffer. + + ``b`` is a collection of bytes and ``offset`` is the byte + offset within that buffer from which to begin reading data. + + ``b`` must support ``len()`` and accessing bytes slices via + ``__slice__``. Typically ``bytes`` instances are used. + + Returns a tuple with the following fields: + + * Bool indicating whether values are available for retrieval. + * Integer indicating the number of bytes that were fully consumed, + starting from ``offset``. + * Integer indicating the number of bytes that are desired for the + next call in order to decode an item. + """ + if not b: + return bool(self._decodedvalues), 0, 0 + + initialoffset = offset + + # We could easily split the body of this loop into a function. But + # Python performance is sensitive to function calls and collections + # are composed of many items. So leaving as a while loop could help + # with performance. One thing that may not help is the use of + # if..elif versus a lookup/dispatch table. There may be value + # in switching that. + while offset < len(b): + # Attempt to decode an item. This could be a whole value or a + # special value indicating an event, such as start or end of a + # collection or indefinite length type. + complete, value, readcount, special = decodeitem(b, offset) + + if readcount > 0: + self.decodedbytecount += readcount + + if not complete: + assert readcount < 0 + return ( + bool(self._decodedvalues), + offset - initialoffset, + -readcount, + ) + + offset += readcount + + # No nested state. We either have a full value or beginning of a + # complex value to deal with. + if self._state == self._STATE_NONE: + # A normal value. + if special == SPECIAL_NONE: + self._decodedvalues.append(value) + + elif special == SPECIAL_START_ARRAY: + self._collectionstack.append({ + 'remaining': value, + 'v': [], + }) + self._state = self._STATE_WANT_ARRAY_VALUE + + elif special == SPECIAL_START_MAP: + self._collectionstack.append({ + 'remaining': value, + 'v': {}, + }) + self._state = self._STATE_WANT_MAP_KEY + + elif special == SPECIAL_START_SET: + self._collectionstack.append({ + 'remaining': value, + 'v': set(), + }) + self._state = self._STATE_WANT_SET_VALUE + + elif special == SPECIAL_START_INDEFINITE_BYTESTRING: + self._state = self._STATE_WANT_BYTESTRING_CHUNK_FIRST + + else: + raise CBORDecodeError('unhandled special state: %d' % + special) + + # This value becomes an element of the current array. + elif self._state == self._STATE_WANT_ARRAY_VALUE: + # Simple values get appended. + if special == SPECIAL_NONE: + c = self._collectionstack[-1] + c['v'].append(value) + c['remaining'] -= 1 + + # self._state doesn't need changed. + + # An array nested within an array. + elif special == SPECIAL_START_ARRAY: + lastc = self._collectionstack[-1] + newvalue = [] + + lastc['v'].append(newvalue) + lastc['remaining'] -= 1 + + self._collectionstack.append({ + 'remaining': value, + 'v': newvalue, + }) + + # self._state doesn't need changed. + + # A map nested within an array. + elif special == SPECIAL_START_MAP: + lastc = self._collectionstack[-1] + newvalue = {} + + lastc['v'].append(newvalue) + lastc['remaining'] -= 1 + + self._collectionstack.append({ + 'remaining': value, + 'v': newvalue + }) + + self._state = self._STATE_WANT_MAP_KEY + + elif special == SPECIAL_START_SET: + lastc = self._collectionstack[-1] + newvalue = set() + + lastc['v'].append(newvalue) + lastc['remaining'] -= 1 + + self._collectionstack.append({ + 'remaining': value, + 'v': newvalue, + }) + + self._state = self._STATE_WANT_SET_VALUE + + elif special == SPECIAL_START_INDEFINITE_BYTESTRING: + raise CBORDecodeError('indefinite length bytestrings ' + 'not allowed as array values') + + else: + raise CBORDecodeError('unhandled special item when ' + 'expecting array value: %d' % special) + + # This value becomes the key of the current map instance. + elif self._state == self._STATE_WANT_MAP_KEY: + if special == SPECIAL_NONE: + self._currentmapkey = value + self._state = self._STATE_WANT_MAP_VALUE + + elif special == SPECIAL_START_INDEFINITE_BYTESTRING: + raise CBORDecodeError('indefinite length bytestrings ' + 'not allowed as map keys') + + elif special in (SPECIAL_START_ARRAY, SPECIAL_START_MAP, + SPECIAL_START_SET): + raise CBORDecodeError('collections not supported as map ' + 'keys') + + # We do not allow special values to be used as map keys. + else: + raise CBORDecodeError('unhandled special item when ' + 'expecting map key: %d' % special) + + # This value becomes the value of the current map key. + elif self._state == self._STATE_WANT_MAP_VALUE: + # Simple values simply get inserted into the map. + if special == SPECIAL_NONE: + lastc = self._collectionstack[-1] + lastc['v'][self._currentmapkey] = value + lastc['remaining'] -= 1 + + self._state = self._STATE_WANT_MAP_KEY + + # A new array is used as the map value. + elif special == SPECIAL_START_ARRAY: + lastc = self._collectionstack[-1] + newvalue = [] + + lastc['v'][self._currentmapkey] = newvalue + lastc['remaining'] -= 1 + + self._collectionstack.append({ + 'remaining': value, + 'v': newvalue, + }) + + self._state = self._STATE_WANT_ARRAY_VALUE + + # A new map is used as the map value. + elif special == SPECIAL_START_MAP: + lastc = self._collectionstack[-1] + newvalue = {} + + lastc['v'][self._currentmapkey] = newvalue + lastc['remaining'] -= 1 + + self._collectionstack.append({ + 'remaining': value, + 'v': newvalue, + }) + + self._state = self._STATE_WANT_MAP_KEY + + # A new set is used as the map value. + elif special == SPECIAL_START_SET: + lastc = self._collectionstack[-1] + newvalue = set() + + lastc['v'][self._currentmapkey] = newvalue + lastc['remaining'] -= 1 + + self._collectionstack.append({ + 'remaining': value, + 'v': newvalue, + }) + + self._state = self._STATE_WANT_SET_VALUE + + elif special == SPECIAL_START_INDEFINITE_BYTESTRING: + raise CBORDecodeError('indefinite length bytestrings not ' + 'allowed as map values') + + else: + raise CBORDecodeError('unhandled special item when ' + 'expecting map value: %d' % special) + + self._currentmapkey = None + + # This value is added to the current set. + elif self._state == self._STATE_WANT_SET_VALUE: + if special == SPECIAL_NONE: + lastc = self._collectionstack[-1] + lastc['v'].add(value) + lastc['remaining'] -= 1 + + elif special == SPECIAL_START_INDEFINITE_BYTESTRING: + raise CBORDecodeError('indefinite length bytestrings not ' + 'allowed as set values') + + elif special in (SPECIAL_START_ARRAY, + SPECIAL_START_MAP, + SPECIAL_START_SET): + raise CBORDecodeError('collections not allowed as set ' + 'values') + + # We don't allow non-trivial types to exist as set values. + else: + raise CBORDecodeError('unhandled special item when ' + 'expecting set value: %d' % special) + + # This value represents the first chunk in an indefinite length + # bytestring. + elif self._state == self._STATE_WANT_BYTESTRING_CHUNK_FIRST: + # We received a full chunk. + if special == SPECIAL_NONE: + self._decodedvalues.append(bytestringchunk(value, + first=True)) + + self._state = self._STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT + + # The end of stream marker. This means it is an empty + # indefinite length bytestring. + elif special == SPECIAL_INDEFINITE_BREAK: + # We /could/ convert this to a b''. But we want to preserve + # the nature of the underlying data so consumers expecting + # an indefinite length bytestring get one. + self._decodedvalues.append(bytestringchunk(b'', + first=True, + last=True)) + + # Since indefinite length bytestrings can't be used in + # collections, we must be at the root level. + assert not self._collectionstack + self._state = self._STATE_NONE + + else: + raise CBORDecodeError('unexpected special value when ' + 'expecting bytestring chunk: %d' % + special) + + # This value represents the non-initial chunk in an indefinite + # length bytestring. + elif self._state == self._STATE_WANT_BYTESTRING_CHUNK_SUBSEQUENT: + # We received a full chunk. + if special == SPECIAL_NONE: + self._decodedvalues.append(bytestringchunk(value)) + + # The end of stream marker. + elif special == SPECIAL_INDEFINITE_BREAK: + self._decodedvalues.append(bytestringchunk(b'', last=True)) + + # Since indefinite length bytestrings can't be used in + # collections, we must be at the root level. + assert not self._collectionstack + self._state = self._STATE_NONE + + else: + raise CBORDecodeError('unexpected special value when ' + 'expecting bytestring chunk: %d' % + special) + + else: + raise CBORDecodeError('unhandled decoder state: %d' % + self._state) + + # We could have just added the final value in a collection. End + # all complete collections at the top of the stack. + while True: + # Bail if we're not waiting on a new collection item. + if self._state not in (self._STATE_WANT_ARRAY_VALUE, + self._STATE_WANT_MAP_KEY, + self._STATE_WANT_SET_VALUE): + break + + # Or we are expecting more items for this collection. + lastc = self._collectionstack[-1] + + if lastc['remaining']: + break + + # The collection at the top of the stack is complete. + + # Discard it, as it isn't needed for future items. + self._collectionstack.pop() + + # If this is a nested collection, we don't emit it, since it + # will be emitted by its parent collection. But we do need to + # update state to reflect what the new top-most collection + # on the stack is. + if self._collectionstack: + self._state = { + list: self._STATE_WANT_ARRAY_VALUE, + dict: self._STATE_WANT_MAP_KEY, + set: self._STATE_WANT_SET_VALUE, + }[type(self._collectionstack[-1]['v'])] + + # If this is the root collection, emit it. + else: + self._decodedvalues.append(lastc['v']) + self._state = self._STATE_NONE + + return ( + bool(self._decodedvalues), + offset - initialoffset, + 0, + ) + + def getavailable(self): + """Returns an iterator over fully decoded values. + + Once values are retrieved, they won't be available on the next call. + """ + + l = list(self._decodedvalues) + self._decodedvalues = [] + return l + +def decodeall(b): + """Decode all CBOR items present in an iterable of bytes. + + In addition to regular decode errors, raises CBORDecodeError if the + entirety of the passed buffer does not fully decode to complete CBOR + values. This includes failure to decode any value, incomplete collection + types, incomplete indefinite length items, and extra data at the end of + the buffer. + """ + if not b: + return [] + + decoder = sansiodecoder() + + havevalues, readcount, wantbytes = decoder.decode(b) + + if readcount != len(b): + raise CBORDecodeError('input data not fully consumed') + + if decoder.inprogress: + raise CBORDecodeError('input data not complete') + + return decoder.getavailable()