Created
April 1, 2019 16:18
-
-
Save gwenzek/800e29521a0fa926aec0cd9db60ce6b6 to your computer and use it in GitHub Desktop.
Revisions
-
gwenzek created this gist
Apr 1, 2019 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,284 @@ """Implementation of a flexible JSON decoder import json from json_decoder import FlexibleJSONDecoder j = json.loads(raw, cls=FlexibleJSONDecoder) """ import json import re from json import scanner, JSONDecodeError try: from _json import scanstring as c_scanstring except ImportError: c_scanstring = None FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL NaN = float('nan') PosInf = float('inf') NegInf = float('-inf') _CONSTANTS = { '-Infinity': NegInf, 'Infinity': PosInf, 'NaN': NaN, } STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS) BACKSLASH = { '"': '"', '\\': '\\', '/': '/', 'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', } def _decode_uXXXX(s, pos): esc = s[pos + 1:pos + 5] if len(esc) == 4 and esc[1] not in 'xX': try: return int(esc, 16) except ValueError: pass msg = "Invalid \\uXXXX escape" raise JSONDecodeError(msg, s, pos) def py_scanstring(s, end, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match): """Scan the string s for a JSON string. End is the index of the character in s after the quote that started the JSON string. Unescapes all valid JSON string escape sequences and raises ValueError on attempt to decode an invalid string. If strict is False then literal control characters are allowed in the string. Returns a tuple of the decoded string and the index of the character in s after the end quote.""" chunks = [] _append = chunks.append begin = end - 1 while 1: chunk = _m(s, end) if chunk is None: raise JSONDecodeError("Unterminated string starting at", s, begin) end = chunk.end() content, terminator = chunk.groups() # Content is contains zero or more unescaped string characters if content: _append(content) # Terminator is the end of string, a literal control character, # or a backslash denoting that an escape sequence follows if terminator == '"': break elif terminator != '\\': if strict: #msg = "Invalid control character %r at" % (terminator,) msg = "Invalid control character {0!r} at".format(terminator) raise JSONDecodeError(msg, s, end) else: _append(terminator) continue try: esc = s[end] except IndexError: raise JSONDecodeError("Unterminated string starting at", s, begin) from None # If not a unicode escape sequence, must be in the lookup table if esc != 'u': try: char = _b[esc] except KeyError: msg = "Invalid \\escape: {0!r}".format(esc) raise JSONDecodeError(msg, s, end) end += 1 else: uni = _decode_uXXXX(s, end) end += 5 if 0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u': uni2 = _decode_uXXXX(s, end + 1) if 0xdc00 <= uni2 <= 0xdfff: uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) end += 6 char = chr(uni) _append(char) return ''.join(chunks), end # Use speedup if available scanstring = c_scanstring or py_scanstring WHITESPACE = re.compile(r'([ \t\n\r]|//.*?\n)*', FLAGS) WHITESPACE_STR = ' \t\n\r/' def FlexibleJSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook, memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR): s, end = s_and_end pairs = [] pairs_append = pairs.append # Backwards compatibility if memo is None: memo = {} memo_get = memo.setdefault # Use a slice to prevent IndexError from being raised, the following # check will raise a more specific ValueError if the string is empty nextchar = s[end:end + 1] # Normally we expect nextchar == '"' if nextchar != '"': if nextchar in _ws: end = _w(s, end).end() nextchar = s[end:end + 1] # Trivial empty object if nextchar == '}': if object_pairs_hook is not None: result = object_pairs_hook(pairs) return result, end + 1 pairs = {} if object_hook is not None: pairs = object_hook(pairs) return pairs, end + 1 elif nextchar != '"': raise JSONDecodeError( "Expecting property name enclosed in double quotes", s, end) end += 1 while True: key, end = scanstring(s, end, strict) key = memo_get(key, key) # To skip some function call overhead we optimize the fast paths where # the JSON key separator is ": " or just ":". if s[end:end + 1] != ':': end = _w(s, end).end() if s[end:end + 1] != ':': raise JSONDecodeError("Expecting ':' delimiter", s, end) end += 1 try: if s[end] in _ws: end = _w(s, end + 1).end() except IndexError: pass try: value, end = scan_once(s, end) except StopIteration as err: raise JSONDecodeError("Expecting value", s, err.value) from None pairs_append((key, value)) try: nextchar = s[end] if nextchar in _ws: end = _w(s, end + 1).end() nextchar = s[end] except IndexError: nextchar = '' end += 1 if nextchar == '}': break elif nextchar != ',': raise JSONDecodeError("Expecting ',' delimiter", s, end - 1) end = _w(s, end).end() nextchar = s[end:end + 1] end += 1 if nextchar == '}': break elif nextchar != '"': raise JSONDecodeError( "Expecting property name enclosed in double quotes", s, end - 1) if object_pairs_hook is not None: result = object_pairs_hook(pairs) return result, end pairs = dict(pairs) if object_hook is not None: pairs = object_hook(pairs) return pairs, end def FlexibleJSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR): s, end = s_and_end values = [] nextchar = s[end:end + 1] if nextchar in _ws: end = _w(s, end + 1).end() nextchar = s[end:end + 1] # Look-ahead for trivial empty array if nextchar == ']': return values, end + 1 _append = values.append while True: try: value, end = scan_once(s, end) except StopIteration as err: if s[err.value] == ']': return values, err.value + 1 raise JSONDecodeError("Expecting value", s, err.value) from None _append(value) nextchar = s[end:end + 1] if nextchar in _ws: end = _w(s, end + 1).end() nextchar = s[end:end + 1] end += 1 if nextchar == ']': break elif nextchar != ',': raise JSONDecodeError("Expecting ',' delimiter", s, end - 1) try: if s[end] in _ws: end = _w(s, end + 1).end() except IndexError: pass return values, end class FlexibleJSONDecoder(json.JSONDecoder): """JSON decoder. It allows for trailing commas.""" def __init__(self): super().__init__() self.parse_object = FlexibleJSONObject self.parse_array = FlexibleJSONArray self.scan_once = scanner.py_make_scanner(self) def decode(self, s, _w=WHITESPACE.match): """Return the Python representation of ``s`` (a ``str`` instance containing a JSON document). """ obj, end = self.raw_decode(s, idx=_w(s, 0).end()) end = _w(s, end).end() if end != len(s): raise JSONDecodeError("Extra data", s, end) return obj def raw_decode(self, s, idx=0): """Decode a JSON document from ``s`` (a ``str`` beginning with a JSON document) and return a 2-tuple of the Python representation and the index in ``s`` where the document ended. This can be used to decode a JSON document from a string that may have extraneous data at the end. """ try: obj, end = self.scan_once(s, idx) except StopIteration as err: raise JSONDecodeError("Expecting value", s, err.value) from None return obj, end if __name__ == '__main__': def test(raw, expected): decoded = json.loads(raw, cls=FlexibleJSONDecoder) assert decoded == expected test('{"foo": "bar"}', {"foo":"bar"}) test('{"foo": "bar", }', {"foo":"bar"}) test('["foo", "bar"]', ["foo", "bar"]) test('["foo", "bar", ]', ["foo", "bar"]) test('["foo", {"bar": "baz"}]', ["foo", {"bar": "baz"}]) test('["foo", {"bar": "baz",},]', ["foo", {"bar": "baz"}]) test('''[ "foo", // Note the comment and the trailing comma "bar", ]''', ["foo", "bar"])