changeset 34218 | aa877860d4d7 |
parent 34216 | 1c601df9894c |
child 36549 | 3696efeab66f |
34217:5307cc57f271 | 34218:aa877860d4d7 |
---|---|
446 return _jsonescapeu8fast(u8chars, paranoid) |
446 return _jsonescapeu8fast(u8chars, paranoid) |
447 except ValueError: |
447 except ValueError: |
448 pass |
448 pass |
449 return charencodepure.jsonescapeu8fallback(u8chars, paranoid) |
449 return charencodepure.jsonescapeu8fallback(u8chars, paranoid) |
450 |
450 |
451 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8 |
|
452 # bytes are mapped to that range. |
|
453 if pycompat.ispy3: |
|
454 _utf8strict = r'surrogatepass' |
|
455 else: |
|
456 _utf8strict = r'strict' |
|
457 |
|
451 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] |
458 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] |
452 |
459 |
453 def getutf8char(s, pos): |
460 def getutf8char(s, pos): |
454 '''get the next full utf-8 character in the given string, starting at pos |
461 '''get the next full utf-8 character in the given string, starting at pos |
455 |
462 |
462 if not l: # ascii |
469 if not l: # ascii |
463 return s[pos:pos + 1] |
470 return s[pos:pos + 1] |
464 |
471 |
465 c = s[pos:pos + l] |
472 c = s[pos:pos + l] |
466 # validate with attempted decode |
473 # validate with attempted decode |
467 c.decode("utf-8") |
474 c.decode("utf-8", _utf8strict) |
468 return c |
475 return c |
469 |
476 |
470 def toutf8b(s): |
477 def toutf8b(s): |
471 '''convert a local, possibly-binary string into UTF-8b |
478 '''convert a local, possibly-binary string into UTF-8b |
472 |
479 |
501 return s |
508 return s |
502 if "\xed" not in s: |
509 if "\xed" not in s: |
503 if isinstance(s, localstr): |
510 if isinstance(s, localstr): |
504 return s._utf8 |
511 return s._utf8 |
505 try: |
512 try: |
506 s.decode('utf-8') |
513 s.decode('utf-8', _utf8strict) |
507 return s |
514 return s |
508 except UnicodeDecodeError: |
515 except UnicodeDecodeError: |
509 pass |
516 pass |
510 |
517 |
511 s = pycompat.bytestr(s) |
518 s = pycompat.bytestr(s) |
515 while pos < l: |
522 while pos < l: |
516 try: |
523 try: |
517 c = getutf8char(s, pos) |
524 c = getutf8char(s, pos) |
518 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": |
525 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": |
519 # have to re-escape existing U+DCxx characters |
526 # have to re-escape existing U+DCxx characters |
520 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') |
527 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict) |
521 pos += 1 |
528 pos += 1 |
522 else: |
529 else: |
523 pos += len(c) |
530 pos += len(c) |
524 except UnicodeDecodeError: |
531 except UnicodeDecodeError: |
525 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') |
532 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict) |
526 pos += 1 |
533 pos += 1 |
527 r += c |
534 r += c |
528 return r |
535 return r |
529 |
536 |
530 def fromutf8b(s): |
537 def fromutf8b(s): |
568 while pos < l: |
575 while pos < l: |
569 c = getutf8char(s, pos) |
576 c = getutf8char(s, pos) |
570 pos += len(c) |
577 pos += len(c) |
571 # unescape U+DCxx characters |
578 # unescape U+DCxx characters |
572 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": |
579 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": |
573 c = chr(ord(c.decode("utf-8")) & 0xff) |
580 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff) |
574 r += c |
581 r += c |
575 return r |
582 return r |
576 |
583 |
577 if pycompat.ispy3: |
584 if pycompat.ispy3: |
578 class strio(io.TextIOWrapper): |
585 class strio(io.TextIOWrapper): |