Mercurial > public > mercurial-scm > hg
diff hgext/convert/darcs.py @ 12717:89df79b3c011 stable
convert/darcs: support changelogs with bytes 0x7F-0xFF (issue2411)
This is a followup to 4481f8a93c7a, which only fixed the conversion of
patches with UTF-8 metadata.
This patch allows a changelog to have any bytes with values
0x7F-0xFF. It parses the XML changelog as Latin-1 and uses
converter_source.recode() to decode the data as UTF-8/Latin-1.
Caveats:
- Since the convert extension doesn't provide any way to specify the
source encoding, users are still limited to UTF-8 and Latin-1.
- etree will still complain if the changelog has bytes with values
0x00-0x1F other than tab (0x09), line feed (0x0A) and carriage return
(0x0D). XML does not allow any other control characters.
author | Brodie Rao <brodie@bitheap.org> |
---|---|
date | Fri, 01 Oct 2010 10:15:04 -0500 |
parents | 84ceedcfeb6a |
children | 4e5a36eeefd1 |
line wrap: on
line diff
--- a/hgext/convert/darcs.py Tue Oct 05 11:34:13 2010 +0200
+++ b/hgext/convert/darcs.py Fri Oct 01 10:15:04 2010 -0500
@@ -7,22 +7,22 @@
 
 from common import NoRepo, checktool, commandline, commit, converter_source
 from mercurial.i18n import _
-from mercurial import util
+from mercurial import encoding, util
 import os, shutil, tempfile, re
 
 # The naming drift of ElementTree is fun!
 
 try:
-    from xml.etree.cElementTree import ElementTree
+    from xml.etree.cElementTree import ElementTree, XMLParser
 except ImportError:
     try:
-        from xml.etree.ElementTree import ElementTree
+        from xml.etree.ElementTree import ElementTree, XMLParser
     except ImportError:
         try:
-            from elementtree.cElementTree import ElementTree
+            from elementtree.cElementTree import ElementTree, XMLParser
         except ImportError:
             try:
-                from elementtree.ElementTree import ElementTree
+                from elementtree.ElementTree import ElementTree, XMLParser
             except ImportError:
                 ElementTree = None
 
@@ -88,12 +88,24 @@
         self.ui.debug('cleaning up %s\n' % self.tmppath)
         shutil.rmtree(self.tmppath, ignore_errors=True)
 
+    def recode(self, s, encoding=None):
+        if isinstance(s, unicode):
+            # XMLParser returns unicode objects for anything it can't
+            # encode into ASCII. We convert them back to str to get
+            # recode's normal conversion behavior.
+            s = s.encode('latin-1')
+        return super(darcs_source, self).recode(s, encoding)
+
     def xml(self, cmd, **kwargs):
         # NOTE: darcs is currently encoding agnostic and will print
         # patch metadata byte-for-byte, even in the XML changelog.
         etree = ElementTree()
+        # While we are decoding the XML as latin-1 to be as liberal as
+        # possible, etree will still raise an exception if any
+        # non-printable characters are in the XML changelog.
+        parser = XMLParser(encoding='latin-1')
         fp = self._run(cmd, **kwargs)
-        etree.parse(fp)
+        etree.parse(fp, parser=parser)
         self.checkexit(fp.close())
         return etree.getroot()
 