Mercurial > public > mercurial-scm > hg
comparison mercurial/posix.py @ 26876:b8381832ce2b
posix: use getutf8char to handle OS X filename percent-escaping
This replaces an open-coded utf-8 parser that was ignoring subtle issues
like overlong encodings.
author | Matt Mackall <mpm@selenic.com> |
---|---|
date | Thu, 05 Nov 2015 17:09:00 -0600 |
parents | 99b6afff09ae |
children | 8b2fbe3f59b1 |
comparison legend: equal, deleted, inserted, replaced
26875:cf47bdb2183c | 26876:b8381832ce2b |
---|---|
253 try: | 253 try: |
254 u = path.decode('utf-8') | 254 u = path.decode('utf-8') |
255 except UnicodeDecodeError: | 255 except UnicodeDecodeError: |
256 # OS X percent-encodes any bytes that aren't valid utf-8 | 256 # OS X percent-encodes any bytes that aren't valid utf-8 |
257 s = '' | 257 s = '' |
258 g = '' | 258 pos = 0 |
259 l = 0 | 259 l = len(s) |
260 for c in path: | 260 while pos < l: |
261 o = ord(c) | 261 try: |
262 if l and o < 128 or o >= 192: | 262 c = encoding.getutf8char(path, pos) |
263 # we want a continuation byte, but didn't get one | 263 pos += len(c) |
264 s += ''.join(["%%%02X" % ord(x) for x in g]) | 264 except ValueError: |
265 g = '' | 265 c = '%%%%02X' % path[pos] |
266 l = 0 | 266 pos += 1 |
267 if l == 0 and o < 128: | 267 s += c |
268 # ascii | 268 |
269 s += c | |
270 elif l == 0 and 194 <= o < 245: | |
271 # valid leading bytes | |
272 if o < 224: | |
273 l = 1 | |
274 elif o < 240: | |
275 l = 2 | |
276 else: | |
277 l = 3 | |
278 g = c | |
279 elif l > 0 and 128 <= o < 192: | |
280 # valid continuations | |
281 g += c | |
282 l -= 1 | |
283 if not l: | |
284 s += g | |
285 g = '' | |
286 else: | |
287 # invalid | |
288 s += "%%%02X" % o | |
289 | |
290 # any remaining partial characters | |
291 s += ''.join(["%%%02X" % ord(x) for x in g]) | |
292 u = s.decode('utf-8') | 269 u = s.decode('utf-8') |
293 | 270 |
294 # Decompose then lowercase (HFS+ technote specifies lower) | 271 # Decompose then lowercase (HFS+ technote specifies lower) |
295 enc = unicodedata.normalize('NFD', u).lower().encode('utf-8') | 272 enc = unicodedata.normalize('NFD', u).lower().encode('utf-8') |
296 # drop HFS+ ignored characters | 273 # drop HFS+ ignored characters |