comparison mercurial/posix.py @ 26876:b8381832ce2b

posix: use getutf8char to handle OS X filename percent-escaping. This replaces an open-coded UTF-8 parser that ignored subtle issues such as overlong encodings.
author Matt Mackall <mpm@selenic.com>
date Thu, 05 Nov 2015 17:09:00 -0600
parents 99b6afff09ae
children 8b2fbe3f59b1
comparison
equal deleted inserted replaced
26875:cf47bdb2183c 26876:b8381832ce2b
253 try: 253 try:
254 u = path.decode('utf-8') 254 u = path.decode('utf-8')
255 except UnicodeDecodeError: 255 except UnicodeDecodeError:
256 # OS X percent-encodes any bytes that aren't valid utf-8 256 # OS X percent-encodes any bytes that aren't valid utf-8
257 s = '' 257 s = ''
258 g = '' 258 pos = 0
259 l = 0 259 l = len(s)
260 for c in path: 260 while pos < l:
261 o = ord(c) 261 try:
262 if l and o < 128 or o >= 192: 262 c = encoding.getutf8char(path, pos)
263 # we want a continuation byte, but didn't get one 263 pos += len(c)
264 s += ''.join(["%%%02X" % ord(x) for x in g]) 264 except ValueError:
265 g = '' 265 c = '%%%%02X' % path[pos]
266 l = 0 266 pos += 1
267 if l == 0 and o < 128: 267 s += c
268 # ascii 268
269 s += c
270 elif l == 0 and 194 <= o < 245:
271 # valid leading bytes
272 if o < 224:
273 l = 1
274 elif o < 240:
275 l = 2
276 else:
277 l = 3
278 g = c
279 elif l > 0 and 128 <= o < 192:
280 # valid continuations
281 g += c
282 l -= 1
283 if not l:
284 s += g
285 g = ''
286 else:
287 # invalid
288 s += "%%%02X" % o
289
290 # any remaining partial characters
291 s += ''.join(["%%%02X" % ord(x) for x in g])
292 u = s.decode('utf-8') 269 u = s.decode('utf-8')
293 270
294 # Decompose then lowercase (HFS+ technote specifies lower) 271 # Decompose then lowercase (HFS+ technote specifies lower)
295 enc = unicodedata.normalize('NFD', u).lower().encode('utf-8') 272 enc = unicodedata.normalize('NFD', u).lower().encode('utf-8')
296 # drop HFS+ ignored characters 273 # drop HFS+ ignored characters