diff mercurial/patch.py @ 35392:82c3762349ac

patch: do not break up multibyte character when highlighting word

This changes {\W} to {\W - any 8bit characters} so that multibyte sequences are taken as words. Since we don't know the encoding of user content, this is the most sensible definition of a non-word.
author Yuya Nishihara <yuya@tcha.org>
date Mon, 11 Dec 2017 22:38:31 +0900
parents dce761558329
children 72b91f905065
line wrap: on
line diff
--- a/mercurial/patch.py	Sun Dec 10 00:16:11 2017 -0500
+++ b/mercurial/patch.py	Mon Dec 11 22:38:31 2017 +0900
@@ -46,6 +46,7 @@
 
 gitre = re.compile(br'diff --git a/(.*) b/(.*)')
 tabsplitter = re.compile(br'(\t+|[^\t]+)')
+_nonwordre = re.compile(br'([^a-zA-Z0-9_\x80-\xff])')
 
 PatchError = error.PatchError
 
@@ -2578,7 +2579,7 @@
         raise error.ProgrammingError("Case not expected, operation = %s" %
                                      operation)
 
-    s = difflib.ndiff(re.split(br'(\W)', s2), re.split(br'(\W)', s1))
+    s = difflib.ndiff(_nonwordre.split(s2), _nonwordre.split(s1))
     for part in s:
         if part[0] in operation_skip or len(part) == 2:
             continue