Mercurial > public > mercurial-scm > hg-stable
annotate mercurial/similar.py @ 30805:0ae287eb6a4f
similar: move score function to module level
Future patches will use this to report the similarity of a rename / copy
in the patch output.
author | Sean Farley <sean@farley.io> |
---|---|
date | Sat, 07 Jan 2017 20:47:57 -0800 |
parents | ada160a8cfd8 |
children | 8614546154cb |
rev | line source |
---|---|
11059
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
1 # similar.py - mechanisms for finding similar files |
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
2 # |
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com> |
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
4 # |
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
5 # This software may be used and distributed according to the terms of the |
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
6 # GNU General Public License version 2 or any later version. |
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
7 |
27359
a56c47ed3885
similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
16683
diff
changeset
|
8 from __future__ import absolute_import |
a56c47ed3885
similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
16683
diff
changeset
|
9 |
29341
0d83ad967bf8
cleanup: replace uses of util.(md5|sha1|sha256|sha512) with hashlib.\1
Augie Fackler <raf@durin42.com>
parents:
29337
diff
changeset
|
10 import hashlib |
0d83ad967bf8
cleanup: replace uses of util.(md5|sha1|sha256|sha512) with hashlib.\1
Augie Fackler <raf@durin42.com>
parents:
29337
diff
changeset
|
11 |
27359
a56c47ed3885
similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
16683
diff
changeset
|
12 from .i18n import _ |
a56c47ed3885
similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
16683
diff
changeset
|
13 from . import ( |
a56c47ed3885
similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
16683
diff
changeset
|
14 bdiff, |
a56c47ed3885
similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
16683
diff
changeset
|
15 mdiff, |
a56c47ed3885
similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
16683
diff
changeset
|
16 util, |
a56c47ed3885
similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
16683
diff
changeset
|
17 ) |
11059
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
18 |
11060
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
19 def _findexactmatches(repo, added, removed): |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
20 '''find renamed files that have no changes |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
21 |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
22 Takes a list of new filectxs and a list of removed filectxs, and yields |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
23 (before, after) tuples of exact matches. |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
24 ''' |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
25 numfiles = len(added) + len(removed) |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
26 |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
27 # Get hashes of removed files. |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
28 hashes = {} |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
29 for i, fctx in enumerate(removed): |
28468
0d6b3630b9a3
similar: specify unit for ui.progress when operating on files
Anton Shestakov <av6@dwimlabs.net>
parents:
27359
diff
changeset
|
30 repo.ui.progress(_('searching for exact renames'), i, total=numfiles, |
0d6b3630b9a3
similar: specify unit for ui.progress when operating on files
Anton Shestakov <av6@dwimlabs.net>
parents:
27359
diff
changeset
|
31 unit=_('files')) |
29341
0d83ad967bf8
cleanup: replace uses of util.(md5|sha1|sha256|sha512) with hashlib.\1
Augie Fackler <raf@durin42.com>
parents:
29337
diff
changeset
|
32 h = hashlib.sha1(fctx.data()).digest() |
11060
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
33 hashes[h] = fctx |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
34 |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
35 # For each added file, see if it corresponds to a removed file. |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
36 for i, fctx in enumerate(added): |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
37 repo.ui.progress(_('searching for exact renames'), i + len(removed), |
28468
0d6b3630b9a3
similar: specify unit for ui.progress when operating on files
Anton Shestakov <av6@dwimlabs.net>
parents:
27359
diff
changeset
|
38 total=numfiles, unit=_('files')) |
29341
0d83ad967bf8
cleanup: replace uses of util.(md5|sha1|sha256|sha512) with hashlib.\1
Augie Fackler <raf@durin42.com>
parents:
29337
diff
changeset
|
39 h = hashlib.sha1(fctx.data()).digest() |
11060
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
40 if h in hashes: |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
41 yield (hashes[h], fctx) |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
42 |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
43 # Done |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
44 repo.ui.progress(_('searching for exact renames'), None) |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
45 |
30805
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
46 @util.cachefunc |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
47 def _ctxdata(fctx): |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
48 # lazily load text |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
49 orig = fctx.data() |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
50 return orig, mdiff.splitnewlines(orig) |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
51 |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
52 @util.cachefunc |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
53 def score(fctx1, fctx2): |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
54 text = fctx1.data() |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
55 orig, lines = _ctxdata(fctx2) |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
56 # bdiff.blocks() returns blocks of matching lines |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
57 # count the number of bytes in each |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
58 equal = 0 |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
59 matches = bdiff.blocks(text, orig) |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
60 for x1, x2, y1, y2 in matches: |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
61 for line in lines[y1:y2]: |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
62 equal += len(line) |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
63 |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
64 lengths = len(text) + len(orig) |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
65 return equal * 2.0 / lengths |
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
66 |
11060
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
67 def _findsimilarmatches(repo, added, removed, threshold): |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
68 '''find potentially renamed files based on similar file content |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
69 |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
70 Takes a list of new filectxs and a list of removed filectxs, and yields |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
71 (before, after, score) tuples of partial matches. |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
72 ''' |
11059
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
73 copies = {} |
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
74 for i, r in enumerate(removed): |
16683 | 75 repo.ui.progress(_('searching for similar files'), i, |
28468
0d6b3630b9a3
similar: specify unit for ui.progress when operating on files
Anton Shestakov <av6@dwimlabs.net>
parents:
27359
diff
changeset
|
76 total=len(removed), unit=_('files')) |
11059
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
77 |
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
78 for a in added: |
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
79 bestscore = copies.get(a, (None, threshold))[1] |
30805
0ae287eb6a4f
similar: move score function to module level
Sean Farley <sean@farley.io>
parents:
30791
diff
changeset
|
80 myscore = score(a, r) |
11059
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
81 if myscore >= bestscore: |
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
82 copies[a] = (r, myscore) |
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
83 repo.ui.progress(_('searching'), None) |
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
84 |
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
85 for dest, v in copies.iteritems(): |
30791
ada160a8cfd8
similar: rename local variable to not collide with previous
Sean Farley <sean@farley.io>
parents:
29341
diff
changeset
|
86 source, bscore = v |
ada160a8cfd8
similar: rename local variable to not collide with previous
Sean Farley <sean@farley.io>
parents:
29341
diff
changeset
|
87 yield source, dest, bscore |
11059
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
88 |
11060
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
89 def findrenames(repo, added, removed, threshold): |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
90 '''find renamed files -- yields (before, after, score) tuples''' |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
91 parentctx = repo['.'] |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
92 workingctx = repo[None] |
11059
ef4aa90b1e58
Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff
changeset
|
93 |
11060
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
94 # Zero length files will be frequently unrelated to each other, and |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
95 # tracking the deletion/addition of such a file will probably cause more |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
96 # harm than good. We strip them out here to avoid matching them later on. |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
97 addedfiles = set([workingctx[fp] for fp in added |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
98 if workingctx[fp].size() > 0]) |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
99 removedfiles = set([parentctx[fp] for fp in removed |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
100 if fp in parentctx and parentctx[fp].size() > 0]) |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
101 |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
102 # Find exact matches. |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
103 for (a, b) in _findexactmatches(repo, |
11085
0c8646292ca4
fix coding style
Benoit Boissinot <benoit.boissinot@ens-lyon.org>
parents:
11060
diff
changeset
|
104 sorted(addedfiles), sorted(removedfiles)): |
11060
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
105 addedfiles.remove(b) |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
106 yield (a.path(), b.path(), 1.0) |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
107 |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
108 # If the user requested similar files to be matched, search for them also. |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
109 if threshold < 1.0: |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
110 for (a, b, score) in _findsimilarmatches(repo, |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
111 sorted(addedfiles), sorted(removedfiles), threshold): |
e6df01776e08
findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
11059
diff
changeset
|
112 yield (a.path(), b.path(), score) |