Mercurial > public > mercurial-scm > hg-stable
annotate mercurial/base85.c @ 26117:4dc5b51f38fe
revlog: change generaldelta delta parent heuristic
The old generaldelta heuristic was "if p1 (or p2) was closer than the last full text,
use it, otherwise use prev". This was problematic when a repo contained multiple
branches that were very different. If commits to branch A were pushed, and the
last full text was branch B, it would generate a fulltext. Then if branch B was
pushed, it would generate another fulltext. The problem is that the last
fulltext (and delta'ing against `prev` in general) has no correlation with the
contents of the incoming revision, and therefore will always have degenerate
cases.
According to the blame, that algorithm was chosen to minimize the chain length.
Since there is already code that protects against that (the delta-vs-fulltext
code), and since it has been improved since the original generaldelta algorithm
went in (2011), I believe the chain length criteria will still be preserved.
The new algorithm always diffs against p1 (or p2 if it's closer), unless the
resulting delta will fail the delta-vs-fulltext check, in which case we delta
against prev.
Some before and after stats on manifest.d size.
internal large repo
old heuristic - 2.0 GB
new heuristic - 1.2 GB
mozilla-central
old heuristic - 242 MB
new heuristic - 261 MB
The regression in mozilla central is due to the new heuristic choosing p2r as
the delta when it's closer to the tip. Switching the algorithm to always prefer
p1r brings the size back down (242 MB). This is a result of the way in which
mozilla does merges and pushes, and the result could easily swing the other
direction in other repos (depending on if they merge X into Y or Y into X), but
will never be as degenerate as before.
A future patch will address the regression by introducing an optional, even more
aggressive delta heuristic which will knock the mozilla manifest size down
dramatically.
author | Durham Goode <durham@fb.com> |
---|---|
date | Sun, 30 Aug 2015 13:58:11 -0700 |
parents | c1aefe57cf4e |
children | 4613a89bea42 |
rev | line source |
---|---|
3283 | 1 /* |
2 base85 codec | |
3 | |
4 Copyright 2006 Brendan Cully <brendan@kublai.com> | |
5 | |
6 This software may be used and distributed according to the terms of | |
7 the GNU General Public License, incorporated herein by reference. | |
8 | |
9 Largely based on git's implementation | |
10 */ | |
11 | |
16837
1b9d54c00d50
base85: use Py_ssize_t for string lengths
Adrian Buehlmann <adrian@cadifra.com>
parents:
16522
diff
changeset
|
12 #define PY_SSIZE_T_CLEAN |
3283 | 13 #include <Python.h> |
14 | |
11362
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
15 #include "util.h" |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
16 |
/* The 85 characters of the alphabet, in digit order (digit 0 is '0'). */
static const char b85chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~";
/*
 * Reverse lookup table indexed by byte value: holds digit + 1 for bytes
 * that belong to the alphabet and 0 for every other byte.
 */
static char b85dec[256];

/*
 * Fill b85dec from b85chars.  Must run before b85dec is consulted.
 */
static void
b85prep(void)
{
	size_t i;

	memset(b85dec, 0, sizeof(b85dec));
	/*
	 * sizeof(b85chars) counts the string's terminating NUL; stop one
	 * short so that byte 0 is not registered as an 86th "digit".
	 */
	for (i = 0; i < sizeof(b85chars) - 1; i++)
		b85dec[(unsigned char)b85chars[i]] = (char)(i + 1);
}
30 | |
31 static PyObject * | |
32 b85encode(PyObject *self, PyObject *args) | |
33 { | |
34 const unsigned char *text; | |
35 PyObject *out; | |
36 char *dst; | |
16837
1b9d54c00d50
base85: use Py_ssize_t for string lengths
Adrian Buehlmann <adrian@cadifra.com>
parents:
16522
diff
changeset
|
37 Py_ssize_t len, olen, i; |
3283 | 38 unsigned int acc, val, ch; |
7190
aecea6934fdd
Some additional space/tab cleanups
Thomas Arendsen Hein <thomas@intevation.de>
parents:
3369
diff
changeset
|
39 int pad = 0; |
3283 | 40 |
3369
4bad632913d8
python2.5 PyArg_ParseTuple fix
Alexis S. L. Carvalho <alexis@cecm.usp.br>
parents:
3332
diff
changeset
|
41 if (!PyArg_ParseTuple(args, "s#|i", &text, &len, &pad)) |
3283 | 42 return NULL; |
43 | |
7190
aecea6934fdd
Some additional space/tab cleanups
Thomas Arendsen Hein <thomas@intevation.de>
parents:
3369
diff
changeset
|
44 if (pad) |
aecea6934fdd
Some additional space/tab cleanups
Thomas Arendsen Hein <thomas@intevation.de>
parents:
3369
diff
changeset
|
45 olen = ((len + 3) / 4 * 5) - 3; |
aecea6934fdd
Some additional space/tab cleanups
Thomas Arendsen Hein <thomas@intevation.de>
parents:
3369
diff
changeset
|
46 else { |
aecea6934fdd
Some additional space/tab cleanups
Thomas Arendsen Hein <thomas@intevation.de>
parents:
3369
diff
changeset
|
47 olen = len % 4; |
aecea6934fdd
Some additional space/tab cleanups
Thomas Arendsen Hein <thomas@intevation.de>
parents:
3369
diff
changeset
|
48 if (olen) |
aecea6934fdd
Some additional space/tab cleanups
Thomas Arendsen Hein <thomas@intevation.de>
parents:
3369
diff
changeset
|
49 olen++; |
aecea6934fdd
Some additional space/tab cleanups
Thomas Arendsen Hein <thomas@intevation.de>
parents:
3369
diff
changeset
|
50 olen += len / 4 * 5; |
aecea6934fdd
Some additional space/tab cleanups
Thomas Arendsen Hein <thomas@intevation.de>
parents:
3369
diff
changeset
|
51 } |
11362
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
52 if (!(out = PyBytes_FromStringAndSize(NULL, olen + 3))) |
3283 | 53 return NULL; |
54 | |
11362
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
55 dst = PyBytes_AsString(out); |
3283 | 56 |
3288
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
57 while (len) { |
3283 | 58 acc = 0; |
59 for (i = 24; i >= 0; i -= 8) { | |
60 ch = *text++; | |
61 acc |= ch << i; | |
62 if (--len == 0) | |
63 break; | |
64 } | |
65 for (i = 4; i >= 0; i--) { | |
66 val = acc % 85; | |
67 acc /= 85; | |
68 dst[i] = b85chars[val]; | |
69 } | |
70 dst += 5; | |
71 } | |
72 | |
7190
aecea6934fdd
Some additional space/tab cleanups
Thomas Arendsen Hein <thomas@intevation.de>
parents:
3369
diff
changeset
|
73 if (!pad) |
11362
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
74 _PyBytes_Resize(&out, olen); |
3288
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
75 |
3283 | 76 return out; |
77 } | |
78 | |
79 static PyObject * | |
80 b85decode(PyObject *self, PyObject *args) | |
81 { | |
82 PyObject *out; | |
83 const char *text; | |
84 char *dst; | |
16837
1b9d54c00d50
base85: use Py_ssize_t for string lengths
Adrian Buehlmann <adrian@cadifra.com>
parents:
16522
diff
changeset
|
85 Py_ssize_t len, i, j, olen, cap; |
1b9d54c00d50
base85: use Py_ssize_t for string lengths
Adrian Buehlmann <adrian@cadifra.com>
parents:
16522
diff
changeset
|
86 int c; |
3283 | 87 unsigned int acc; |
88 | |
3369
4bad632913d8
python2.5 PyArg_ParseTuple fix
Alexis S. L. Carvalho <alexis@cecm.usp.br>
parents:
3332
diff
changeset
|
89 if (!PyArg_ParseTuple(args, "s#", &text, &len)) |
3283 | 90 return NULL; |
91 | |
3288
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
92 olen = len / 5 * 4; |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
93 i = len % 5; |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
94 if (i) |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
95 olen += i - 1; |
11362
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
96 if (!(out = PyBytes_FromStringAndSize(NULL, olen))) |
3283 | 97 return NULL; |
98 | |
11362
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
99 dst = PyBytes_AsString(out); |
3283 | 100 |
3288
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
101 i = 0; |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
102 while (i < len) |
3283 | 103 { |
104 acc = 0; | |
3288
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
105 cap = len - i - 1; |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
106 if (cap > 4) |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
107 cap = 4; |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
108 for (j = 0; j < cap; i++, j++) |
3283 | 109 { |
110 c = b85dec[(int)*text++] - 1; | |
111 if (c < 0) | |
10282
08a0f04b56bd
many, many trivial check-code fixups
Matt Mackall <mpm@selenic.com>
parents:
7190
diff
changeset
|
112 return PyErr_Format( |
08a0f04b56bd
many, many trivial check-code fixups
Matt Mackall <mpm@selenic.com>
parents:
7190
diff
changeset
|
113 PyExc_ValueError, |
16848
19a915d43a68
base85: cast Py_ssize_t values to int (issue3481)
Adrian Buehlmann <adrian@cadifra.com>
parents:
16837
diff
changeset
|
114 "bad base85 character at position %d", |
19a915d43a68
base85: cast Py_ssize_t values to int (issue3481)
Adrian Buehlmann <adrian@cadifra.com>
parents:
16837
diff
changeset
|
115 (int)i); |
3283 | 116 acc = acc * 85 + c; |
117 } | |
3288
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
118 if (i++ < len) |
3283 | 119 { |
120 c = b85dec[(int)*text++] - 1; | |
121 if (c < 0) | |
10282
08a0f04b56bd
many, many trivial check-code fixups
Matt Mackall <mpm@selenic.com>
parents:
7190
diff
changeset
|
122 return PyErr_Format( |
08a0f04b56bd
many, many trivial check-code fixups
Matt Mackall <mpm@selenic.com>
parents:
7190
diff
changeset
|
123 PyExc_ValueError, |
16848
19a915d43a68
base85: cast Py_ssize_t values to int (issue3481)
Adrian Buehlmann <adrian@cadifra.com>
parents:
16837
diff
changeset
|
124 "bad base85 character at position %d", |
19a915d43a68
base85: cast Py_ssize_t values to int (issue3481)
Adrian Buehlmann <adrian@cadifra.com>
parents:
16837
diff
changeset
|
125 (int)i); |
3288
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
126 /* overflow detection: 0xffffffff == "|NsC0", |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
127 * "|NsC" == 0x03030303 */ |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
128 if (acc > 0x03030303 || (acc *= 85) > 0xffffffff - c) |
10282
08a0f04b56bd
many, many trivial check-code fixups
Matt Mackall <mpm@selenic.com>
parents:
7190
diff
changeset
|
129 return PyErr_Format( |
08a0f04b56bd
many, many trivial check-code fixups
Matt Mackall <mpm@selenic.com>
parents:
7190
diff
changeset
|
130 PyExc_ValueError, |
16848
19a915d43a68
base85: cast Py_ssize_t values to int (issue3481)
Adrian Buehlmann <adrian@cadifra.com>
parents:
16837
diff
changeset
|
131 "bad base85 sequence at position %d", |
19a915d43a68
base85: cast Py_ssize_t values to int (issue3481)
Adrian Buehlmann <adrian@cadifra.com>
parents:
16837
diff
changeset
|
132 (int)i); |
3288
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
133 acc += c; |
3283 | 134 } |
135 | |
3288
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
136 cap = olen < 4 ? olen : 4; |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
137 olen -= cap; |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
138 for (j = 0; j < 4 - cap; j++) |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
139 acc *= 85; |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
140 if (cap && cap < 4) |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
141 acc += 0xffffff >> (cap - 1) * 8; |
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
142 for (j = 0; j < cap; j++) |
3283 | 143 { |
144 acc = (acc << 8) | (acc >> 24); | |
3288
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
145 *dst++ = acc; |
3283 | 146 } |
147 } | |
148 | |
149 return out; | |
150 } | |
151 | |
152 static char base85_doc[] = "Base85 Data Encoding"; | |
153 | |
154 static PyMethodDef methods[] = { | |
3288
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
155 {"b85encode", b85encode, METH_VARARGS, |
7190
aecea6934fdd
Some additional space/tab cleanups
Thomas Arendsen Hein <thomas@intevation.de>
parents:
3369
diff
changeset
|
156 "Encode text in base85.\n\n" |
aecea6934fdd
Some additional space/tab cleanups
Thomas Arendsen Hein <thomas@intevation.de>
parents:
3369
diff
changeset
|
157 "If the second parameter is true, pad the result to a multiple of " |
aecea6934fdd
Some additional space/tab cleanups
Thomas Arendsen Hein <thomas@intevation.de>
parents:
3369
diff
changeset
|
158 "five characters.\n"}, |
3288
e93c926e069e
Handle odd-sized base85 input and output
Brendan Cully <brendan@kublai.com>
parents:
3283
diff
changeset
|
159 {"b85decode", b85decode, METH_VARARGS, "Decode base85 text.\n"}, |
3283 | 160 {NULL, NULL} |
161 }; | |
162 | |
11362
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
163 #ifdef IS_PY3K |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
164 static struct PyModuleDef base85_module = { |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
165 PyModuleDef_HEAD_INIT, |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
166 "base85", |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
167 base85_doc, |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
168 -1, |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
169 methods |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
170 }; |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
171 |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
172 PyMODINIT_FUNC PyInit_base85(void) |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
173 { |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
174 b85prep(); |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
175 |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
176 return PyModule_Create(&base85_module); |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
177 } |
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
178 #else |
3283 | 179 PyMODINIT_FUNC initbase85(void) |
180 { | |
181 Py_InitModule3("base85", methods, base85_doc); | |
182 | |
183 b85prep(); | |
184 } | |
11362
f42ef9493fa9
base85.c: Added support for py3k.
Renato Cunha <renatoc@gmail.com>
parents:
10282
diff
changeset
|
185 #endif |