Mercurial > public > mercurial-scm > hg-stable
comparison mercurial/revlog.py @ 2079:ee96ca273f32
New lazy index code for revlogs.
This tunes for large repositories. It does not read the whole
index file in one big chunk, but tries to buffer reads in more
reasonable chunks instead.
Search speeds are improved in two ways. When trying to find a
specific sha hash, it searches from the end of the file backward.
More recent entries are more likely to be relevant, especially the
tip.
Also, this can load only the mapping of nodes to revlog index number.
Loading the map uses less cpu (no struct.unpack) and much less
memory than loading both the map and the index.
This cuts down the time for hg tip on the 80,000 changeset
kernel repo from 3.69s to 1.8s. Most commands that pull a single
rev out of a big index get roughly the same benefit. Commands
that read the whole index are not slower.
author | mason@suse.com |
---|---|
date | Tue, 04 Apr 2006 16:47:12 -0400 |
parents | 441ea218414e |
children | 1cbb14c048cb |
comparison
equal
deleted
inserted
replaced
2078:441ea218414e | 2079:ee96ca273f32 |
---|---|
62 if t == 'x': return zlib.decompress(bin) | 62 if t == 'x': return zlib.decompress(bin) |
63 if t == 'u': return bin[1:] | 63 if t == 'u': return bin[1:] |
64 raise RevlogError(_("unknown compression type %r") % t) | 64 raise RevlogError(_("unknown compression type %r") % t) |
65 | 65 |
66 indexformatv0 = ">4l20s20s20s" | 66 indexformatv0 = ">4l20s20s20s" |
67 v0shaoffset = 56 | |
67 # index ng: | 68 # index ng: |
68 # 6 bytes offset | 69 # 6 bytes offset |
69 # 2 bytes flags | 70 # 2 bytes flags |
70 # 4 bytes compressed length | 71 # 4 bytes compressed length |
71 # 4 bytes uncompressed length | 72 # 4 bytes uncompressed length |
73 # 4 bytes link rev | 74 # 4 bytes link rev |
74 # 4 bytes parent 1 rev | 75 # 4 bytes parent 1 rev |
75 # 4 bytes parent 2 rev | 76 # 4 bytes parent 2 rev |
76 # 32 bytes: nodeid | 77 # 32 bytes: nodeid |
77 indexformatng = ">Qiiiiii20s12x" | 78 indexformatng = ">Qiiiiii20s12x" |
79 ngshaoffset = 32 | |
78 versionformat = ">i" | 80 versionformat = ">i" |
79 | 81 |
80 class lazyparser(object): | 82 class lazyparser(object): |
81 """ | 83 """ |
82 this class avoids the need to parse the entirety of large indices | 84 this class avoids the need to parse the entirety of large indices |
83 | |
84 By default we parse and load 1000 entries at a time. | |
85 | |
86 If no position is specified, we load the whole index, and replace | |
87 the lazy objects in revlog with the underlying objects for | |
88 efficiency in cases where we look at most of the nodes. | |
89 """ | 85 """ |
90 def __init__(self, data, revlog, indexformat): | 86 def __init__(self, dataf, size, indexformat, shaoffset): |
91 self.data = data | 87 self.dataf = dataf |
88 self.format = indexformat | |
92 self.s = struct.calcsize(indexformat) | 89 self.s = struct.calcsize(indexformat) |
93 self.indexformat = indexformat | 90 self.indexformat = indexformat |
94 self.l = len(data)/self.s | 91 self.datasize = size |
92 self.l = size/self.s | |
95 self.index = [None] * self.l | 93 self.index = [None] * self.l |
96 self.map = {nullid: -1} | 94 self.map = {nullid: -1} |
95 self.allmap = 0 | |
97 self.all = 0 | 96 self.all = 0 |
98 self.revlog = revlog | 97 self.mapfind_count = 0 |
99 | 98 self.shaoffset = shaoffset |
100 def load(self, pos=None): | 99 |
100 def loadmap(self): | |
101 """ | |
102 during a commit, we need to make sure the rev being added is | |
103 not a duplicate. This requires loading the entire index, | |
104 which is fairly slow. loadmap can load up just the node map, | |
105 which takes much less time. | |
106 """ | |
107 if self.allmap: return | |
108 start = 0 | |
109 end = self.datasize | |
110 self.allmap = 1 | |
111 cur = 0 | |
112 count = 0 | |
113 blocksize = self.s * 256 | |
114 self.dataf.seek(0) | |
115 while cur < end: | |
116 data = self.dataf.read(blocksize) | |
117 off = 0 | |
118 for x in xrange(256): | |
119 n = data[off + self.shaoffset:off + self.shaoffset + 20] | |
120 self.map[n] = count | |
121 count += 1 | |
122 if count >= self.l: | |
123 break | |
124 off += self.s | |
125 cur += blocksize | |
126 | |
127 def loadblock(self, blockstart, blocksize, data=None): | |
101 if self.all: return | 128 if self.all: return |
102 if pos is not None: | 129 if data is None: |
103 block = pos / 1000 | 130 self.dataf.seek(blockstart) |
104 i = block * 1000 | 131 data = self.dataf.read(blocksize) |
105 end = min(self.l, i + 1000) | 132 lend = len(data) / self.s |
106 else: | 133 i = blockstart / self.s |
107 self.all = 1 | 134 off = 0 |
108 i = 0 | 135 for x in xrange(lend): |
109 end = self.l | 136 if self.index[i + x] == None: |
110 self.revlog.index = self.index | 137 b = data[off : off + self.s] |
111 self.revlog.nodemap = self.map | 138 e = struct.unpack(self.format, b) |
112 | 139 self.index[i + x] = e |
113 while i < end: | 140 self.map[e[-1]] = i + x |
114 if not self.index[i]: | 141 off += self.s |
115 d = self.data[i * self.s: (i + 1) * self.s] | 142 |
116 e = struct.unpack(self.indexformat, d) | 143 def findnode(self, node): |
117 self.index[i] = e | 144 """search backwards through the index file for a specific node""" |
118 self.map[e[-1]] = i | 145 if self.allmap: return None |
119 i += 1 | 146 |
147 # hg log will cause many many searches for the manifest | |
148 # nodes. After we get called a few times, just load the whole | |
149 # thing. | |
150 if self.mapfind_count > 8: | |
151 self.loadmap() | |
152 if node in self.map: | |
153 return node | |
154 return None | |
155 self.mapfind_count += 1 | |
156 last = self.l - 1 | |
157 while self.index[last] != None: | |
158 if last == 0: | |
159 self.all = 1 | |
160 self.allmap = 1 | |
161 return None | |
162 last -= 1 | |
163 end = (last + 1) * self.s | |
164 blocksize = self.s * 256 | |
165 while end >= 0: | |
166 start = max(end - blocksize, 0) | |
167 self.dataf.seek(start) | |
168 data = self.dataf.read(end - start) | |
169 findend = end - start | |
170 while True: | |
171 # we're searching backwards, so weh have to make sure | |
172 # we don't find a changeset where this node is a parent | |
173 off = data.rfind(node, 0, findend) | |
174 findend = off | |
175 if off >= 0: | |
176 i = off / self.s | |
177 off = i * self.s | |
178 n = data[off + self.shaoffset:off + self.shaoffset + 20] | |
179 if n == node: | |
180 self.map[n] = i + start / self.s | |
181 return node | |
182 else: | |
183 break | |
184 end -= blocksize | |
185 return None | |
186 | |
187 def loadindex(self, i=None, end=None): | |
188 if self.all: return | |
189 all = False | |
190 if i == None: | |
191 blockstart = 0 | |
192 blocksize = (512 / self.s) * self.s | |
193 end = self.datasize | |
194 all = True | |
195 else: | |
196 if end: | |
197 blockstart = i * self.s | |
198 end = end * self.s | |
199 blocksize = end - blockstart | |
200 else: | |
201 blockstart = (i & ~(32)) * self.s | |
202 blocksize = self.s * 64 | |
203 end = blockstart + blocksize | |
204 while blockstart < end: | |
205 self.loadblock(blockstart, blocksize) | |
206 blockstart += blocksize | |
207 if all: self.all = True | |
120 | 208 |
121 class lazyindex(object): | 209 class lazyindex(object): |
122 """a lazy version of the index array""" | 210 """a lazy version of the index array""" |
123 def __init__(self, parser): | 211 def __init__(self, parser): |
124 self.p = parser | 212 self.p = parser |
125 def __len__(self): | 213 def __len__(self): |
126 return len(self.p.index) | 214 return len(self.p.index) |
127 def load(self, pos): | 215 def load(self, pos): |
128 if pos < 0: | 216 if pos < 0: |
129 pos += len(self.p.index) | 217 pos += len(self.p.index) |
130 self.p.load(pos) | 218 self.p.loadindex(pos) |
131 return self.p.index[pos] | 219 return self.p.index[pos] |
132 def __getitem__(self, pos): | 220 def __getitem__(self, pos): |
133 return self.p.index[pos] or self.load(pos) | 221 return self.p.index[pos] or self.load(pos) |
134 def __setitem__(self, pos, item): | 222 def __setitem__(self, pos, item): |
135 self.p.index[pos] = item | 223 self.p.index[pos] = item |
141 class lazymap(object): | 229 class lazymap(object): |
142 """a lazy version of the node map""" | 230 """a lazy version of the node map""" |
143 def __init__(self, parser): | 231 def __init__(self, parser): |
144 self.p = parser | 232 self.p = parser |
145 def load(self, key): | 233 def load(self, key): |
146 if self.p.all: return | 234 n = self.p.findnode(key) |
147 n = self.p.data.find(key) | 235 if n == None: |
148 if n < 0: | |
149 raise KeyError(key) | 236 raise KeyError(key) |
150 pos = n / self.p.s | |
151 self.p.load(pos) | |
152 def __contains__(self, key): | 237 def __contains__(self, key): |
153 self.p.load() | 238 if key in self.p.map: |
239 return True | |
240 self.p.loadmap() | |
154 return key in self.p.map | 241 return key in self.p.map |
155 def __iter__(self): | 242 def __iter__(self): |
156 yield nullid | 243 yield nullid |
157 for i in xrange(self.p.l): | 244 for i in xrange(self.p.l): |
158 try: | 245 try: |
159 yield self.p.index[i][-1] | 246 yield self.p.index[i][-1] |
160 except: | 247 except: |
161 self.p.load(i) | 248 self.p.loadindex(i) |
162 yield self.p.index[i][-1] | 249 yield self.p.index[i][-1] |
163 def __getitem__(self, key): | 250 def __getitem__(self, key): |
164 try: | 251 try: |
165 return self.p.map[key] | 252 return self.p.map[key] |
166 except KeyError: | 253 except KeyError: |
220 | 307 |
221 def load(self): | 308 def load(self): |
222 v = self.defversion | 309 v = self.defversion |
223 try: | 310 try: |
224 f = self.opener(self.indexfile) | 311 f = self.opener(self.indexfile) |
225 i = f.read() | 312 i = f.read(4) |
313 f.seek(0) | |
226 except IOError, inst: | 314 except IOError, inst: |
227 if inst.errno != errno.ENOENT: | 315 if inst.errno != errno.ENOENT: |
228 raise | 316 raise |
229 i = "" | 317 i = "" |
230 else: | 318 else: |
239 and st.st_mtime == oldst.st_mtime | 327 and st.st_mtime == oldst.st_mtime |
240 and st.st_ctime == oldst.st_ctime): | 328 and st.st_ctime == oldst.st_ctime): |
241 return | 329 return |
242 self.indexstat = st | 330 self.indexstat = st |
243 if len(i) > 0: | 331 if len(i) > 0: |
244 v = struct.unpack(versionformat, i[:4])[0] | 332 v = struct.unpack(versionformat, i)[0] |
245 flags = v & ~0xFFFF | 333 flags = v & ~0xFFFF |
246 fmt = v & 0xFFFF | 334 fmt = v & 0xFFFF |
247 if fmt == 0: | 335 if fmt == 0: |
248 if flags: | 336 if flags: |
249 raise RevlogError(_("index %s invalid flags %x for format v0" % | 337 raise RevlogError(_("index %s invalid flags %x for format v0" % |
256 raise RevlogError(_("index %s invalid format %d" % | 344 raise RevlogError(_("index %s invalid format %d" % |
257 (self.indexfile, fmt))) | 345 (self.indexfile, fmt))) |
258 self.version = v | 346 self.version = v |
259 if v == 0: | 347 if v == 0: |
260 self.indexformat = indexformatv0 | 348 self.indexformat = indexformatv0 |
349 shaoffset = v0shaoffset | |
261 else: | 350 else: |
262 self.indexformat = indexformatng | 351 self.indexformat = indexformatng |
352 shaoffset = ngshaoffset | |
263 | 353 |
264 if i: | 354 if i: |
265 if not self.inlinedata() and st and st.st_size > 10000: | 355 if not self.inlinedata() and st and st.st_size > 10000: |
266 # big index, let's parse it on demand | 356 # big index, let's parse it on demand |
267 parser = lazyparser(i, self, self.indexformat) | 357 parser = lazyparser(f, st.st_size, self.indexformat, shaoffset) |
268 self.index = lazyindex(parser) | 358 self.index = lazyindex(parser) |
269 self.nodemap = lazymap(parser) | 359 self.nodemap = lazymap(parser) |
270 else: | 360 else: |
361 i = f.read() | |
271 self.parseindex(i) | 362 self.parseindex(i) |
272 if self.inlinedata(): | 363 if self.inlinedata(): |
273 # we've already got the entire data file read in, save it | 364 # we've already got the entire data file read in, save it |
274 # in the chunk data | 365 # in the chunk data |
275 self.chunkcache = (0, i) | 366 self.chunkcache = (0, i) |
310 return int(q & 0xFFFF) | 401 return int(q & 0xFFFF) |
311 | 402 |
312 def offset_type(self, offset, type): | 403 def offset_type(self, offset, type): |
313 return long(long(offset) << 16 | type) | 404 return long(long(offset) << 16 | type) |
314 | 405 |
406 def loadindex(self, start, end): | |
407 """load a block of indexes all at once from the lazy parser""" | |
408 if isinstance(self.index, lazyindex): | |
409 self.index.p.loadindex(start, end) | |
410 | |
315 def loadindexmap(self): | 411 def loadindexmap(self): |
316 """loads both the map and the index from the lazy parser""" | 412 """loads both the map and the index from the lazy parser""" |
317 if isinstance(self.index, lazyindex): | 413 if isinstance(self.index, lazyindex): |
318 p = self.index.p | 414 p = self.index.p |
319 p.load() | 415 p.loadindex() |
416 self.nodemap = p.map | |
417 | |
418 def loadmap(self): | |
419 """loads the map from the lazy parser""" | |
420 if isinstance(self.nodemap, lazymap): | |
421 self.nodemap.p.loadmap() | |
422 self.nodemap = self.nodemap.p.map | |
320 | 423 |
321 def inlinedata(self): return self.version & REVLOGNGINLINEDATA | 424 def inlinedata(self): return self.version & REVLOGNGINLINEDATA |
322 def tip(self): return self.node(len(self.index) - 1) | 425 def tip(self): return self.node(len(self.index) - 1) |
323 def count(self): return len(self.index) | 426 def count(self): return len(self.index) |
324 def node(self, rev): | 427 def node(self, rev): |
688 | 791 |
689 # do we have useful data cached? | 792 # do we have useful data cached? |
690 if self.cache and self.cache[1] >= base and self.cache[1] < rev: | 793 if self.cache and self.cache[1] >= base and self.cache[1] < rev: |
691 base = self.cache[1] | 794 base = self.cache[1] |
692 text = self.cache[2] | 795 text = self.cache[2] |
693 else: | 796 self.loadindex(base, rev + 1) |
797 else: | |
798 self.loadindex(base, rev + 1) | |
694 text = self.chunk(base, df=df) | 799 text = self.chunk(base, df=df) |
695 | 800 |
696 bins = [] | 801 bins = [] |
697 for r in xrange(base + 1, rev + 1): | 802 for r in xrange(base + 1, rev + 1): |
698 bins.append(self.chunk(r, df=df)) | 803 bins.append(self.chunk(r, df=df)) |