Mercurial > public > mercurial-scm > hg-stable
comparison mercurial/changelog.py @ 28487:98d98a645e9d
changelog: add class to represent parsed changelog revisions
Currently, changelog entries are parsed into their respective
components at read time. Many operations are only interested
in a subset of fields of a changelog entry. The parsing and
storing of all the fields adds avoidable overhead.
This patch introduces the "changelogrevision" class. It takes
changelog raw text and exposes the parsed results as attributes.
The code for parsing changelog entries has been moved into the
new class's constructor. changelog.read() has been modified to use
the new class internally while maintaining its existing API.
Future patches will make revision parsing lazy.
We implement the constructor of the new class with __new__
instead of __init__ so that it can return a named tuple to
represent the empty revision. This avoids the overhead and
complexity of making later versions of this class represent an
empty instance.
While we are here, we add a method on changelog to obtain an
instance of the new type.
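The overhead of constructing the new class regresses performance
of revsets accessing this data. For each revset below, the first
timing is from before this patch and the second is from after it,
followed by the relative change where one was reported: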
author(mpm)
0.896565
0.929984
desc(bug)
0.887169
0.935642 105%
date(2015)
0.878797
0.908094
extra(rebase_source)
0.865446
0.922624 106%
author(mpm) or author(greg)
1.801832
1.902112 105%
author(mpm) or desc(bug)
1.812438
1.860977
date(2015) or branch(default)
0.968276
1.005824
author(mpm) or desc(bug) or date(2015) or extra(rebase_source)
3.656193
3.743381
Once lazy parsing is implemented, these revsets will all be faster
than before. There is no performance change on revsets that do not
access this data. There /could/ be a performance regression on
operations that perform several changelog reads. However, I can't
think of anything outside of revsets and `hg log` (basically the
same as a revset) that would be impacted.
author | Gregory Szorc <gregory.szorc@gmail.com> |
---|---|
date | Sun, 06 Mar 2016 14:28:02 -0800 |
parents | 9e3f505b1d50 |
children | 8939a95064f1 |
comparison
equal
deleted
inserted
replaced
28486:50314dc3ae4e | 28487:98d98a645e9d |
---|---|
4 # | 4 # |
5 # This software may be used and distributed according to the terms of the | 5 # This software may be used and distributed according to the terms of the |
6 # GNU General Public License version 2 or any later version. | 6 # GNU General Public License version 2 or any later version. |
7 | 7 |
8 from __future__ import absolute_import | 8 from __future__ import absolute_import |
9 | |
10 import collections | |
9 | 11 |
10 from .i18n import _ | 12 from .i18n import _ |
11 from .node import ( | 13 from .node import ( |
12 bin, | 14 bin, |
13 hex, | 15 hex, |
134 if name != target: | 136 if name != target: |
135 return opener(name, mode) | 137 return opener(name, mode) |
136 return appender(opener, name, mode, buf) | 138 return appender(opener, name, mode, buf) |
137 return _delay | 139 return _delay |
138 | 140 |
141 _changelogrevision = collections.namedtuple('changelogrevision', | |
142 ('manifest', 'user', 'date', | |
143 'files', 'description', 'extra')) | |
144 | |
145 class changelogrevision(object): | |
146 """Holds results of a parsed changelog revision. | |
147 | |
148 Changelog revisions consist of multiple pieces of data, including | |
149 the manifest node, user, and date. This object exposes a view into | |
150 the parsed object. | |
151 """ | |
152 | |
153 __slots__ = ( | |
154 'date', | |
155 'description', | |
156 'extra', | |
157 'files', | |
158 'manifest', | |
159 'user', | |
160 ) | |
161 | |
162 def __new__(cls, text): | |
163 if not text: | |
164 return _changelogrevision( | |
165 manifest=nullid, | |
166 user='', | |
167 date=(0, 0), | |
168 files=[], | |
169 description='', | |
170 extra=_defaultextra, | |
171 ) | |
172 | |
173 self = super(changelogrevision, cls).__new__(cls) | |
174 # We could return here and implement the following as an __init__. | |
175 # But doing it here is equivalent and saves an extra function call. | |
176 | |
177 # format used: | |
178 # nodeid\n : manifest node in ascii | |
179 # user\n : user, no \n or \r allowed | |
180 # time tz extra\n : date (time is int or float, timezone is int) | |
181 # : extra is metadata, encoded and separated by '\0' | |
182 # : older versions ignore it | |
183 # files\n\n : files modified by the cset, no \n or \r allowed | |
184 # (.*) : comment (free text, ideally utf-8) | |
185 # | |
186 # changelog v0 doesn't use extra | |
187 | |
188 last = text.index("\n\n") | |
189 self.description = encoding.tolocal(text[last + 2:]) | |
190 l = text[:last].split('\n') | |
191 self.manifest = bin(l[0]) | |
192 self.user = encoding.tolocal(l[1]) | |
193 | |
194 tdata = l[2].split(' ', 2) | |
195 if len(tdata) != 3: | |
196 time = float(tdata[0]) | |
197 try: | |
198 # various tools did silly things with the time zone field. | |
199 timezone = int(tdata[1]) | |
200 except ValueError: | |
201 timezone = 0 | |
202 self.extra = _defaultextra | |
203 else: | |
204 time, timezone = float(tdata[0]), int(tdata[1]) | |
205 self.extra = decodeextra(tdata[2]) | |
206 | |
207 self.date = (time, timezone) | |
208 self.files = l[3:] | |
209 | |
210 return self | |
211 | |
139 class changelog(revlog.revlog): | 212 class changelog(revlog.revlog): |
140 def __init__(self, opener): | 213 def __init__(self, opener): |
141 revlog.revlog.__init__(self, opener, "00changelog.i") | 214 revlog.revlog.__init__(self, opener, "00changelog.i") |
142 if self._initempty: | 215 if self._initempty: |
143 # changelogs don't benefit from generaldelta | 216 # changelogs don't benefit from generaldelta |
321 def checkinlinesize(self, tr, fp=None): | 394 def checkinlinesize(self, tr, fp=None): |
322 if not self._delayed: | 395 if not self._delayed: |
323 revlog.revlog.checkinlinesize(self, tr, fp) | 396 revlog.revlog.checkinlinesize(self, tr, fp) |
324 | 397 |
325 def read(self, node): | 398 def read(self, node): |
399 """Obtain data from a parsed changelog revision. | |
400 | |
401 Returns a 6-tuple of: | |
402 | |
403 - manifest node in binary | |
404 - author/user as a localstr | |
405 - date as a 2-tuple of (time, timezone) | |
406 - list of files | |
407 - commit message as a localstr | |
408 - dict of extra metadata | |
409 | |
410 Unless you need to access all fields, consider calling | |
411 ``changelogrevision`` instead, as it is faster for partial object | |
412 access. | |
326 """ | 413 """ |
327 format used: | 414 c = changelogrevision(self.revision(node)) |
328 nodeid\n : manifest node in ascii | 415 return ( |
329 user\n : user, no \n or \r allowed | 416 c.manifest, |
330 time tz extra\n : date (time is int or float, timezone is int) | 417 c.user, |
331 : extra is metadata, encoded and separated by '\0' | 418 c.date, |
332 : older versions ignore it | 419 c.files, |
333 files\n\n : files modified by the cset, no \n or \r allowed | 420 c.description, |
334 (.*) : comment (free text, ideally utf-8) | 421 c.extra |
335 | 422 ) |
336 changelog v0 doesn't use extra | 423 |
337 """ | 424 def changelogrevision(self, nodeorrev): |
338 text = self.revision(node) | 425 """Obtain a ``changelogrevision`` for a node or revision.""" |
339 if not text: | 426 return changelogrevision(self.revision(nodeorrev)) |
340 return nullid, "", (0, 0), [], "", _defaultextra | |
341 last = text.index("\n\n") | |
342 desc = encoding.tolocal(text[last + 2:]) | |
343 l = text[:last].split('\n') | |
344 manifest = bin(l[0]) | |
345 user = encoding.tolocal(l[1]) | |
346 | |
347 tdata = l[2].split(' ', 2) | |
348 if len(tdata) != 3: | |
349 time = float(tdata[0]) | |
350 try: | |
351 # various tools did silly things with the time zone field. | |
352 timezone = int(tdata[1]) | |
353 except ValueError: | |
354 timezone = 0 | |
355 extra = _defaultextra | |
356 else: | |
357 time, timezone = float(tdata[0]), int(tdata[1]) | |
358 extra = decodeextra(tdata[2]) | |
359 | |
360 files = l[3:] | |
361 return manifest, user, (time, timezone), files, desc, extra | |
362 | 427 |
363 def readfiles(self, node): | 428 def readfiles(self, node): |
364 """ | 429 """ |
365 short version of read that only returns the files modified by the cset | 430 short version of read that only returns the files modified by the cset |
366 """ | 431 """ |