comparison mercurial/changelog.py @ 28487:98d98a645e9d

changelog: add class to represent parsed changelog revisions

Currently, changelog entries are parsed into their respective components at read time. Many operations are only interested in a subset of fields of a changelog entry. The parsing and storing of all the fields adds avoidable overhead.

This patch introduces the "changelogrevision" class. It takes changelog raw text and exposes the parsed results as attributes. The code for parsing changelog entries has been moved into its construction function. changelog.read() has been modified to use the new class internally while maintaining its existing API. Future patches will make revision parsing lazy.

We implement the construction function of the new class with __new__ instead of __init__ so we can use a named tuple to represent the empty revision. This saves overhead and complexity of coercing later versions of this class to represent an empty instance.

While we are here, we add a method on changelog to obtain an instance of the new type.

The overhead of constructing the new class regresses performance of revsets accessing this data (each row: revset, timing before, timing after; where a percentage is shown, it is the second timing relative to the first):

- author(mpm): 0.896565, 0.929984
- desc(bug): 0.887169, 0.935642 (105%)
- date(2015): 0.878797, 0.908094
- extra(rebase_source): 0.865446, 0.922624 (106%)
- author(mpm) or author(greg): 1.801832, 1.902112 (105%)
- author(mpm) or desc(bug): 1.812438, 1.860977
- date(2015) or branch(default): 0.968276, 1.005824
- author(mpm) or desc(bug) or date(2015) or extra(rebase_source): 3.656193, 3.743381

Once lazy parsing is implemented, these revsets will all be faster than before.

There is no performance change on revsets that do not access this data. There /could/ be a performance regression on operations that perform several changelog reads. However, I can't think of anything outside of revsets and `hg log` (basically the same as a revset) that would be impacted.
author Gregory Szorc <gregory.szorc@gmail.com>
date Sun, 06 Mar 2016 14:28:02 -0800
parents 9e3f505b1d50
children 8939a95064f1
comparison
equal deleted inserted replaced
28486:50314dc3ae4e 28487:98d98a645e9d
4 # 4 #
5 # This software may be used and distributed according to the terms of the 5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version. 6 # GNU General Public License version 2 or any later version.
7 7
8 from __future__ import absolute_import 8 from __future__ import absolute_import
9
10 import collections
9 11
10 from .i18n import _ 12 from .i18n import _
11 from .node import ( 13 from .node import (
12 bin, 14 bin,
13 hex, 15 hex,
134 if name != target: 136 if name != target:
135 return opener(name, mode) 137 return opener(name, mode)
136 return appender(opener, name, mode, buf) 138 return appender(opener, name, mode, buf)
137 return _delay 139 return _delay
138 140
141 _changelogrevision = collections.namedtuple('changelogrevision',
142 ('manifest', 'user', 'date',
143 'files', 'description', 'extra'))
144
145 class changelogrevision(object):
146 """Holds results of a parsed changelog revision.
147
148 Changelog revisions consist of multiple pieces of data, including
149 the manifest node, user, and date. This object exposes a view into
150 the parsed object.
151 """
152
153 __slots__ = (
154 'date',
155 'description',
156 'extra',
157 'files',
158 'manifest',
159 'user',
160 )
161
162 def __new__(cls, text):
163 if not text:
164 return _changelogrevision(
165 manifest=nullid,
166 user='',
167 date=(0, 0),
168 files=[],
169 description='',
170 extra=_defaultextra,
171 )
172
173 self = super(changelogrevision, cls).__new__(cls)
174 # We could return here and implement the following as an __init__.
175 # But doing it here is equivalent and saves an extra function call.
176
177 # format used:
178 # nodeid\n : manifest node in ascii
179 # user\n : user, no \n or \r allowed
180 # time tz extra\n : date (time is int or float, timezone is int)
181 # : extra is metadata, encoded and separated by '\0'
182 # : older versions ignore it
183 # files\n\n : files modified by the cset, no \n or \r allowed
184 # (.*) : comment (free text, ideally utf-8)
185 #
186 # changelog v0 doesn't use extra
187
188 last = text.index("\n\n")
189 self.description = encoding.tolocal(text[last + 2:])
190 l = text[:last].split('\n')
191 self.manifest = bin(l[0])
192 self.user = encoding.tolocal(l[1])
193
194 tdata = l[2].split(' ', 2)
195 if len(tdata) != 3:
196 time = float(tdata[0])
197 try:
198 # various tools did silly things with the time zone field.
199 timezone = int(tdata[1])
200 except ValueError:
201 timezone = 0
202 self.extra = _defaultextra
203 else:
204 time, timezone = float(tdata[0]), int(tdata[1])
205 self.extra = decodeextra(tdata[2])
206
207 self.date = (time, timezone)
208 self.files = l[3:]
209
210 return self
211
139 class changelog(revlog.revlog): 212 class changelog(revlog.revlog):
140 def __init__(self, opener): 213 def __init__(self, opener):
141 revlog.revlog.__init__(self, opener, "00changelog.i") 214 revlog.revlog.__init__(self, opener, "00changelog.i")
142 if self._initempty: 215 if self._initempty:
143 # changelogs don't benefit from generaldelta 216 # changelogs don't benefit from generaldelta
321 def checkinlinesize(self, tr, fp=None): 394 def checkinlinesize(self, tr, fp=None):
322 if not self._delayed: 395 if not self._delayed:
323 revlog.revlog.checkinlinesize(self, tr, fp) 396 revlog.revlog.checkinlinesize(self, tr, fp)
324 397
325 def read(self, node): 398 def read(self, node):
399 """Obtain data from a parsed changelog revision.
400
401 Returns a 6-tuple of:
402
403 - manifest node in binary
404 - author/user as a localstr
405 - date as a 2-tuple of (time, timezone)
406 - list of files
407 - commit message as a localstr
408 - dict of extra metadata
409
410 Unless you need to access all fields, consider calling
411 ``changelogrevision`` instead, as it is faster for partial object
412 access.
326 """ 413 """
327 format used: 414 c = changelogrevision(self.revision(node))
328 nodeid\n : manifest node in ascii 415 return (
329 user\n : user, no \n or \r allowed 416 c.manifest,
330 time tz extra\n : date (time is int or float, timezone is int) 417 c.user,
331 : extra is metadata, encoded and separated by '\0' 418 c.date,
332 : older versions ignore it 419 c.files,
333 files\n\n : files modified by the cset, no \n or \r allowed 420 c.description,
334 (.*) : comment (free text, ideally utf-8) 421 c.extra
335 422 )
336 changelog v0 doesn't use extra 423
337 """ 424 def changelogrevision(self, nodeorrev):
338 text = self.revision(node) 425 """Obtain a ``changelogrevision`` for a node or revision."""
339 if not text: 426 return changelogrevision(self.revision(nodeorrev))
340 return nullid, "", (0, 0), [], "", _defaultextra
341 last = text.index("\n\n")
342 desc = encoding.tolocal(text[last + 2:])
343 l = text[:last].split('\n')
344 manifest = bin(l[0])
345 user = encoding.tolocal(l[1])
346
347 tdata = l[2].split(' ', 2)
348 if len(tdata) != 3:
349 time = float(tdata[0])
350 try:
351 # various tools did silly things with the time zone field.
352 timezone = int(tdata[1])
353 except ValueError:
354 timezone = 0
355 extra = _defaultextra
356 else:
357 time, timezone = float(tdata[0]), int(tdata[1])
358 extra = decodeextra(tdata[2])
359
360 files = l[3:]
361 return manifest, user, (time, timezone), files, desc, extra
362 427
363 def readfiles(self, node): 428 def readfiles(self, node):
364 """ 429 """
365 short version of read that only returns the files modified by the cset 430 short version of read that only returns the files modified by the cset
366 """ 431 """