Mercurial > public > mercurial-scm > hg-stable
comparison mercurial/util.py @ 30428:1156ec81f709
util: improve iterfile so it chooses code path wisely
We have performance concerns about "iterfile" as it is 4X slower on normal
files. Meanwhile, modern systems have the nice property that reading a "fast"
(on-disk) file cannot be interrupted, which we should make use of.
This patch records the related knowledge in comments, and makes "iterfile"
choose code paths wisely:
1. If it's CPython 3, or PyPy, use the fast path.
2. If fp is a normal file, use the fast path.
3. If fp is not a normal file and CPython version >= 2.7.4, use the same
workaround (4x slower) as before.
4. If fp is not a normal file and CPython version < 2.7.4, use another
workaround (2x slower but may block longer than necessary) which
basically re-invents the buffer + readline logic in Python.
This will give us good confidence in both correctness and performance when
dealing with EINTR in iterfile(fp) for all known supported Python versions.
author | Jun Wu <quark@fb.com> |
---|---|
date | Tue, 15 Nov 2016 20:25:51 +0000 |
parents | 854190becacb |
children | 64d7275445d0 |
comparison
equal
deleted
inserted
replaced
30427:854190becacb | 30428:1156ec81f709 |
---|---|
22 import errno | 22 import errno |
23 import gc | 23 import gc |
24 import hashlib | 24 import hashlib |
25 import imp | 25 import imp |
26 import os | 26 import os |
27 import platform as pyplatform | |
27 import re as remod | 28 import re as remod |
28 import shutil | 29 import shutil |
29 import signal | 30 import signal |
30 import socket | 31 import socket |
32 import stat | |
31 import string | 33 import string |
32 import subprocess | 34 import subprocess |
33 import sys | 35 import sys |
34 import tempfile | 36 import tempfile |
35 import textwrap | 37 import textwrap |
2206 wrapper = MBTextWrapper(width=width, | 2208 wrapper = MBTextWrapper(width=width, |
2207 initial_indent=initindent, | 2209 initial_indent=initindent, |
2208 subsequent_indent=hangindent) | 2210 subsequent_indent=hangindent) |
2209 return wrapper.fill(line).encode(encoding.encoding) | 2211 return wrapper.fill(line).encode(encoding.encoding) |
2210 | 2212 |
2211 def iterfile(fp): | 2213 if (pyplatform.python_implementation() == 'CPython' and |
2212 """like fp.__iter__ but does not have issues with EINTR. Python 2.7.12 is | 2214 sys.version_info < (3, 0)): |
2213 known to have such issues.""" | 2215 # There is an issue in CPython that some IO methods do not handle EINTR |
2214 return iter(fp.readline, '') | 2216 # correctly. The following table shows what CPython version (and functions) |
2217 # are affected (buggy: has the EINTR bug, okay: otherwise): | |
2218 # | |
2219 # | < 2.7.4 | 2.7.4 to 2.7.12 | >= 3.0 | |
2220 # -------------------------------------------------- | |
2221 # fp.__iter__ | buggy | buggy | okay | |
2222 # fp.read* | buggy | okay [1] | okay | |
2223 # | |
2224 # [1]: fixed by changeset 67dc99a989cd in the cpython hg repo. | |
2225 # | |
2226 # Here we work around the EINTR issue for fileobj.__iter__. Other methods | |
2227 # like "read*" are ignored for now, as Python < 2.7.4 is a minority. | |
2228 # | |
2229 # Although we can work around the EINTR issue for fp.__iter__, it is slower: | |
2230 # "for x in fp" is 4x faster than "for x in iter(fp.readline, '')" in | |
2231 # CPython 2, because CPython 2 maintains an internal readahead buffer for | |
2232 # fp.__iter__ but not other fp.read* methods. | |
2233 # | |
2234 # On modern systems like Linux, the "read" syscall cannot be interrupted | |
2235 # when reading "fast" files like on-disk files. So the EINTR issue only | |
2236 # affects things like pipes, sockets, ttys etc. We treat "normal" (S_ISREG) | |
2237 # files approximately as "fast" files and use the fast (unsafe) code path, | |
2238 # to minimize the performance impact. | |
2239 if sys.version_info >= (2, 7, 4): | |
2240 # fp.readline deals with EINTR correctly, use it as a workaround. | |
2241 def _safeiterfile(fp): | |
2242 return iter(fp.readline, '') | |
2243 else: | |
2244 # fp.read* are broken too, manually deal with EINTR in a stupid way. | |
2245 # note: this may block longer than necessary because of bufsize. | |
2246 def _safeiterfile(fp, bufsize=4096): | |
2247 fd = fp.fileno() | |
2248 line = '' | |
2249 while True: | |
2250 try: | |
2251 buf = os.read(fd, bufsize) | |
2252 except OSError as ex: | |
2253 # os.read only raises EINTR before any data is read | |
2254 if ex.errno == errno.EINTR: | |
2255 continue | |
2256 else: | |
2257 raise | |
2258 line += buf | |
2259 if '\n' in buf: | |
2260 splitted = line.splitlines(True) | |
2261 line = '' | |
2262 for l in splitted: | |
2263 if l[-1] == '\n': | |
2264 yield l | |
2265 else: | |
2266 line = l | |
2267 if not buf: | |
2268 break | |
2269 if line: | |
2270 yield line | |
2271 | |
2272 def iterfile(fp): | |
2273 fastpath = True | |
2274 if type(fp) is file: | |
2275 fastpath = stat.S_ISREG(os.fstat(fp.fileno()).st_mode) | |
2276 if fastpath: | |
2277 return fp | |
2278 else: | |
2279 return _safeiterfile(fp) | |
2280 else: | |
2281 # PyPy and CPython 3 do not have the EINTR issue thus no workaround needed. | |
2282 def iterfile(fp): | |
2283 return fp | |
2215 | 2284 |
2216 def iterlines(iterator): | 2285 def iterlines(iterator): |
2217 for chunk in iterator: | 2286 for chunk in iterator: |
2218 for line in chunk.splitlines(): | 2287 for line in chunk.splitlines(): |
2219 yield line | 2288 yield line |