Mercurial > public > mercurial-scm > hg
comparison mercurial/server.py @ 37212:f09a2eab11cf
server: add an error feedback mechanism for when the daemon fails to launch
There's a recurring problem on Windows where `hg serve -d` will randomly fail to
spawn a detached process. The reason for the failure is completely hidden, and
it takes hours to get a single failure on my laptop. All this does is redirect
stdout/stderr of the child to a file until the lock file is freed, and then the
parent dumps that output if the child fails to spawn.
I chose to put the output into the lock file because that is always cleaned up.
There's no way to report errors after that anyway. On Windows, killdaemons.py
is roughly `kill -9`, so this ensures that junk won't pile up.
This may end up being a case of EADDRINUSE. At least that's the error I saw printed
a few times (among other odd errors and missing output on Windows). But I also
managed to get the same thing on Fedora 26 by running test-hgwebdir.t with
--loop -j10 for several hours. Running `netstat` immediately after killing that
run printed a wall of sockets in the TIME_WAIT state, which were gone a couple of
seconds later. I couldn't match up the ports that failed, because --loop doesn't
print out the message about the port that was used. So maybe the fix is to
rotate the use of HGPORT[12] in the tests. But, let's collect some more data
first.
author | Matt Harbison <matt_harbison@yahoo.com> |
---|---|
date | Wed, 28 Mar 2018 00:11:09 -0400 |
parents | a8a902d7176e |
children | 73a60281a861 |
comparison
equal
deleted
inserted
replaced
37211:77f9e95fe3c4 | 37212:f09a2eab11cf |
---|---|
27 ) | 27 ) |
28 | 28 |
29 def runservice(opts, parentfn=None, initfn=None, runfn=None, logfile=None, | 29 def runservice(opts, parentfn=None, initfn=None, runfn=None, logfile=None, |
30 runargs=None, appendpid=False): | 30 runargs=None, appendpid=False): |
31 '''Run a command as a service.''' | 31 '''Run a command as a service.''' |
32 | |
33 # When daemonized on Windows, redirect stdout/stderr to the lockfile (which | |
34 # gets cleaned up after the child is up and running), so that the parent can | |
35 # read and print the error if this child dies early. See 594dd384803c. On | |
36 # other platforms, the child can write to the parent's stdio directly, until | |
37 # it is redirected prior to runfn(). | |
38 if pycompat.iswindows and opts['daemon_postexec']: | |
39 for inst in opts['daemon_postexec']: | |
40 if inst.startswith('unlink:'): | |
41 lockpath = inst[7:] | |
42 if os.path.exists(lockpath): | |
43 procutil.stdout.flush() | |
44 procutil.stderr.flush() | |
45 | |
46 fd = os.open(lockpath, | |
47 os.O_WRONLY | os.O_APPEND | os.O_BINARY) | |
48 try: | |
49 os.dup2(fd, 1) | |
50 os.dup2(fd, 2) | |
51 finally: | |
52 os.close(fd) | |
32 | 53 |
33 def writepid(pid): | 54 def writepid(pid): |
34 if opts['pid_file']: | 55 if opts['pid_file']: |
35 if appendpid: | 56 if appendpid: |
36 mode = 'ab' | 57 mode = 'ab' |
59 break | 80 break |
60 def condfn(): | 81 def condfn(): |
61 return not os.path.exists(lockpath) | 82 return not os.path.exists(lockpath) |
62 pid = procutil.rundetached(runargs, condfn) | 83 pid = procutil.rundetached(runargs, condfn) |
63 if pid < 0: | 84 if pid < 0: |
85 # If the daemonized process managed to write out an error msg, | |
86 # report it. | |
87 if pycompat.iswindows and os.path.exists(lockpath): | |
88 with open(lockpath) as log: | |
89 for line in log: | |
90 procutil.stderr.write(line) | |
64 raise error.Abort(_('child process failed to start')) | 91 raise error.Abort(_('child process failed to start')) |
65 writepid(pid) | 92 writepid(pid) |
66 finally: | 93 finally: |
67 util.tryunlink(lockpath) | 94 util.tryunlink(lockpath) |
68 if parentfn: | 95 if parentfn: |
79 if opts['daemon_postexec']: | 106 if opts['daemon_postexec']: |
80 try: | 107 try: |
81 os.setsid() | 108 os.setsid() |
82 except AttributeError: | 109 except AttributeError: |
83 pass | 110 pass |
111 | |
112 lockpath = None | |
84 for inst in opts['daemon_postexec']: | 113 for inst in opts['daemon_postexec']: |
85 if inst.startswith('unlink:'): | 114 if inst.startswith('unlink:'): |
86 lockpath = inst[7:] | 115 lockpath = inst[7:] |
87 os.unlink(lockpath) | |
88 elif inst.startswith('chdir:'): | 116 elif inst.startswith('chdir:'): |
89 os.chdir(inst[6:]) | 117 os.chdir(inst[6:]) |
90 elif inst != 'none': | 118 elif inst != 'none': |
91 raise error.Abort(_('invalid value for --daemon-postexec: %s') | 119 raise error.Abort(_('invalid value for --daemon-postexec: %s') |
92 % inst) | 120 % inst) |
105 if nullfd not in (0, 1, 2): | 133 if nullfd not in (0, 1, 2): |
106 os.close(nullfd) | 134 os.close(nullfd) |
107 if logfile and logfilefd not in (0, 1, 2): | 135 if logfile and logfilefd not in (0, 1, 2): |
108 os.close(logfilefd) | 136 os.close(logfilefd) |
109 | 137 |
138 # Only unlink after redirecting stdout/stderr, so Windows doesn't | |
139 # complain about a sharing violation. | |
140 if lockpath: | |
141 os.unlink(lockpath) | |
142 | |
110 if runfn: | 143 if runfn: |
111 return runfn() | 144 return runfn() |
112 | 145 |
113 _cmdservicemap = { | 146 _cmdservicemap = { |
114 'chgunix': chgserver.chgunixservice, | 147 'chgunix': chgserver.chgunixservice, |