Path: utzoo!utgpu!jarvis.csri.toronto.edu!mailrus!uwm.edu!gem.mps.ohio-state.edu!ginosko!uunet!mcsun!ukc!acorn!john From: john@acorn.co.uk (John Bowler) Newsgroups: comp.windows.x Subject: Re: Using xdm on IBM RTs running AOS 4.3/Sept 88 Keywords: xdm, IBM RT/PC, IBM RT 6152 Message-ID: <836@acorn.co.uk> Date: 18 Sep 89 17:37:29 GMT References: Reply-To: john@acorn.UUCP (John Bowler) Organization: Acorn Computers Ltd, Cambridge, UK Lines: 167 In article ehrlich@cs.psu.edu (Daniel Ehrlich) writes: > >We have a number of IBM RT 6152 Academic Systems running AOS 4.3 (aka >BSD 4.3). It would be nice if we could coerce our users into using X >windows on these machines. Xdm looked like the way to go, but we have >noticed that when a user logs out zombied processes tend to collect >until the process table is full and the machine must be rebooted. > >Xdm is set up to restart, rather than reset, the server as the Xibm >server tends to grow without bound if not restarted. Has anyone else >tried using xdm on RTs running AOS? No - but I have suffered from zombie xdm processes on a system where the server shuts down instead of resetting when all the client connections are closed. (This was done for precisely the same reasons - to ensure that a new X session didn't inherit an enormous server from a previous one). The problem is that xdm suffers from at least one race condition when the server dies - heavily loaded machines in particular are likely to provoke it. The following (context) diffs are against display.c with patches 1-8 applied (neither patch 9 nor patch 10 affects it). They fix the problem on Acorn's system:- *** /tmp/,SMSt1022767 Mon Feb 27 19:44:09 1989 (version with patches 1-8) --- display.c Thu Jun 1 19:31:11 1989 *************** *** 38,44 **** static jmp_buf terminated; ! static CatchTerm (), someoneDied (), abortOpen (), StartServer (); static WaitForServer (), TerminateServer (), HupServer (), StartSession (); extern unsigned sleep (); --- 38,44 ---- static jmp_buf terminated; ! 
static CatchTerm (), CatchChild (), abortOpen (), StartServer (); static WaitForServer (), TerminateServer (), HupServer (), StartSession (); extern unsigned sleep (); *************** *** 58,70 **** abort (); } ! static int someoneDead; static ! someoneDied () { Debug ("someone died\n"); ! someoneDead = 1; } ManageDisplay (d) --- 58,77 ---- abort (); } ! /* ! * This modification is to deal with a server (the ARM one) which ! * insists on shutting down when the last client closes the display. ! * This exposes an interesting race/bug in this code - WaitForServer ! * fails to clean up (because the SIGCHLD handler is removed - below). ! */ ! static int deaths; ! static int burials; static ! CatchChild () { Debug ("someone died\n"); ! ++deaths; } ManageDisplay (d) *************** *** 86,92 **** } (void) signal (SIGTERM, CatchTerm); (void) signal (SIGHUP, CatchHup); ! (void) signal (SIGCHLD, someoneDied); (void) signal (SIGPIPE, SIG_IGN); /* * Step 4: Start server control program --- 93,99 ---- } (void) signal (SIGTERM, CatchTerm); (void) signal (SIGHUP, CatchHup); ! (void) signal (SIGCHLD, CatchChild); (void) signal (SIGPIPE, SIG_IGN); /* * Step 4: Start server control program *************** *** 95,101 **** Debug ("aborting display %s\n", d->name); exit (1); } - (void) signal (SIGCHLD, SIG_DFL); /* * keep a session running on this display */ --- 102,107 ---- *************** *** 109,114 **** --- 115,121 ---- else continue; } + ++burials; if (pid == sessionPid) { Debug ("session died %s\n", d->name); switch (waitVal (status)) { *************** *** 266,274 **** RegisterCloseOnFork (ConnectionNumber (dpy)); return 1; } ! if (someoneDead) { pid = wait ((waitType *) 0); ! if (pid == serverPid) { Debug ("server died\n"); return 0; } --- 273,282 ---- RegisterCloseOnFork (ConnectionNumber (dpy)); return 1; } ! if (burials < deaths) { pid = wait ((waitType *) 0); ! if (pid != (-1)) ++burials; ! 
if (pid == serverPid || pid == (-1) && errno == ECHILD) { Debug ("server died\n"); return 0; } *************** *** 285,292 **** Debug ("giving up on server\n"); LogError ("server open failed for %s, giving up\n", d->name); pid = 0; ! if (someoneDead) ! pid = wait ((waitType *) 0); if (pid != serverPid) TerminateServer (d, serverPid); return 0; --- 293,301 ---- Debug ("giving up on server\n"); LogError ("server open failed for %s, giving up\n", d->name); pid = 0; ! while (burials < deaths && pid != serverPid) ! if ((pid = wait ((waitType *) 0)) != (-1)) ++burials; ! else break; if (pid != serverPid) TerminateServer (d, serverPid); return 0; *************** *** 338,343 **** --- 347,353 ---- (void) alarm (d->openTimeout); pid = wait ((waitType *) 0); (void) alarm (0); + if (pid >= 0) ++burials; if (pid == serverPid) break; }