Vesta: /vesta/vestasys.org/vesta/run_tool/31/src/RunToolDaemon.C Source File

00001 // Copyright (C) 2001, Compaq Computer Corporation
00002 // 
00003 // This file is part of Vesta.
00004 // 
00005 // Vesta is free software; you can redistribute it and/or
00006 // modify it under the terms of the GNU Lesser General Public
00007 // License as published by the Free Software Foundation; either
00008 // version 2.1 of the License, or (at your option) any later version.
00009 // 
00010 // Vesta is distributed in the hope that it will be useful,
00011 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00012 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013 // Lesser General Public License for more details.
00014 // 
00015 // You should have received a copy of the GNU Lesser General Public
00016 // License along with Vesta; if not, write to the Free Software
00017 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00018 
00019 // Last modified on Mon May 23 22:25:02 EDT 2005 by ken@xorian.net         
00020 //      modified on Wed Feb 23 15:05:01 EST 2005 by irina.furman@intel.com 
00021 //      modified on Thu May 16 23:53:33 EDT 2002 by scott@scooter.cx 
00022 //      modified on Fri Mar 30 12:45:10 PST 2001 by mann   
00023 //      modified on Fri Jun  4 12:39:59 PDT 1999 by heydon 
00024 //      modified on Tue Aug 18 17:22:26 PDT 1998 by yuanyu 
00025 //      modified on Fri Jun 28 15:48:35 PDT 1996 by levin  
00026 
00027 /* This is the daemon/driver for the invocation of tools in an encapsulated
00028    context. It receives requests from remote Vesta evaluators (the _run_tool
00029    primitive) via the RunTool client and invokes a specified tool in a
00030    controlled context. */
00031 
00032 #if defined(__linux__)
00033 // need this for rpc headers. 
00034 #  include <stdint.h>
00035 // need this for system statistics.
00036 #  include <sys/sysinfo.h>
00037 #  include <unistd.h>
00038 #endif
00039 
00040 #include <stdlib.h>
00041 #include <signal.h>
00042 #include <Basics.H>
00043 #include <OS.H>
00044 #include <FdStream.H>
00045 #include <VestaConfig.H>
00046 #include <TCP_sock.H>
00047 #include <SRPC.H>
00048 #include <VestaSource.H>
00049 #include <UniqueId.H>
00050 
00051 #include <sys/utsname.h>         // Note: Unix specific
00052 #include <sys/sysinfo.h>         // Note: Tru64 specific
00053 #if defined(__digital__)
00054 #  include <machine/hal_sysinfo.h> // Note: Tru64 specific
00055 #endif
00056 
00057 #include <sstream>
00058 
00059 #include "RunToolClient.H"
00060 #include "RunToolDaemon.H"
00061 #include "Launcher.h"
00062 
00063 extern "C" {
00064 #include "get_load.h"
00065 }
00066 
00067 using std::ios;
00068 using std::ifstream;
00069 using std::istringstream;
00070 using std::cout;
00071 using std::cin;
00072 using std::cerr;
00073 using std::endl;
00074 using FS::OFdStream;
00075 
00076 // Symbolic constants =======================================================
00077 
00078 const Text DEVNULL("/dev/null");
00079 const int UNUSED = SYSERROR; // indicates an unused slot in 'children'
00080 
00081 // Data structures for request processing ===================================
00082 
00083 static void close_fd(fd_t &fd) throw ();     // forward declaration
00084 
/* A 'std_info' object encapsulates a stream for relaying output
   (stdout or stderr) from a forked tool back to the Vesta evaluator.
   It is filled in from 'sock': an all-zero address and port selects
   /dev/null, anything else selects a TCP connection to that endpoint.
*/
class std_info {
  public:
    sockaddr_in sock;      // remote endpoint (evaluator IP/port)
    TCP_sock *s;           // TCP connection to 'sock' (NULL => /dev/null)
    fd_t f;                // corresponding file descriptor
                           
    std_info() throw ();   // initialization: no socket, no descriptor
    ~std_info() throw ();  // cleanup: deletes 's', or closes 'f' if 's' is NULL
};
00097 
00098 std_info::std_info() throw ()
00099 {
00100     s = NULL;
00101     f = NO_FD;
00102 }
00103 
00104 std_info::~std_info() throw ()
00105 {
00106     if (s == NULL) close_fd(f); 
00107     else delete s;
00108 }
00109 
00110 typedef Sequence<Basics::cond*> CondSeq;
00111 
00112 /* Since the RunToolServer is a single, multi-threaded process that
00113    forks child processes, and the Unix mechanism for discovering when
00114    children complete does not understand about threads, a shared data
00115    structure is needed to record all child processes and their
00116    completion status. A child is entered into this data structure by
00117    the thread that forks it, which then waits for it to complete with
00118    waitpid(2).
00119 
00120    If the client requesting the tool invocation drops the connection
00121    before the child process completes, the child process is killed by
00122    a background thread.  This data structure records that the child
00123    was killed which informs the waiting thread of the child's fate
00124    when it wakes up.  */
00125 class children {
00126   public:
00127     static void init(int running, int pending) throw ();
00128     /* Initialize the 'children' data structure. 'runninng' is the
00129        maximum number of concurrent child processes. 'pending' is the
00130        maximum number of tool invocations held pending until a slot is
00131        available. */
00132 
00133     static bool my_turn() throw(); 
00134     /* Called to decide whether a thread can start a tool to service a
00135        client request.  The return value indicates whether the caller
00136        can proceed.  A return value of false indicates that the
00137        maximum number of running and pending calls has been reached.
00138        May block for a period of time before returning. */
00139 
00140     static int wait_for(pid_t pid, SRPC *client, /*OUT*/ bool &killed)
00141       throw ();
00142     /* Enters the child process with ID 'pid' into this data structure
00143        and Blocks until it completes. Sets 'killed' to true or false
00144        as the child process was killed by the background "killer"
00145        thread or not. */
00146 
00147     static void killer_body() throw ();
00148     /* This procedure implements the body of a background thread that
00149        kills child processes whose corresponding clients have died. */
00150 
00151     static void call_failed() throw ();
00152     /* De-allocates a "running" slot allocated by my_turn in the event
00153        of failure to start the tool.  If a thread has called my_turn
00154        (above) and been allocated a slot for executing a tool (my_turn
00155        returned true), it must either call wait_for or this
00156        function. */
00157 
00158   private:
00159     // 'child_info' contains information about each child process
00160     class child_info {
00161       public:
00162         pid_t pid;            // pid defining a child process (may be UNUSED)
00163         SRPC *client;         // handle on connection to clien (may be NULL)
00164         bool killed;          // true iff child was killed by "killer" thread
00165 
00166         child_info() throw ()
00167         { this->pid = UNUSED; this->client = NULL;
00168           this->killed = false; }
00169     };
00170 
00171     // global fields
00172     static Basics::mutex mu;      // protects the following fields
00173     static int maxChildren;       // max running tools; set by 'init' method
00174     static int maxPending;        // max pending tools; set by 'init' method
00175     static child_info *info;      // array [0..maxChildren) of child_info
00176     static int nInUse;            // card(i | info[i].pid != UNUSED)
00177     static int nPending;          // Count of "pending" threads.
00178     static Basics::thread killer; // thread that kills children when
00179                                   // client dies
00180     static CondSeq pending;       // condition variables used to wake
00181                                   // up threads pending for their
00182                                   // turn.
00183 
00184     friend void Info(SRPC *srpc, int intfVersion, int procId) throw(SRPC::failure);
00185 };
00186 
/* A 'req_data' object holds the arguments of a single _run_tool
   request together with the descriptors and paths derived from them.
   Its destructor closes any descriptors that are still open.
*/
class req_data {
  public:
    // RPC arguments
    Text stdin_file;          // non-empty => set up stdin from here
    std_info out;             // relay for tool's stdout
    std_info err;             // relay for tool's stderr
    sockaddr_in fsroot_sock;  // socket for server for fsroot
    LongId fsroot_fh;         // file handle for fsroot
    Text wd;                  // fsroot-relative path for working directory
    chars_seq ev;             // vector of "env=val" strings
    chars_seq argv;           // command and arguments
                              
    // auxiliary structures
    fd_t stdin_fd;            // file descriptor for tool's stdin
    Text chroot_to;           // chroot to here
    fd_t error_pipe[2];       // [0] is read end, [1] is write end
      /* 'error_pipe' is a pipe for reporting errors from the forked child
         process back to the parent. It is only used to report errors that
         occur in the child code that prepares for the launching of the
         external tool (for example a failed chdir(2) or execv(2); the
         write end is also handed to the helper program, presumably so it
         can report a failed chroot(2)). */
                              
    req_data() throw ();      // initialization: all descriptors NO_FD
    ~req_data() throw ();     // cleanup: closes stdin_fd and both pipe ends
};
00215 
00216 req_data::req_data() throw ()
00217 {
00218     /* Note: the default constructors for Text fields and the
00219        fields 'ev' and 'args' do just what we want. */
00220     stdin_fd = NO_FD;
00221     error_pipe[0] = NO_FD;
00222     error_pipe[1] = NO_FD;
00223 }
00224 
00225 req_data::~req_data() throw ()
00226 {
00227     close_fd(stdin_fd);
00228     close_fd(error_pipe[0]);
00229     close_fd(error_pipe[1]);
00230 }
00231 
00232 // Global variables =========================================================
00233 
00234 // "children" class variables (see comments above)
00235 Basics::mutex children::mu;
00236 int children::maxChildren;
00237 int children::maxPending;
00238 int children::nInUse;
00239 int children::nPending;
00240 children::child_info *children::info;
00241 Basics::thread children::killer;
00242 CondSeq children::pending;
00243 
00244 // The following are constant after initialization by 'RunToolServerInit'
00245 static Text helper;                // name of "helper" program
00246 static chars_seq helper_switches;  // "helper" switches from config file
00247 static fd_t trouble_fd;            // error reporting channel
00248 static bool debug_mode = false;    // does 'helper_switches" contain "-d"?
00249 static Text volatile_root;         // repos volatile root from config file
00250 static Text sysname, release, version, machine;
00251 static FP::Tag uniqueid;
00252 static unsigned int cpus, cpuMHz, memKB;
00253 static int sync_after_tool = 1;
00254 
00255 // Utilities ================================================================
00256 
00257 struct call_failure { int code; Text msg; };
00258 
00259 static void call_failed(int c, const Text &m) throw (call_failure)
00260 {
00261     call_failure f;
00262     f.code = c;
00263     f.msg = m;
00264     throw(f);
00265 }
00266 
00267 static void fail_with_errno(const Text &m, int e = 0) throw (call_failure)
00268 {
00269     if (e == 0) e = errno;
00270     Text emsg = Basics::errno_Text(e);
00271     call_failed(implementation_error, m + " (" + emsg + ")");
00272 }
00273 
00274 static void PrintMsgWithErrno(const Text &msg) throw ()
00275 {
00276   int saved_errno = errno;
00277   cerr << "\n*** Error: " << msg;
00278   cerr << "("
00279        << Basics::errno_Text(saved_errno)
00280        << ")" << endl;
00281   cerr << "*** Continuing..." << endl;
00282 }
00283 
00284 static Text config_lookup(const Text &key) throw (call_failure)
00285 {
00286     try {
00287         Text val(VestaConfig::get_Text(RUN_TOOL_CONFIG_SECTION, key));
00288         return val;
00289     } catch (VestaConfig::failure f) {
00290         call_failed(implementation_error, f.msg);
00291     }
00292     return "";  // not reached
00293 }
00294 
00295 static void close_fd(fd_t &fd) throw()
00296 {
00297     if (fd == NO_FD) return;
00298     close(fd);
00299     fd = NO_FD;
00300 }
00301 
00302 static void move_fd(fd_t from_fd, fd_t to_fd) throw ()
00303 /* Renumbers the file descriptor 'from_fd' to 'to_fd' (preserving
00304    the reference count). */
00305 {
00306     dup2(from_fd, to_fd);
00307     close_fd(from_fd);
00308 }
00309 
00310 // Support routines for RunTool =============================================
00311 
00312 static void setup_stdin(/*INOUT*/ req_data &d) throw (call_failure)
00313 /* Open 'd.fd' on 'd.stdin_file', or on DEVNULL if 'd.stdin_file' is
00314    the empty text. */
00315 {
00316     const Text f(d.stdin_file.Empty() ? DEVNULL : d.stdin_file);
00317     d.stdin_fd = open(f.cchars(), O_RDONLY, 0);
00318     if (d.stdin_fd < 0) fail_with_errno("Can't open stdin file " + f);
00319 }
00320 
00321 static void setup_std_info(/*INOUT*/ std_info &i)
00322   throw (call_failure, TCP_sock::failure)
00323 /* Set 'i.s' and 'i.f' from 'i.sock'. */
00324 {
00325     if (i.sock.sin_addr.s_addr == 0 && i.sock.sin_port == 0) {
00326         i.s = NULL;
00327         i.f = open(DEVNULL.cchars(), O_WRONLY, 0);
00328         if (i.f == SYSERROR) {
00329             fail_with_errno("Can't open " + DEVNULL + " for std{out,err}");
00330         }
00331     } else {
00332         // Create a TCP socket; connect it remotely
00333       i.s = NEW(TCP_sock);
00334         i.s->connect_to(i.sock);
00335         i.f = i.s->get_fd();
00336     }
00337 }
00338 
00339 // Note that simulate_child_failure must be async-signal-dafe: no
00340 // memory allocations, formatted I/O, etc.  We also avoid C++
00341 // exceptions, as those are probably not safe either.
00342 static void simulate_child_failure(fd_t error_wr, fd_t fallback_wr,
00343                                    int r,
00344                                    const char *msg,
00345                                    int e = 0)
00346      throw ()
00347 {
00348   // Determine the errno, if any, and try to convert it to a useful
00349   // text message.
00350   if (e == 0) e = errno;
00351   Text emsg = Basics::errno_Text(e);
00352 
00353   // First try to write the error message to the error file
00354   // descriptor.
00355   if((write(error_wr, msg, strlen(msg)) == SYSERROR) ||
00356      (write(error_wr, " (", 2) == SYSERROR) ||
00357      (write(error_wr, emsg.cchars(), emsg.Length()) == SYSERROR) ||
00358      (write(error_wr, ")\n", 2) == SYSERROR) ||
00359      ((fsync(error_wr) == SYSERROR) &&
00360       // We ignore errors from fsync which can occur if we're writing
00361       // to a non-file (i.e. a pipe), which we probably are.
00362       (errno != EINVAL)))
00363     {
00364       // If we can't do that, try to write the error message to the
00365       // fall-back file descriptor.  In these we ignore errors,
00366       // because we're out of options if these fail.
00367       write(fallback_wr, msg, strlen(msg));
00368       write(fallback_wr, " (", 2);
00369       write(fallback_wr, emsg.cchars(), emsg.Length());
00370       write(fallback_wr, ")\n", 2);
00371       fsync(fallback_wr);
00372     }
00373 
00374   // Always exit with the status code passed to us.  Note that we must
00375   // use _exit(2), not exit(3), as exit(3) can call code which is not
00376   // aync-signal-safe.
00377   _exit(r);
00378 }
00379 
00380 // Implementation of class 'children' =======================================
00381 
00382 static void *children_killer_body(void *arg) throw ()
00383 /* Wrapper for service thread body; simply calls 'children::killer_body'. */
00384 {
00385     children::killer_body();
00386     return (void *)NULL;
00387 }
00388 
00389 void children::init(int running, int pending) throw ()
00390 {
00391     children::nInUse = 0;
00392     children::nPending = 0;
00393     children::maxChildren = running;
00394     children::maxPending = pending;
00395     children::info = NEW_ARRAY(child_info, running);
00396     children::killer.fork_and_detach(children_killer_body, NULL);
00397 }
00398 
00399 bool children::my_turn() throw()
00400 {
00401   bool result;
00402 
00403   children::mu.lock();
00404   
00405   // Compute the number of running or soon-to-be-running threads.
00406   int nRunning =
00407     // Number currently running
00408     children::nInUse +
00409     // Number that have been signaled to wake up and run but haven't
00410     // been scheduled yet.
00411     (children::nPending - children::pending.size());
00412   assert(children::nPending >= children::pending.size());
00413   assert(nRunning >= 0);
00414 
00415   // If we can run more tools, return immediately.
00416   if(nRunning < children::maxChildren)
00417     {
00418       // For this to be true, all pending threads should have at least
00419       // been signaled.
00420       assert(children::pending.size() == 0);
00421 
00422       // There's one more thread running, and the caller can proceed.
00423       assert(children::nInUse < children::maxChildren);
00424       children::nInUse++;
00425       result = true;
00426     }
00427   // If we're below the pending limit, wait our turn.
00428   else if(children::pending.size() < children::maxPending)
00429     {
00430       // We're now waiting.
00431       children::nPending++;
00432 
00433       // There better be somebody to wake us up when they're done.
00434       assert(nRunning > 0);
00435 
00436       // When it's our turn, a thread completing a tool will signal
00437       // this condition variable for us.
00438       Basics::cond c;
00439       children::pending.addhi(&c);
00440       c.wait(children::mu);
00441 
00442       // There's now one one less thread pending, more thread running,
00443       // and the caller can proceed.
00444       assert(children::nInUse < children::maxChildren);
00445       children::nPending--;
00446       children::nInUse++;
00447       result = true;
00448     }
00449   // Too many requests: the caller doesn't get a turn.
00450   else
00451     {
00452       result = false;
00453     }
00454 
00455   children::mu.unlock();
00456 
00457   return result;
00458 }
00459 
00460 int children::wait_for(pid_t pid, SRPC *client, /*OUT*/ bool &killed) throw ()
00461 /* This routine is called by the parent thread. It blocks until the child
00462    process 'pid' completes, and then returns the completion status. 'client'
00463    should be a handle on the connection back to the client that initiated
00464    the tool; if the connection back to the client dies, the child will be
00465    killed by the background "killer" thread, in which case 'killed' will
00466    be set to true. */
00467 {
00468     int status;      // completion status
00469     int slot = -1;   // index of an empty slot
00470 
00471     children::mu.lock();
00472 
00473     for(int i = 0; (i < children::maxChildren) && (slot < 0); i++)
00474       {
00475         if(children::info[i].pid == UNUSED)
00476           {
00477             slot = i; // slot to use for this child
00478           }
00479       }
00480     assert(slot >= 0);
00481 
00482     // Add this child to 'children'
00483     assert(info[slot].pid == UNUSED);
00484     children::info[slot].pid = pid;
00485     children::info[slot].client = client;
00486 
00487     // Release our lock on the child data before waiting for process
00488     // completion.
00489     children::mu.unlock();
00490 
00491     // wait until child completes
00492     pid_t done_pid = waitpid(pid, &status, 0);
00493     // If there was an error, report it.
00494     if(done_pid == SYSERROR)
00495     {
00496       int l_errno = errno;
00497       cerr << "waitpid(pid = " << pid << ") returned an error: "
00498            << strerror(l_errno) << " (" << l_errno << ")"
00499            << endl;
00500     }
00501     // ...but we don't expect an error.
00502     assert(done_pid == pid);
00503 
00504     // Re-acquire a lock on the child data so we can free up our slot
00505     // and complete.
00506     children::mu.lock();
00507 
00508     // extract exit status and clean up
00509     killed = children::info[slot].killed;
00510     children::info[slot].pid = UNUSED;  // mark slot free
00511     children::nInUse--;
00512     children::info[slot].client = NULL; // drop SRPC object for GC
00513     children::info[slot].killed = false;
00514 
00515     // If there are some pending requests, wake up the next one in
00516     // line.
00517     if(children::pending.size() > 0)
00518       {
00519         Basics::cond *c = children::pending.remlo();
00520         assert(c != 0);
00521         c->signal();
00522       }
00523     assert(children::nPending >= children::pending.size());
00524 
00525     children::mu.unlock();
00526     return status;
00527 } // children::wait_for
00528 
00529 void children::killer_body() throw ()
00530 /* This is the body of a background "killer" thread. This thread is needed
00531    to kill off child processes when the client that initiated them dies.
00532    It works by periodically walking over the 'children' list. Any child
00533    whose corresponding 'client' connection has died is then killed using
00534    the kill(2) system call. The data structure is updated and the appropriate
00535    parent thread is signalled. */
00536 {
00537     while (true) {
00538         // pause for 30 seconds
00539         Basics::thread::pause(30);
00540 
00541         // look for children to kill
00542         children::mu.lock();
00543         for (int i = 0; i < children::maxChildren; i++) {
00544             child_info *slot = children::info + i; // local
00545             if (slot->pid != UNUSED && slot->client != NULL) {
00546                 // test if the connection to the client is still alive
00547                 if (!slot->client->alive()) {
00548                     // kill child process group
00549                     int sig = (slot->killed) ? SIGKILL : SIGTERM;
00550                     if (kill(-slot->pid, sig) != 0) {
00551                       // This kill() can fail if we lose a race with
00552                       // the process dying in some other way.  In that
00553                       // case wait_for will receive the process's exit
00554                       // status and clean up as soon as it gets a
00555                       // chance to run, so we do nothing here.
00556                     } else {
00557                       slot->killed = true;
00558                     }
00559                 }
00560             }
00561         }
00562         children::mu.unlock();
00563     }
00564 } // children::killer_body
00565 
00566 void children::call_failed() throw ()
00567 {
00568   // Acquire the lock and decrement the "in use" counter for this
00569   // thread that failed.
00570   children::mu.lock();
00571   children::nInUse--;
00572 
00573   // If there are some pending requests, wake up the next one in
00574   // line.
00575   if(children::pending.size() > 0)
00576     {
00577       Basics::cond *c = children::pending.remlo();
00578       assert(c != 0);
00579       c->signal();
00580     }
00581   assert(children::nPending >= children::pending.size());
00582 
00583   // Release the lock
00584   children::mu.unlock();
00585 }
00586 
00587 // Main request handler =====================================================
00588 
00589 static void RunTool(SRPC *srpc, int intfVersion) throw(SRPC::failure)
00590 {
00591   // Have we acquired and not yeat released an "in use" slot?
00592   bool in_use_slot = false;
00593 
00594     try {
00595         req_data d; // the main request object for this call
00596 
00597         /* Unmarshal the arguments into 'd'. See RunToolClientPrivate.H for
00598            the calling sequence. */
00599         srpc->recv_Text(/*OUT*/ d.stdin_file);
00600         d.out.sock = srpc->recv_socket();
00601         d.err.sock = srpc->recv_socket();
00602         d.fsroot_sock = srpc->recv_socket();
00603         int len = sizeof(d.fsroot_fh);
00604         srpc->recv_bytes_here(/*OUT*/ (char *)&d.fsroot_fh, /*INOUT*/ len);
00605         if (len != sizeof(d.fsroot_fh)) {
00606             call_failed(implementation_error, "Invalid fsroot");
00607         }
00608         srpc->recv_Text(/*OUT*/ d.wd);
00609         srpc->recv_chars_seq(/*OUT*/ d.ev);
00610         srpc->recv_chars_seq(/*OUT*/ d.argv);
00611         if (d.argv[0] == NULL) {
00612             call_failed(implementation_error, "Null command");
00613         }
00614         srpc->recv_end();
00615 
00616         // Wait for it to be our turn to run.
00617         if(!children::my_turn())
00618           {
00619             // Limit on running/pending requests reached: deny this
00620             // request.
00621             call_failed(client_error, "request denied: server too busy");
00622           }
00623         // We've now been given an "in use" slot.
00624         in_use_slot = true;
00625     
00626         /* Ugly way to find out where to chroot to.  We do it this way for
00627            historical reasons, to avoid changing too many interfaces. */
00628         unsigned int index;
00629         (void) d.fsroot_fh.getParent(&index);
00630         char arc[(sizeof(index) * 2) + 1];
00631         sprintf(arc, "%08x", index);
00632         d.chroot_to = volatile_root + PathnameSep + arc;
00633     
00634         // Set up input/output channels
00635         setup_stdin(/*INOUT*/ d);
00636         setup_std_info(/*INOUT*/ d.out);
00637         setup_std_info(/*INOUT*/ d.err);
00638 
00639         // Pause if appropriate environment variable is set
00640         if (getenv("STOP_BEFORE_TOOL")) {
00641             cerr << "Stop before tool; root = " << d.chroot_to << endl;
00642             cerr << "Hit enter to continue: ";
00643             char enter;
00644             cin.get(enter);
00645         }
00646     
00647         // Create pipe as error-reporting channel for child
00648         if (pipe(d.error_pipe) == SYSERROR) {
00649             fail_with_errno("pipe call failed!");
00650         }
00651     
00652         // ------------------------------------------------------------
00653         // Perform setup needed by the forked child.  The "Single UNIX
00654         // Specification" says that a child forked from a
00655         // multi-threaded program "may only execute async-signal-safe
00656         // operations until such time as one of the exec functions is
00657         // called".  This means that we can't perform memory
00658         // allocations or anything else fancy in the child, so we do
00659         // all that stuff now.
00660 
00661         // Setup working directory we will chdir to.
00662         Text wd(d.chroot_to + PathnameSep + d.wd);
00663 
00664         // Setup the error message text to be used if chdir fails.
00665         Text chdir_emsg("Can't establish working directory " + wd);
00666 
00667         /* Now set up to run a "helper" program (typically the
00668            "tool_launcher" program), which is a small executable
00669            that runs setuid-ed to root so that it can perform the
00670            chroot(2) operation.  The essence of this program is the
00671            following actions:
00672 
00673 
00674            (1) chroot(d.chroot_to);
00675            (2) seteuid(geteuid()); // become unprivileged
00676            (3) execve(d.argv[0], d.argv, d.ev)
00677 
00678            We use execv(2) to invoke this "helper" program, passing
00679            it the two necessary pieces of information (mnt_pt, argv)
00680            on the command line. Here 'argv' is a a command-line
00681            encoding of: the helper switches, the file descriptor of
00682            the error reporting channel, 'd.chroot_to', 'd.ev',
00683            and 'd.argv'. */
00684         char **helper_cmd =
00685           NEW_ARRAY(char *, 
00686                     1+                        // helper name
00687                     helper_switches.length()+ // switches from config file
00688                     2+                        // -err <arg>
00689                     2+                        // -root <arg>
00690                     1+d.ev.length()+          // -env <list>
00691                     1+d.argv.length()+        // -cmd <list>
00692                     1);                       // terminator      
00693         int i = 0; // current arg
00694         helper_cmd[i++] = helper.chars();
00695         int j;
00696         for (j = 0; j < helper_switches.length(); /*SKIP*/) {
00697           helper_cmd[i++] = helper_switches[j++];
00698         }
00699         helper_cmd[i++] = "-err";
00700         char error_wr[20];
00701         sprintf(error_wr, "%d", d.error_pipe[1]);
00702         helper_cmd[i++] = error_wr;
00703         helper_cmd[i++] = "-root";
00704         helper_cmd[i++] = d.chroot_to.chars();
00705         helper_cmd[i++] = "-env";
00706         for (j = 0; j < d.ev.length(); /*SKIP*/) {
00707           helper_cmd[i++] = d.ev[j++];
00708         }
00709         helper_cmd[i++] = "-cmd";
00710         for (j = 0; j < d.argv.length(); /*SKIP*/) {
00711           helper_cmd[i++] = d.argv[j++];
00712         }
00713         helper_cmd[i++] = NULL;
00714         // ------------------------------------------------------------
00715 
00716         // Fork subprocess to complete preparations and execute command
00717         pid_t child_pid = fork();
00718         switch (child_pid) {
00719           case -1:
00720             fail_with_errno("Couldn't fork process to execute command");
00721             break;                      // keep compiler happy
00722 
00723           // child code -----------------------------------------------------
00724           case 0:
00725             // Start a new process group
00726             ::setpgid(0, 0);
00727 
00728             // Discard read end of error_pipe
00729             close_fd(d.error_pipe[0]);
00730       
00731             // Establish file descriptors.
00732             move_fd(d.stdin_fd, STDIN_FILENO);
00733             move_fd(d.out.f, STDOUT_FILENO);
00734             move_fd(d.err.f, STDERR_FILENO);
00735       
00736             /* Change working directory to be at the specified path
00737                from the volatile root.. */
00738             if (chdir(wd.chars()) == SYSERROR) {
00739               simulate_child_failure(d.error_pipe[1], trouble_fd,
00740                                      implementation_error,
00741                                      chdir_emsg.cchars());
00742             }
00743 
00744             // run the "helper" program
00745             execv(helper_cmd[0], helper_cmd);
00746 
00747             // execution continues only if execv failed
00748             simulate_child_failure(d.error_pipe[1], trouble_fd,
00749                                    configuration_error,
00750                                    "Can't exec helper");
00751 
00752             // Should be unreachable, but just to be safe...
00753             _exit(implementation_error);
00754 
00755             // unreachable
00756             break;
00757 
00758           // parent code ----------------------------------------------------
00759           default:
00760             // Write debugging information
00761             if (debug_mode) {
00762                 OFdStream os(trouble_fd);
00763                 os << "\n*** Launching child process " << child_pid
00764                    << " ***" << endl;
00765             }
00766       
00767             // Discard write end of error_pipe
00768             close_fd(d.error_pipe[1]);
00769       
00770             // wait for child to finish
00771             bool child_killed;
00772             int s = children::wait_for(child_pid, srpc, /*OUT*/ child_killed);
00773             // children::wait_for released our "in use" slot.
00774             in_use_slot = false;
00775 
00776             // post debugging
00777             if (debug_mode) {
00778                 if (child_killed) {
00779                     cerr << "*** Child process " << child_pid
00780                          << " killed due to client termination ***" << endl;
00781                 } else {
00782                     cerr << "*** Finished child process " << child_pid
00783                          << ": exit status " << WEXITSTATUS(s)
00784                          << ", signal " << WTERMSIG(s) << " ***" << endl;
00785                 }
00786             }
00787 
00788             // pause execution if appropriate environment variables set
00789             if (getenv("STOP_AFTER_TOOL")) {
00790                 cerr << "Stop after tool; root = " << d.chroot_to << endl;
00791                 cerr << "Hit enter to continue: ";
00792                 char enter;
00793                 cin.get(enter);
00794             } else if (WIFSIGNALED(s) && getenv("STOP_AFTER_TOOL_SIGNALED")) {
00795                 cerr << "Stop after tool signaled " << WTERMSIG(s)
00796                      << "; root = " << d.chroot_to << endl;
00797                 cerr << "Hit enter to continue: ";
00798                 char enter;
00799                 cin.get(enter);
00800             } else if (WIFEXITED(s) && WEXITSTATUS(s) &&
00801                        getenv("STOP_AFTER_TOOL_ERROR")) {
00802                 cerr << "Stop after tool error " << WEXITSTATUS(s)
00803                      << "; root = " << d.chroot_to << endl;
00804                 cerr << "Hit enter to continue: ";
00805                 char enter;
00806                 cin.get(enter);
00807             }
00808 
00809             // Check for error message reported on 'd.error_pipe'
00810             Text msg("");
00811             while (true) {
00812                 char buf[100];
00813                 int ct = read(d.error_pipe[0], buf, sizeof(buf)-1);
00814                 if (ct == 0) break;
00815                 if (ct == SYSERROR)
00816                     fail_with_errno("Can't read from error_pipe!");
00817                 buf[ct] = 0;
00818                 msg += buf;
00819             }
00820             close_fd(d.error_pipe[0]);
00821       
00822             // handle error condition
00823             if (!msg.Empty()) {
00824                 /* Tool was not launched; 's' explains why, and a
00825                    human-readable message is in 'msg'. */
00826                 int r;
00827                 switch (WEXITSTATUS(s)) {
00828                   case unrecognized_switch:
00829                   case chroot_failure:
00830                   case wd_not_in_root:
00831                     r = configuration_error;
00832                     break;
00833                   case execve_failure:
00834                     r = client_error;
00835                     break;
00836                   default:
00837                     r = implementation_error;
00838                     break;
00839                 }
00840                 call_failed(r, msg);
00841                 // does not return
00842             }
00843       
00844             // Tool was launched and has terminated.
00845 
00846             int i = sync_after_tool;
00847             while (i--) {
00848               // There is evidence that the Tru64 kernel is willing to
00849               // tell the parent that a child is done before the NFS
00850               // write-through-on-close for all the child's files has
00851               // really been flushed to the server.  Optionally try to
00852               // avoid this problem by calling sync().  This is
00853               // expensive on a shared machine with other file writing
00854               // going on, so we would rather not do it, but it might
00855               // be necessary.  Note that even the man page for sync
00856               // does not guarantee that all the writes are really
00857               // done when it returns, but some implementations may
00858               // ensure this anyway.  I don't know for sure about
00859               // Tru64.  Historically, people called sync() repeatedly
00860               // to work around this problem, but I don't believe that
00861               // really guarantees anything either; certainly the man
00862               // page doesn't mention it.
00863               ::sync();
00864             }
00865 
00866             // Return results to caller.
00867             srpc->send_int(WIFEXITED(s) ? WEXITSTATUS(s) : 0);
00868             srpc->send_int(WIFSIGNALED(s) ? WTERMSIG(s) : 0);
00869             if(intfVersion >= RUN_TOOL_CORE_VERSION) {
00870                     srpc->send_bool(WCOREDUMP(s) ? true : false);
00871             }
00872             srpc->send_end();
00873             break;
00874         } /* switch */
00875 
00876         // Free the command-line for invoking the helper.
00877         delete [] helper_cmd;
00878         helper_cmd = NULL; // help out the garbage collector
00879     } /* try */
00880     catch(const TCP_sock::failure &tf) {
00881         // If we're still holding an "in use" slot, release it now.
00882         if(in_use_slot)
00883           {
00884             children::call_failed();
00885           }
00886 
00887         srpc->send_failure(implementation_error, "TCP failure: " + tf.msg);
00888     }
00889     catch(const call_failure &cf) {
00890         // If we're still holding an "in use" slot, release it now.
00891         if(in_use_slot)
00892           {
00893             children::call_failed();
00894           }
00895 
00896         srpc->send_failure(cf.code, cf.msg);
00897     }
00898     catch (SRPC::failure) {
00899         // If we're still holding an "in use" slot, release it now.
00900         // (I don't believe this case is actually possible, but it
00901         // doesn't hurt.)
00902         if(in_use_slot)
00903           {
00904             children::call_failed();
00905           }
00906 
00907         throw;
00908     }
00909 
00910     // We should never reach this point and still be holding an "in
00911     // use" slot.
00912     assert(!in_use_slot);
00913 }
00914 
00915 void Info(SRPC *srpc, int intfVersion, int procId) throw(SRPC::failure)
00916 {
00917     float load = GetLoadPoint();
00918     children::mu.lock();
00919     int max_tools = children::maxChildren;
00920     int cur_tools, cur_pending, max_pending;
00921     if(intfVersion >= RUN_TOOL_LOAD_VERSION) {
00922             cur_tools = children::nInUse;
00923             cur_pending = children::nPending;
00924             max_pending = children::maxPending;
00925     } else {
00926             cur_tools = children::nInUse + children::nPending;
00927     }
00928     children::mu.unlock();
00929 
00930     try {
00931         /* See RunToolClientPrivate.H for the calling sequence. */
00932 
00933         /* No arguments */
00934         srpc->recv_end();
00935     
00936         /* Return results */
00937         srpc->send_Text(sysname);
00938         srpc->send_Text(release);
00939         srpc->send_Text(version);
00940         srpc->send_Text(machine);
00941         srpc->send_int(cpus);
00942         srpc->send_int(cpuMHz);
00943         srpc->send_int(memKB);
00944         srpc->send_int(max_tools);
00945         srpc->send_int(cur_tools);
00946         if (intfVersion >= RUN_TOOL_LOAD_VERSION) {
00947                 srpc->send_float(load);
00948                 srpc->send_int(cur_pending);
00949                 srpc->send_int(max_pending);
00950         }
00951         if (procId != RUN_TOOL_OLDINFO) uniqueid.Send(*srpc);
00952         srpc->send_end();
00953 
00954     } /* try */
00955     catch(const TCP_sock::failure &tf) {
00956         srpc->send_failure(implementation_error, "TCP failure: " + tf.msg);
00957     }
00958 }
00959 
00960 void 
00961 RunToolServer(SRPC *srpc, int intfVersion, int procId, /*UNUSED*/void *arg)
00962   throw(SRPC::failure)
00963 /* This is the server for the RunToolServer interface. It expects to
00964    be invoked via the LimService interface, which establishes the RPC
00965    connection with the client. */
00966 {
00967   
00968   if(intfVersion > RUN_TOOL_INTERFACE_VERSION) { 
00969     srpc->send_failure(SRPC::version_skew,
00970                        "RunToolServer: Unsupported interface version");
00971     return;
00972   }
00973 
00974   /* 'arg' is unused; all information comes via the RPC connection. */
00975   switch (procId) {
00976   case RUN_TOOL_DOIT:
00977   case SRPC::default_proc_id:
00978     RunTool(srpc, intfVersion);
00979     break;
00980     
00981   case RUN_TOOL_OLDINFO:
00982   case RUN_TOOL_INFO:
00983     Info(srpc, intfVersion, procId);
00984     break;
00985 
00986   default:
00987     srpc->send_failure(SRPC::version_skew, "Unknown proc_id");
00988     break;
00989   }
00990 }
00991 
00992 #if defined(__linux__)
00993 
// get_hw_clock_freq: Attempt to discover the system clock rate under
// Linux by scanning /proc/cpuinfo.

// This is an ugly hack, but we haven't found a standard, architecture
// independent way to get this information.

// Returns the clock rate in MHz.  If /proc/cpuinfo can't be opened or
// read, or contains no usable (non-zero) clock rate, returns 1.  The
// result is therefore never 0, which RunToolServerInit asserts.

static unsigned int get_hw_clock_freq()
{
  std::ifstream pstr("/proc/cpuinfo");

  // A failed open sets failbit rather than badbit, so test the whole
  // stream state.  (Scanning a stream that never opened would
  // otherwise spin forever in the long-line handling below.)
  if(!pstr)
    {
      return 1;
    }

  char line[1000];

  // Look for a line like one of these:

  // cycle frequency [Hz]    : 432900432
  // cpu MHz         : 399.068
  // clock           : 375MHz

  // Then pull out the number, convert it to MHz (rounding to the
  // nearest MHz for the value given in Hz, dropping any fractional
  // part otherwise) and return it.
  for(pstr.getline(line, sizeof(line));
      !pstr.eof() && !pstr.bad();
      pstr.getline(line, sizeof(line)))
    {
      // Clock rate found on this line, in MHz (0 = none found).
      unsigned long l_mhz = 0;

      // Alpha Linux style
      // cycle frequency [Hz]    : 432900432
      if(strncmp(line, "cycle frequency [Hz]", 20) == 0)
        {
          // Look for a number
          const char *l_digits = strpbrk(line+20, "0123456789");
          if(l_digits != NULL)
            {
              // Convert Hz to MHz, rounding to nearest.  strtoul
              // (unlike atoi) has defined behavior for values that
              // don't fit in an int, and splitting the division
              // avoids overflowing while rounding.
              const unsigned long l_hz = strtoul(l_digits, NULL, 10);
              l_mhz = (l_hz / 1000000)
                + (((l_hz % 1000000) >= 500000) ? 1 : 0);
            }
        }
      // x86 Linux style
      // cpu MHz         : 399.068
      else if(strncmp(line, "cpu MHz", 7) == 0)
        {
          // Look for a number; parsing stops at the decimal point.
          const char *l_digits = strpbrk(line+7, "0123456789");
          if(l_digits != NULL)
            {
              l_mhz = strtoul(l_digits, NULL, 10);
            }
        }
      // PPC Linux style
      // clock           : 375MHz
      else if((strncmp(line, "clock", 5) == 0) &&
              isspace(static_cast<unsigned char>(line[5])))
        {
          // Look for a number; parsing stops at the "MHz" suffix.
          const char *l_digits = strpbrk(line+6, "0123456789");
          if(l_digits != NULL)
            {
              l_mhz = strtoul(l_digits, NULL, 10);
            }
        }

      // Only a non-zero rate is a useful answer; otherwise keep
      // looking in case a later line does better.
      if(l_mhz > 0)
        {
          return static_cast<unsigned int>(l_mhz);
        }

      // Handle lines longer than our buffer.  failbit without eofbit
      // or badbit means getline filled the buffer before reaching the
      // end of the line.
      while(pstr.fail() && !pstr.eof() && !pstr.bad())
        {
          // Clear the failbit
          pstr.clear(pstr.rdstate() & ~std::ios::failbit);
          // Read the next chunk of the line, possibly still too
          // little to complete it, hence the enclosing while.
          pstr.getline(line, sizeof(line));
        }
    }

  // If we fail to find something which tells us the CPU speed, return
  // 1.
  return 1;
}
01079 
// get_n_cpus: Return the number of CPUs in this machine under Linux.

// It seems that the glibc function get_nprocs parses /proc/cpuinfo
// and is broken on non-x86 architectures.  See:

// http://www.geocrawler.com/mail/msg.php3?msg_id=3052218&list=16

// This function uses a similar method suggested in the above message:
// parsing /proc/stat which is apparently not as architecture
// dependent as /proc/cpuinfo.  Specifically, on SMP machines it
// contains one line per-CPU that starts with "cpu[0-9]".  On
// uni-processor machines, no such lines will be found.

// Returns the number of such lines, or 1 if there are none or if
// /proc/stat can't be opened.

static unsigned int get_n_cpus()
{
  // Open /proc/stat.
  std::ifstream l_proc_stat("/proc/stat");

  // A failed open sets failbit; without this check the long-line
  // handling below would clear failbit and retry forever.
  if(!l_proc_stat)
    {
      return 1;
    }

  // We'll count CPUs in this variable.
  unsigned int l_result = 0;

  // Read one line at a time, stopping at end of file or on a read
  // error.
  char l_line[1000];
  for(l_proc_stat.getline(l_line, sizeof(l_line));
      !l_proc_stat.eof() && !l_proc_stat.bad();
      l_proc_stat.getline(l_line, sizeof(l_line)))
    {
      // Count lines which start with cpu[0-9]
      if((strncmp(l_line, "cpu", 3) == 0) &&
         isdigit(static_cast<unsigned char>(l_line[3])))
        {
          l_result++;
        }

      // Handle lines longer than our buffer.  failbit without eofbit
      // or badbit means getline filled the buffer before reaching the
      // end of the line.
      while(l_proc_stat.fail() && !l_proc_stat.eof() && !l_proc_stat.bad())
        {
          // Clear the failbit
          l_proc_stat.clear(l_proc_stat.rdstate() & ~std::ios::failbit);
          // Read the next chunk of the line, possibly still too
          // little to complete it, hence the enclosing while.
          l_proc_stat.getline(l_line, sizeof(l_line));
        }
    }

  // If we found any lines starting with cpu[0-9], then the number we
  // found is the number of CPUs.  If we didn't find any, this must be
  // a uni-processor.
  return (l_result > 0) ? l_result : 1;
}
01134 
01135 #endif // defined (__linux__)
01136 
01137 // Initialization routine ===================================================
01138 
01139 void RunToolServerInit(int maxTools, int maxPending) throw(SRPC::failure)
01140 {
01141     /* We tuck away a file descriptor for stderr in 'trouble_fd' so that
01142        it can be used as an emergency error reporting channel.  This channel
01143        is used to report problems detected in the Vesta implementation that
01144        occur after the forked launcher/tool process has set its stderr in
01145        preparation for the tool execution and before the tool has actually
01146        begun execution. */
01147     try {
01148         if ((trouble_fd = dup(STDERR_FILENO)) == SYSERROR) {
01149             fail_with_errno(
01150               "Can't get file descriptor to report launcher errors");
01151         }
01152     } catch (const call_failure &f) {
01153         cerr << f.msg << endl;
01154         return;
01155     }
01156   
01157     // Get local operating system and hardware information
01158     static struct utsname u;
01159     uname(&u);
01160     sysname = u.sysname;
01161     release = u.release;
01162     version = u.version;
01163     machine = u.machine;
01164 
01165 #if defined(__digital__)    
01166     // This part is Tru64 specific
01167     struct cpu_info cpuinfo;
01168     int start = 0;
01169     getsysinfo(GSI_CPU_INFO, (caddr_t) &cpuinfo, sizeof(cpuinfo),
01170                &start, NULL, NULL);
01171     cpus = cpuinfo.cpus_in_box;
01172     cpuMHz = cpuinfo.mhz;
01173     start = 0;
01174     getsysinfo(GSI_PHYSMEM, (caddr_t) &memKB, sizeof(memKB),
01175                &start, NULL, NULL);
01176 #elif defined (__linux__)
01177     // Getting the system info we need requires some ugly hackery on
01178     // Linux.  We can get "memKB" by calling sysconf(_SC_PHYS_PAGES)
01179     // and sysconf(_SC_PAGE_SIZE).  For cpuMHz we call
01180     // get_hw_clock_freq to scan /proc/cpuinfo looking for a line
01181     // which that tells us the CPU speed.  For cpus, we call
01182     // get_n_cpus to scan /proc/stat looking for the per-processor
01183     // lines present on SMP machines.
01184 
01185     cpus = get_n_cpus();
01186     cpuMHz = get_hw_clock_freq();
01187     memKB = ((((unsigned long) sysconf(_SC_PAGE_SIZE)) *
01188               ((unsigned long) sysconf(_SC_PHYS_PAGES)))
01189              / 1024);
01190     
01191     // Those values really should all be above 0.
01192     assert(cpus > 0);
01193     assert(cpuMHz > 0);
01194     assert(memKB > 0);
01195 #endif
01196 
01197     // Allow overrides for the bootstrapping case, where the helper
01198     // (tool launcher) runs the tools on a different machine.
01199     VestaConfig::get(RUN_TOOL_CONFIG_SECTION, "sysname", sysname);
01200     VestaConfig::get(RUN_TOOL_CONFIG_SECTION, "release", release);
01201     VestaConfig::get(RUN_TOOL_CONFIG_SECTION, "version", version);
01202     VestaConfig::get(RUN_TOOL_CONFIG_SECTION, "machine", machine);
01203     try {
01204       cpus = VestaConfig::get_int(RUN_TOOL_CONFIG_SECTION, "cpus");
01205     } catch (VestaConfig::failure) { /* ignore */ }
01206     try {
01207       cpuMHz = VestaConfig::get_int(RUN_TOOL_CONFIG_SECTION, "cpuMHz");
01208     } catch (VestaConfig::failure) { /* ignore */ }
01209     try {
01210       memKB = VestaConfig::get_int(RUN_TOOL_CONFIG_SECTION, "memKB");
01211     } catch (VestaConfig::failure) { /* ignore */ }
01212 
01213     // Invent a unique ID for this server instance
01214     uniqueid = UniqueId();
01215 
01216     // Get other Vesta configuration information
01217     helper = config_lookup("helper");
01218     try {
01219         Text val(VestaConfig::get_Text(RUN_TOOL_CONFIG_SECTION,
01220                                        "helper_switches"));
01221         istringstream is(val.chars());
01222         char s[100];
01223         while ( is >> s && strlen(s) != 0) {
01224             if (strcmp(s, "-d") == 0) {
01225                 int dbgfd;
01226                 try {
01227                     if ((dbgfd = dup(STDERR_FILENO)) == SYSERROR) {
01228                         fail_with_errno(
01229                           "Can't get file descriptor for debug output");
01230                     }
01231                 } catch (call_failure f) {
01232                     cerr << f.msg << '\n';
01233                     continue;
01234                 }
01235                 debug_mode = true;
01236                 helper_switches += "-dbg";
01237                 sprintf(s, "%d", dbgfd);
01238             }
01239             helper_switches += s;
01240         }
01241     } catch(const VestaConfig::failure &f) {
01242         /* ignore */
01243     }
01244     try {
01245       sync_after_tool = VestaConfig::get_int(RUN_TOOL_CONFIG_SECTION,
01246                                              "sync_after_tool");
01247     } catch(const VestaConfig::failure &f) {
01248         /* ignore */
01249     }
01250     if (debug_mode) {
01251         cout << "helper command = " << helper;
01252         for (int i = 0; i < helper_switches.length(); i++) {
01253             cout << " " << helper_switches[i];
01254         }
01255         cout << endl;
01256     }
01257 
01258     volatile_root = config_lookup("VolatileRootName");
01259     // Check that repository is mounted
01260     struct stat st;
01261     if (stat(volatile_root.cchars(), &st) < 0) {
01262       cerr << "Can't access volatile root (not mounted?): "
01263            << volatile_root + ": " << strerror(errno) << endl;
01264       exit(1);
01265     }
01266 
01267     children::init(maxTools, maxPending);
01268 }       
01269 
/* RunToolServerCleanup: counterpart of RunToolServerInit, provided as
   a hook for shutting the server down.  It currently has nothing to
   do and never throws. */
void RunToolServerCleanup() throw ()
{
    // Intentionally empty: no platform needs any cleanup work here.
}