Author: buckley
Date: Thu Dec 30 12:51:05 2010
New Revision: 2866

Adding a run timeout option, inspired by crappy LCG


Modified: trunk/ChangeLog
--- trunk/ChangeLog	Wed Dec 29 23:47:06 2010	(r2865)
+++ trunk/ChangeLog	Thu Dec 30 12:51:05 2010	(r2866)
@@ -1,3 +1,11 @@
+2010-12-30  Andy Buckley  <andy at insectnation.org>
+	* Adding a run timeout option, and small bug-fixes to the event
+	timeout handling, and making first event timeout work nicely with
+	the run timeout. Run timeout is intended to be used in conjunction
+	with timed batch token expiry, of the type that likes to make 0
+	byte AIDA files on LCG when Grid proxies time out.
 2010-12-21  Andy Buckley  <andy at insectnation.org>
 	* Fix the cuts in the CDF 1994 colour coherence analysis.

Modified: trunk/bin/rivet
--- trunk/bin/rivet	Wed Dec 29 23:47:06 2010	(r2865)
+++ trunk/bin/rivet	Thu Dec 30 12:51:05 2010	(r2866)
@@ -95,6 +95,11 @@
 parser.add_option("--event-timeout", dest="EVENT_TIMEOUT", type="int",
                   default=3600, metavar="NSECS",
                   help="max time in whole seconds to wait for an event to be generated from the specified source (default = %default)")
+parser.add_option("--run-timeout", dest="RUN_TIMEOUT", type="int",
+                  default=None, metavar="NSECS",
+                  help="max time in whole seconds to wait for the run to finish. This can be useful on batch systems such "
+                  "as the LCG Grid where tokens expire on a fixed wall-clock and can render long Rivet runs unable to write "
+                  "out the final histogram file (default = unlimited)")
 parser.add_option("--histo-interval", dest="HISTO_WRITE_INTERVAL", type=int,
                   default=None, help="[experimental!] specify the number of events between histogram file updates. "
                   "Default is to only write out at the end of the run. Note that intermediate histograms will be those "
@@ -394,31 +399,35 @@
 logging.info("Rivet running on machine %s (%s)" % (platform.node(), platform.machine()))
+## Set up an event timeout handler
+    def evttimeouthandler(signum, frame):
+        logging.warn("It has taken more than %d secs to get an event! Is the input event stream working?" %
+                     min(opts.EVENT_TIMEOUT, opts.RUN_TIMEOUT))
+        raise Exception("Event timeout")
+    signal.signal(signal.SIGALRM, evttimeouthandler)
 ## Init run based on one event
 hepmcfile = HEPMCFILES[0]
-    def evtinithandler(signum, frame):
-        logging.warn("It has taken more than %d secs to get the first event! Is the input event stream working?" % opts.EVENT_TIMEOUT)
-        raise Exception("Event initialisation timeout")
-    signal.signal(signal.SIGALRM, evtinithandler)
-    signal.alarm(opts.EVENT_TIMEOUT)
+    if opts.EVENT_TIMEOUT or opts.RUN_TIMEOUT:
+        signal.alarm(min(opts.EVENT_TIMEOUT, opts.RUN_TIMEOUT))
     init_ok = run.init(hepmcfile)
+    signal.alarm(0)
     if not init_ok:
-        logging.error("Failed to initialise on event file %s... exiting" % hepmcfile)
+        logging.error("Failed to initialise using event file '%s'... exiting" % hepmcfile)
-    logging.error("Timeout in initialisation from event file %s... exiting" % hepmcfile)
+    logging.error("Timeout in initialisation from event file '%s'... exiting" % hepmcfile)
-## Cancel timeout
 ## Event loop
 evtnum = 0
 starttime = time.time()
 for fileidx, hepmcfile in enumerate(HEPMCFILES):
-    ## Open next HepMC file (does not apply to first file: it was already used for the run init)
+    ## Open next HepMC file (NB. this doesn't apply to the first file: it was already used for the run init)
     if fileidx > 0:
         if not run.readEvent():
@@ -428,22 +437,39 @@
     while opts.MAXEVTNUM is None or evtnum < opts.MAXEVTNUM:
         evtnum += 1
         logNEvt(evtnum, starttime, opts.MAXEVTNUM)
         ## Process this event
         processed_ok = run.processEvent()
         if not processed_ok:
             logging.warn("Event processing failed for evt #%i!" % evtnum)
+        ## Set flag to exit event loop if run timeout exceeded
+        if opts.RUN_TIMEOUT and (time.time() - starttime) > opts.RUN_TIMEOUT:
+            logging.warning("Run timeout of %d secs exceeded... exiting gracefully" % opts.RUN_TIMEOUT)
+            RECVD_KILL_SIGNAL = True
         ## Exit the loop if signalled
         if RECVD_KILL_SIGNAL is not None:
-        ## Read next event
-        read_ok = run.readEvent()
-        if not read_ok:
-            break
+        ## Read next event (with timeout handling if requested)
+        try:
+            if opts.EVENT_TIMEOUT:
+                signal.alarm(opts.EVENT_TIMEOUT)
+            read_ok = run.readEvent()
+            signal.alarm(0)
+            if not read_ok:
+                break
+        except:
+            logging.error("Timeout in reading event from '%s'... exiting" % hepmcfile)
+            sys.exit(3)
         ## Write a histo file snapshot if appropriate
         if opts.HISTO_WRITE_INTERVAL is not None:
             if evtnum % opts.HISTO_WRITE_INTERVAL == 0:
 logging.info("Finished event loop")

