Commit feab684b authored by David Anderson

- client: abort jobs that are unstarted and past deadline

- client: abort runaway jobs based on elapsed time instead of CPU time.
    Specifically, abort jobs for which
    elapsed time > WU.rsc_fpops_bound / app_version.flops
    This policy works for
    1) GPU jobs (which may use little CPU time)
    2) jobs that run but because of bugs use little CPU time
        (e.g., because they're sleeping)
    whereas the old policy didn't.

svn path=/trunk/boinc/; revision=17399
parent 41fe3e40
......@@ -2256,3 +2256,22 @@ David 25 Feb 2009
cs_prefs.cpp
cs_scheduler.cpp
cpu_sched.cpp
David 25 Feb 2009
- client: abort jobs that are unstarted and past deadline
- client: abort runaway jobs based on elapsed time instead of CPU time.
Specifically, abort jobs for which
elapsed time > WU.rsc_fpops_bound / app_version.flops
This policy works for
1) GPU jobs (which may use little CPU time)
2) jobs that run but because of bugs use little CPU time
(e.g., because they're sleeping)
whereas the old policy didn't.
client/
client_state.cpp,h
app.cpp,h
app_control.cpp
lib/
error_numbers.h
str_util.cpp
......@@ -266,7 +266,7 @@ int ACTIVE_TASK::init(RESULT* rp) {
result = rp;
wup = rp->wup;
app_version = rp->avp;
max_cpu_time = rp->wup->rsc_fpops_bound/gstate.host_info.p_fpops;
max_elapsed_time = rp->wup->rsc_fpops_bound/rp->avp->flops;
max_disk_usage = rp->wup->rsc_disk_bound;
max_mem_usage = rp->wup->rsc_memory_bound;
get_slot_dir(slot, slot_dir, sizeof(slot_dir));
......
......@@ -109,8 +109,8 @@ public:
/// (that way don't have to worry about top-level dirs
/// being non-readable, etc).
char slot_path[512];
/// abort if total CPU exceeds this
double max_cpu_time;
/// abort if elapsed time exceeds this
double max_elapsed_time;
/// abort if disk usage (in+out+temp) exceeds this
double max_disk_usage;
/// abort if memory usage exceeds this
......
......@@ -571,12 +571,12 @@ bool ACTIVE_TASK_SET::check_rsc_limits_exceeded() {
for (i=0; i<active_tasks.size(); i++) {
atp = active_tasks[i];
if (atp->task_state() != PROCESS_EXECUTING) continue;
if (atp->current_cpu_time > atp->max_cpu_time) {
if (atp->elapsed_time > atp->max_elapsed_time) {
msg_printf(atp->result->project, MSG_INFO,
"Aborting task %s: exceeded CPU time limit %f\n",
atp->result->name, atp->max_cpu_time
"Aborting task %s: exceeded elapsed time limit %f\n",
atp->result->name, atp->max_elapsed_time
);
atp->abort_task(ERR_RSC_LIMIT_EXCEEDED, "Maximum CPU time exceeded");
atp->abort_task(ERR_RSC_LIMIT_EXCEEDED, "Maximum elapsed time exceeded");
did_anything = true;
continue;
}
......
......@@ -919,12 +919,25 @@ int CLIENT_STATE::nresults_for_project(PROJECT* p) {
return n;
}
// Abort every result that has not been started and whose report deadline
// has already passed.
//
// Returns true if at least one result was aborted, false otherwise
// (including when the clock sanity check below causes the scan to be skipped).
//
bool CLIENT_STATE::abort_unstarted_late_jobs() {
    // skip if user reset system clock:
    // a "now" earlier than this fixed timestamp is not trusted,
    // so no deadline comparison is made against it.
    if (now < 1235668593) return false;

    bool action = false;
    for (unsigned int i=0; i<results.size(); i++) {
        RESULT* rp = results[i];
        if (!rp->not_started()) continue;           // already started: leave it alone
        if (rp->report_deadline > now) continue;    // deadline not yet reached
        rp->abort_inactive(ERR_UNSTARTED_LATE);
        action = true;
    }
    // The original fell off the end of a bool function here
    // (undefined behavior); report whether anything was aborted.
    return action;
}
bool CLIENT_STATE::garbage_collect() {
bool action;
static double last_time=0;
if (gstate.now - last_time < GARBAGE_COLLECT_PERIOD) return false;
last_time = gstate.now;
bool action = garbage_collect_always();
action = abort_unstarted_late_jobs();
if (action) return true;
action = garbage_collect_always();
if (action) return true;
// Detach projects that are marked for detach when done
......
......@@ -240,6 +240,7 @@ private:
int link_workunit(PROJECT*, WORKUNIT*);
int link_result(PROJECT*, RESULT*);
void print_summary();
bool abort_unstarted_late_jobs();
bool garbage_collect();
bool garbage_collect_always();
bool update_results();
......
......@@ -187,8 +187,9 @@
#define ERR_DB_CONN_LOST -230
#define ERR_CRYPTO -231
#define ERR_ABORTED_ON_EXIT -232
#define ERR_UNSTARTED_LATE -233
// PLEASE: add a text description of your error to
// the text description function boincerror() in str_util.C.
// the text description function boincerror() in str_util.cpp.
#endif
......@@ -736,6 +736,8 @@ const char* boincerror(int which_error) {
case ERR_SYMLINK: return "symlink() failed";
case ERR_DB_CONN_LOST: return "DB connection lost during enumeration";
case ERR_CRYPTO: return "encryption error";
case ERR_ABORTED_ON_EXIT: return "job was aborted on client exit";
case ERR_UNSTARTED_LATE: return "job is unstarted and past deadline";
case 404: return "HTTP file not found";
case 407: return "HTTP proxy authentication failure";
case 416: return "HTTP range request error";
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment