work_fetch.cpp 36.4 KB
Newer Older
1
// This file is part of BOINC.
David Anderson's avatar
David Anderson committed
2
// http://boinc.berkeley.edu
3
// Copyright (C) 2008 University of California
David Anderson's avatar
David Anderson committed
4
//
5 6 7 8
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
David Anderson's avatar
David Anderson committed
9
//
10
// BOINC is distributed in the hope that it will be useful,
David Anderson's avatar
David Anderson committed
11 12 13 14
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
15 16
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC.  If not, see <http://www.gnu.org/licenses/>.
David Anderson's avatar
David Anderson committed
17

18 19
#include "util.h"

20
#include "client_types.h"
David Anderson's avatar
David Anderson committed
21
#include "client_msgs.h"
David Anderson's avatar
David Anderson committed
22 23 24
#ifdef SIM
#include "sim.h"
#else
David Anderson's avatar
David Anderson committed
25
#include "client_state.h"
David Anderson's avatar
David Anderson committed
26
#endif
David Anderson's avatar
David Anderson committed
27

28
#include "scheduler_op.h"
29 30
#include "work_fetch.h"

David Anderson's avatar
David Anderson committed
31
using std::vector;
32 33

RSC_WORK_FETCH cuda_work_fetch;
34
RSC_WORK_FETCH ati_work_fetch;
35 36 37
RSC_WORK_FETCH cpu_work_fetch;
WORK_FETCH work_fetch;

38
#define MIN_BACKOFF_INTERVAL    60
39 40 41 42 43 44 45 46
#define MAX_BACKOFF_INTERVAL    86400
    // if we ask a project for work for a resource and don't get it,
    // we do exponential backoff.
    // This constant is an upper bound for this.
    // E.g., if we need GPU work, we'll end up asking once a day,
    // so if the project develops a GPU app,
    // we'll find out about it within a day.

47
static inline const char* rsc_name(int t) {
David Anderson's avatar
David Anderson committed
48 49
    switch (t) {
    case RSC_TYPE_CPU: return "CPU";
50 51
    case RSC_TYPE_CUDA: return "NVIDIA GPU";
    case RSC_TYPE_ATI: return "ATI GPU";
David Anderson's avatar
David Anderson committed
52 53
    }
    return "Unknown";
54 55
}

56 57
// Return the per-project work-fetch record of p that corresponds
// to this object's resource type.
// Unrecognized resource types use the CPU record.
//
RSC_PROJECT_WORK_FETCH& RSC_WORK_FETCH::project_state(PROJECT* p) {
    if (rsc_type == RSC_TYPE_CUDA) return p->cuda_pwf;
    if (rsc_type == RSC_TYPE_ATI) return p->ati_pwf;
    return p->cpu_pwf;
}

65
// Return true if the project's preferences forbid fetching work
// for the given resource type.
// Each case ends with a break so that only the preference flag of the
// requested resource is consulted; without it, a CPU query would also
// be blocked by the "no CUDA" or "no ATI" preference.
// Unrecognized resource types are never blocked.
//
inline bool prefs_prevent_fetch(PROJECT* p, int rsc_type) {
    switch(rsc_type) {
    case RSC_TYPE_CPU:
        if (p->no_cpu_pref) return true;
        break;
    case RSC_TYPE_CUDA:
        if (p->no_cuda_pref) return true;
        break;
    case RSC_TYPE_ATI:
        if (p->no_ati_pref) return true;
        break;
    }
    return false;
}

// Can we ask project p for work for this resource right now?
// False if preferences forbid it or the resource backoff
// for this project has not yet expired.
//
bool RSC_WORK_FETCH::may_have_work(PROJECT* p) {
    if (prefs_prevent_fetch(p, rsc_type)) {
        return false;
    }
    return project_state(p).backoff_time < gstate.now;
}

83 84
// Decide whether project p might supply work for the given resource type:
// the matching "no <resource>" preference must be off,
// and this record's backoff time must already have passed.
//
bool RSC_PROJECT_WORK_FETCH::compute_may_have_work(PROJECT* p, int rsc_type) {
    bool blocked = false;
    if (rsc_type == RSC_TYPE_CPU) {
        blocked = p->no_cpu_pref;
    } else if (rsc_type == RSC_TYPE_CUDA) {
        blocked = p->no_cuda_pref;
    } else if (rsc_type == RSC_TYPE_ATI) {
        blocked = p->no_ati_pref;
    }
    if (blocked) return false;
    return backoff_time < gstate.now;
}

92 93
// Reset this per-project, per-resource record:
// clear the share, usage and deadline-miss fields
// and recompute may_have_work.
//
void RSC_PROJECT_WORK_FETCH::rr_init(PROJECT* p, int rsc_type) {
    // accumulators and flags start from scratch
    has_runnable_jobs = false;
    runnable_share = 0;
    fetchable_share = 0;
    sim_nused = 0;
    deadlines_missed = 0;

    // fetchability depends on prefs and backoff
    may_have_work = compute_may_have_work(p, rsc_type);
}

101 102 103
void RSC_WORK_FETCH::rr_init() {
    shortfall = 0;
    nidle_now = 0;
David Anderson's avatar
David Anderson committed
104
    sim_nused = 0;
105 106
    total_fetchable_share = 0;
    total_runnable_share = 0;
107
    deadline_missed_instances = 0;
108
    saturated_time = 0;
109
    pending.clear();
110
    busy_time_estimator.reset();
111 112 113 114 115 116
}

void WORK_FETCH::rr_init() {
    cpu_work_fetch.rr_init();
    if (coproc_cuda) {
        cuda_work_fetch.rr_init();
David Anderson's avatar
David Anderson committed
117
    }
118 119 120
    if (coproc_ati) {
        ati_work_fetch.rr_init();
    }
121 122
    for (unsigned int i=0; i<gstate.projects.size(); i++) {
        PROJECT* p = gstate.projects[i];
123
        p->pwf.can_fetch_work = p->pwf.compute_can_fetch_work(p);
124
        p->pwf.has_runnable_jobs = false;
125
        p->cpu_pwf.rr_init(p, RSC_TYPE_CPU);
126
        if (coproc_cuda) {
127
            p->cuda_pwf.rr_init(p, RSC_TYPE_CUDA);
128
        }
129
        if (coproc_ati) {
130
            p->ati_pwf.rr_init(p, RSC_TYPE_ATI);
131
        }
132
    }
David Anderson's avatar
David Anderson committed
133 134
}

135 136 137 138 139 140 141 142
// Return true if nothing at the project level prevents asking p for work.
// Any one of the conditions below is enough to rule the project out;
// they are evaluated in order and evaluation stops at the first hit.
//
bool PROJECT_WORK_FETCH::compute_can_fetch_work(PROJECT* p) {
    const bool ruled_out =
        p->non_cpu_intensive
        || p->suspended_via_gui
        || p->master_url_fetch_pending
        || (p->min_rpc_time > gstate.now)
        || p->dont_request_more_work
        || p->some_download_stalled()
        || p->some_result_suspended()
        || p->too_many_uploading_results;
    return !ruled_out;
}

147 148 149
// Reset the per-resource work-fetch records (CPU, CUDA, ATI) of project p.
//
void PROJECT_WORK_FETCH::reset(PROJECT* p) {
    RSC_PROJECT_WORK_FETCH* records[3] = {
        &p->cpu_pwf, &p->cuda_pwf, &p->ati_pwf
    };
    for (int k=0; k<3; k++) {
        records[k]->reset();
    }
}

David Anderson's avatar
David Anderson committed
153 154
void RSC_WORK_FETCH::accumulate_shortfall(double d_time) {
    double idle = ninstances - sim_nused;
155
    if (idle > 1e-6) {
156 157
        shortfall += idle*d_time;
    }
158 159 160 161 162
#if 0
    msg_printf(0, MSG_INFO, "accum shortf (%s): idle %f dt %f sf %f",
        rsc_name(rsc_type), idle, d_time, shortfall
    );
#endif
163
}
David Anderson's avatar
David Anderson committed
164

165
void RSC_WORK_FETCH::update_saturated_time(double dt) {
166
    double idle = ninstances - sim_nused;
167 168
    if (idle < 1e-6) {
        saturated_time = dt;
David Anderson's avatar
David Anderson committed
169
    }
170 171 172
}

void RSC_WORK_FETCH::update_busy_time(double dur, double nused) {
173
    busy_time_estimator.update(dur, nused);
David Anderson's avatar
David Anderson committed
174 175
}

176 177 178 179
// see if the project's debt is beyond what would normally happen;
// if so we conclude that it had a long job that ran in EDF mode;
// avoid asking it for work unless absolutely necessary.
//
180
bool RSC_PROJECT_WORK_FETCH::overworked() {
181
    double x = gstate.work_buf_total() + gstate.global_prefs.cpu_scheduling_period(); 
182 183
    if (x < 86400) x = 86400;
    return (debt < -x);
184 185
}

186 187 188 189
#define FETCH_IF_IDLE_INSTANCE          0
    // If resource has an idle instance,
    // get work for it from the project with greatest LTD,
    // even if it's overworked.
190 191 192 193 194 195
#define FETCH_IF_MAJOR_SHORTFALL        1
    // If resource is saturated for less than work_buf_min(),
    // get work for it from the project with greatest LTD,
    // even if it's overworked.
#define FETCH_IF_MINOR_SHORTFALL        2
    // If resource is saturated for less than work_buf_total(),
196
    // get work for it from the non-overworked project with greatest LTD.
197
#define FETCH_IF_PROJECT_STARVED        3
198 199
    // If any project is not overworked and has no runnable jobs
    // (for any resource, not just this one)
200 201
    // get work from the one with greatest LTD.

202
// Choose the best project to ask for work for this resource,
203
// given the specific criterion
204
//
205 206
PROJECT* RSC_WORK_FETCH::choose_project(int criterion) {
    // Resource-level gate: does this criterion apply at all?
    //
    if (criterion == FETCH_IF_IDLE_INSTANCE) {
        if (nidle_now == 0) return NULL;
    } else if (criterion == FETCH_IF_MAJOR_SHORTFALL) {
        if (saturated_time > gstate.work_buf_min()) return NULL;
    } else if (criterion == FETCH_IF_MINOR_SHORTFALL) {
        if (saturated_time > gstate.work_buf_total()) return NULL;
    } else if (criterion == FETCH_IF_PROJECT_STARVED) {
        if (deadline_missed_instances >= ninstances) return NULL;
    }

    // Per-project filters implied by the criterion
    //
    const bool skip_overworked =
        criterion == FETCH_IF_MINOR_SHORTFALL
        || criterion == FETCH_IF_PROJECT_STARVED;
    const bool skip_if_has_jobs = (criterion == FETCH_IF_PROJECT_STARVED);

    // Among the fetchable projects pick the one with the greatest
    // overall debt; on a tie the later project in the list wins.
    //
    PROJECT* best = NULL;
    for (unsigned k=0; k<gstate.projects.size(); k++) {
        PROJECT* cand = gstate.projects[k];
        if (!cand->pwf.can_fetch_work) continue;
        RSC_PROJECT_WORK_FETCH& rpwf = project_state(cand);
        if (!rpwf.may_have_work) continue;
        if (skip_overworked && rpwf.overworked()) continue;
        if (skip_if_has_jobs && cand->pwf.has_runnable_jobs) continue;
        if (best && best->pwf.overall_debt > cand->pwf.overall_debt) continue;
        best = cand;
    }
    if (!best) return NULL;

    // decide how much work to request from each resource
    //
    double amount;
    work_fetch.clear_request();
    switch (criterion) {
    case FETCH_IF_IDLE_INSTANCE:
        if (log_flags.work_fetch_debug) {
            msg_printf(best, MSG_INFO,
                "chosen: %s idle instance", rsc_name(rsc_type)
            );
        }
        // project's share, capped at the shortfall
        amount = share_request(best);
        set_request(best, (amount > shortfall) ? shortfall : amount);
        break;
    case FETCH_IF_MAJOR_SHORTFALL:
        if (log_flags.work_fetch_debug) {
            msg_printf(best, MSG_INFO,
                "chosen: %s major shortfall", rsc_name(rsc_type)
            );
        }
        // project's share, capped at the shortfall
        amount = share_request(best);
        set_request(best, (amount > shortfall) ? shortfall : amount);
        break;
    case FETCH_IF_MINOR_SHORTFALL:
        if (log_flags.work_fetch_debug) {
            msg_printf(best, MSG_INFO,
                "chosen: %s minor shortfall", rsc_name(rsc_type)
            );
        }
        // ask for the shortfall of every resource
        work_fetch.set_shortfall_requests(best);
        break;
    case FETCH_IF_PROJECT_STARVED:
        if (log_flags.work_fetch_debug) {
            msg_printf(best, MSG_INFO,
                "chosen: %s starved", rsc_name(rsc_type)
            );
        }
        // project's share, uncapped
        set_request(best, share_request(best));
        break;
    }
    return best;
}

293 294
// Request from project p the shortfall of the CPU
// and of each GPU type that is present and usable.
//
void WORK_FETCH::set_shortfall_requests(PROJECT* p) {
    const bool use_cuda = coproc_cuda && coproc_cuda->usable;
    const bool use_ati = coproc_ati && coproc_ati->usable;

    cpu_work_fetch.set_shortfall_request(p);
    if (use_cuda) cuda_work_fetch.set_shortfall_request(p);
    if (use_ati) ati_work_fetch.set_shortfall_request(p);
}

// Request this resource's whole shortfall from project p,
// unless there is no shortfall, the project can't supply work
// for this resource, or the project is overworked for it.
//
void RSC_WORK_FETCH::set_shortfall_request(PROJECT* p) {
    if (shortfall == 0) return;
    RSC_PROJECT_WORK_FETCH& rpwf = project_state(p);
    if (rpwf.may_have_work && !rpwf.overworked()) {
        set_request(p, shortfall);
    }
}

311 312 313 314 315 316
void WORK_FETCH::set_overall_debts() {
    for (unsigned i=0; i<gstate.projects.size(); i++) {
        PROJECT* p = gstate.projects[i];
        p->pwf.overall_debt = p->cpu_pwf.debt;
        if (coproc_cuda) {
            p->pwf.overall_debt += cuda_work_fetch.speed*p->cuda_pwf.debt;
David Anderson's avatar
David Anderson committed
317
        }
318 319 320
        if (coproc_ati) {
            p->pwf.overall_debt += ati_work_fetch.speed*p->ati_pwf.debt;
        }
David Anderson's avatar
David Anderson committed
321
    }
322
}
David Anderson's avatar
David Anderson committed
323

324 325 326 327 328 329 330
void WORK_FETCH::zero_debts() {
    for (unsigned i=0; i<gstate.projects.size(); i++) {
        PROJECT* p = gstate.projects[i];
        p->cpu_pwf.debt = 0;
        if (coproc_cuda) {
            p->cuda_pwf.debt = 0;
        }
331 332 333
        if (coproc_ati) {
            p->ati_pwf.debt = 0;
        }
334 335 336
    }
}

337
void RSC_WORK_FETCH::print_state(const char* name) {
338
    msg_printf(0, MSG_INFO,
339
        "[wfd] %s: shortfall %.2f nidle %.2f saturated %.2f busy %.2f RS fetchable %.2f runnable %.2f",
340
        name,
341
        shortfall, nidle_now, saturated_time, busy_time_estimator.get_busy_time(),
342
        total_fetchable_share, total_runnable_share
343 344 345
    );
    for (unsigned int i=0; i<gstate.projects.size(); i++) {
        PROJECT* p = gstate.projects[i];
David Anderson's avatar
David Anderson committed
346
        if (p->non_cpu_intensive) continue;
347
        RSC_PROJECT_WORK_FETCH& pwf = project_state(p);
348
        double bt = pwf.backoff_time>gstate.now?pwf.backoff_time-gstate.now:0;
349 350 351
        bool blocked_by_prefs = false;
        switch (rsc_type) {
        case RSC_TYPE_CPU:
352
            if (p->no_cpu_pref) blocked_by_prefs = true;
353 354
            break;
        case RSC_TYPE_CUDA:
355
            if (p->no_cuda_pref) blocked_by_prefs = true;
356 357
            break;
        case RSC_TYPE_ATI:
358
            if (p->no_ati_pref) blocked_by_prefs = true;
359 360
            break;
        }
361
        msg_printf(p, MSG_INFO,
362
            "[wfd] %s: fetch share %.2f debt %.2f backoff dt %.2f int %.2f%s%s%s%s%s%s%s",
363
            name,
364
            pwf.fetchable_share, pwf.debt, bt, pwf.backoff_interval,
365 366 367
            p->suspended_via_gui?" (susp via GUI)":"",
            p->master_url_fetch_pending?" (master fetch pending)":"",
            p->min_rpc_time > gstate.now?" (comm deferred)":"",
David Anderson's avatar
 
David Anderson committed
368
            p->dont_request_more_work?" (no new tasks)":"",
369
            pwf.overworked()?" (overworked)":"",
370 371
            p->too_many_uploading_results?" (too many uploads)":"",
            blocked_by_prefs?" (blocked by prefs)":""
372 373
        );
    }
David Anderson's avatar
David Anderson committed
374 375
}

376 377
void WORK_FETCH::print_state() {
    msg_printf(0, MSG_INFO, "[wfd] ------- start work fetch state -------");
David Anderson's avatar
David Anderson committed
378 379
    msg_printf(0, MSG_INFO, "[wfd] target work buffer: %.2f + %.2f sec",
        gstate.work_buf_min(), gstate.work_buf_additional()
David Anderson's avatar
David Anderson committed
380
    );
381 382
    cpu_work_fetch.print_state("CPU");
    if (coproc_cuda) {
383 384 385
        cuda_work_fetch.print_state("NVIDIA GPU");
    }
    if (coproc_ati) {
David Anderson's avatar
 
David Anderson committed
386
        ati_work_fetch.print_state("ATI GPU");
387 388 389
    }
    for (unsigned int i=0; i<gstate.projects.size(); i++) {
        PROJECT* p = gstate.projects[i];
David Anderson's avatar
David Anderson committed
390
        if (p->non_cpu_intensive) continue;
391
        msg_printf(p, MSG_INFO, "[wfd] overall_debt %.0f", p->pwf.overall_debt);
392 393
    }
    msg_printf(0, MSG_INFO, "[wfd] ------- end work fetch state -------");
David Anderson's avatar
David Anderson committed
394 395
}

396 397 398 399 400 401 402 403
// Withdraw any pending request for this resource:
// zero requested instances and zero requested seconds.
//
void RSC_WORK_FETCH::clear_request() {
    req_instances = 0;
    req_secs = 0;
}

void WORK_FETCH::clear_request() {
    cpu_work_fetch.clear_request();
    cuda_work_fetch.clear_request();
404
    ati_work_fetch.clear_request();
405 406
}

407 408 409 410 411 412 413 414 415 416 417 418 419
// does the project have a downloading or runnable job?
//
static bool has_a_job(PROJECT* p) {
    // scan all results; one belonging to p whose state is
    // RESULT_FILES_DOWNLOADED or earlier is enough
    //
    for (unsigned int k=0; k<gstate.results.size(); k++) {
        RESULT* res = gstate.results[k];
        if (res->project == p && res->state() <= RESULT_FILES_DOWNLOADED) {
            return true;
        }
    }
    return false;
}

420
// we're going to contact this project for reasons other than work fetch;
421
// decide if we should piggy-back a work fetch request.
422 423
//
void WORK_FETCH::compute_work_request(PROJECT* p) {
    clear_request();
    if (p->dont_request_more_work) return;

    // a non-CPU-intensive project gets a token 1-second CPU request
    // when it has no job, and nothing otherwise
    //
    if (p->non_cpu_intensive) {
        if (!has_a_job(p)) {
            cpu_work_fetch.req_secs = 1;
        }
        return;
    }

    // See if this is the project we'd ask for work anyway.
    // Temporarily clear resource backoffs,
    // since we're going to contact this project in any case.
    //
    RSC_PROJECT_WORK_FETCH* records[3] = {
        &p->cpu_pwf, &p->cuda_pwf, &p->ati_pwf
    };
    double saved_backoff[3];
    int k;
    for (k=0; k<3; k++) {
        saved_backoff[k] = records[k]->backoff_time;
        records[k]->backoff_time = 0;
    }
    PROJECT* pbest = choose_project();
    for (k=0; k<3; k++) {
        records[k]->backoff_time = saved_backoff[k];
    }

    if (p == pbest) {
        // Ask for work for all devices w/ a shortfall.
        // Otherwise we can have a situation where a GPU is idle,
        // we ask only for GPU work, and the project never has any
        //
        work_fetch.set_shortfall_requests(pbest);
    } else {
        // if not, don't request any work
        //
        clear_request();
    }
}

461 462 463
// see if there's a fetchable non-CPU-intensive project without work
//
PROJECT* WORK_FETCH::non_cpu_intensive_project_needing_work() {
    for (unsigned int k=0; k<gstate.projects.size(); k++) {
        PROJECT* cand = gstate.projects[k];

        // candidate must be non-CPU-intensive, allowed to request work,
        // past its CPU backoff, and currently without a job
        //
        const bool eligible =
            cand->non_cpu_intensive
            && cand->can_request_work()
            && !(cand->cpu_pwf.backoff_time > gstate.now)
            && !has_a_job(cand);
        if (!eligible) continue;

        // request a token 1 second of CPU work from it
        //
        clear_request();
        cpu_work_fetch.req_secs = 1;
        return cand;
    }
    return 0;
}

477 478
// choose a project to fetch work from,
// and set the request fields of resource objects
David Anderson's avatar
David Anderson committed
479
//
480 481
PROJECT* WORK_FETCH::choose_project() {
    PROJECT* p = 0;

    // non-CPU-intensive projects without a job take precedence
    //
    p = non_cpu_intensive_project_needing_work();
    if (p) return p;

    gstate.compute_nuploading_results();

    // refresh the simulation results and debts the criteria rely on
    //
    gstate.rr_simulation();
    set_overall_debts();

    bool cuda_usable = coproc_cuda && coproc_cuda->usable;
    bool ati_usable = coproc_ati && coproc_ati->usable;

    // Try the criteria from most to least urgent;
    // within each criterion try NVIDIA GPU, then ATI GPU, then CPU.
    // Every step is skipped once a project has been chosen, so a later
    // resource can never overwrite (or reset to NULL) an earlier choice,
    // whose request fields have already been filled in.
    //
    static const int criteria[] = {
        FETCH_IF_IDLE_INSTANCE,
        FETCH_IF_MAJOR_SHORTFALL,
        FETCH_IF_MINOR_SHORTFALL
    };
    const unsigned int ncriteria = sizeof(criteria)/sizeof(criteria[0]);
    for (unsigned int i=0; i<ncriteria && !p; i++) {
        if (cuda_usable) {
            p = cuda_work_fetch.choose_project(criteria[i]);
        }
        if (!p && ati_usable) {
            p = ati_work_fetch.choose_project(criteria[i]);
        }
        if (!p) {
            p = cpu_work_fetch.choose_project(criteria[i]);
        }
    }
#if 0
    // don't try to maintain GPU work for all projects,
    // since we don't use round-robin scheduling for GPUs
    //
    if (!p && cuda_usable) {
        p = cuda_work_fetch.choose_project(FETCH_IF_PROJECT_STARVED);
    }
    if (!p && ati_usable) {
        p = ati_work_fetch.choose_project(FETCH_IF_PROJECT_STARVED);
    }
#endif
    if (!p) {
        p = cpu_work_fetch.choose_project(FETCH_IF_PROJECT_STARVED);
    }

    if (log_flags.work_fetch_debug) {
        print_state();
        if (!p) {
            msg_printf(0, MSG_INFO, "[wfd] No project chosen for work fetch");
        }
    }

    return p;
}
David Anderson's avatar
David Anderson committed
545

546 547 548
// Return the number of seconds of work to request from project p
// for this resource, based on its fetchable share.
//
double RSC_WORK_FETCH::share_request(PROJECT* p) {
    const double correction = p->duration_correction_factor;

    // if project's DCF is too big or small,
    // its completion time estimates are useless; just ask for 1 second
    //
    if (correction < 0.02 || correction > 80.0) {
        return 1;
    }

    // otherwise ask for the project's share of the total work buffer
    //
    return gstate.work_buf_total()*project_state(p).fetchable_share;
}

// Record a request for r seconds of work for this resource.
// The number of instances requested is project p's fetchable share
// of the currently idle instances, rounded up.
//
void RSC_WORK_FETCH::set_request(PROJECT* p, double r) {
    const double idle_for_project = project_state(p).fetchable_share*nidle_now;
    req_secs = r;
    req_instances = (int)ceil(idle_for_project);
}

567 568 569 570 571 572 573
// A task ran for dt seconds: add the instance-seconds it used
// (dt times the app version's CPU / CUDA / ATI usage) to the
// debt-interval totals of both its project and the resource.
// GPU totals are only updated for GPU types that are present.
//
void WORK_FETCH::accumulate_inst_sec(ACTIVE_TASK* atp, double dt) {
    PROJECT* proj = atp->result->project;
    APP_VERSION* ver = atp->result->avp;

    const double cpu_secs = dt*ver->avg_ncpus;
    proj->cpu_pwf.secs_this_debt_interval += cpu_secs;
    cpu_work_fetch.secs_this_debt_interval += cpu_secs;

    if (coproc_cuda) {
        const double cuda_secs = dt*ver->ncudas;
        proj->cuda_pwf.secs_this_debt_interval += cuda_secs;
        cuda_work_fetch.secs_this_debt_interval += cuda_secs;
    }
    if (coproc_ati) {
        const double ati_secs = dt*ver->natis;
        proj->ati_pwf.secs_this_debt_interval += ati_secs;
        ati_work_fetch.secs_this_debt_interval += ati_secs;
    }
}
David Anderson's avatar
David Anderson committed
584

David Anderson's avatar
 
David Anderson committed
585 586
// update long-term debts for a resource.
//
587 588
void RSC_WORK_FETCH::update_debts() {
    unsigned int i;
589
    int neligible = 0;
590 591 592
    double ders = 0;
    PROJECT* p;

David Anderson's avatar
 
David Anderson committed
593 594
    // find the total resource share of eligible projects
    //
595 596 597
    for (i=0; i<gstate.projects.size(); i++) {
        p = gstate.projects[i];
        RSC_PROJECT_WORK_FETCH& w = project_state(p);
598
        if (w.debt_eligible(p, *this)) {
David Anderson's avatar
David Anderson committed
599
            ders += p->resource_share;
600
            neligible++;
David Anderson's avatar
David Anderson committed
601
        }
David Anderson's avatar
David Anderson committed
602
    }
David Anderson's avatar
 
David Anderson committed
603
    if (!neligible) {
604 605 606 607 608
        if (log_flags.debt_debug) {
            msg_printf(0, MSG_INFO,
                "[debt] %s: no eligible projects", rsc_name(rsc_type)
            );
        }
David Anderson's avatar
 
David Anderson committed
609 610
        return;
    }
611

612 613
    double max_debt=0;
    bool first = true;
614 615
    for (i=0; i<gstate.projects.size(); i++) {
        p = gstate.projects[i];
616
        if (p->non_cpu_intensive) continue;
617
        RSC_PROJECT_WORK_FETCH& w = project_state(p);
618
        if (w.debt_eligible(p, *this)) {
619
            double share_frac = p->resource_share/ders;
620 621 622 623

            // the change to a project's debt is:
            // (how much it's owed) - (how much it got)
            //
David Anderson's avatar
David Anderson committed
624
            double delta = share_frac*secs_this_debt_interval - w.secs_this_debt_interval;
625
            w.debt += delta;
David Anderson's avatar
David Anderson committed
626 627 628 629 630 631 632 633
            if (log_flags.debt_debug) {
                msg_printf(p, MSG_INFO,
                    "[debt] %s debt %.2f delta %.2f share frac %.2f (%.2f/%.2f) secs %.2f rsc_secs %.2f",
                    rsc_name(rsc_type),
                    w.debt, delta, share_frac, p->resource_share, ders, secs_this_debt_interval,
                    w.secs_this_debt_interval
                );
            }
634 635 636 637 638 639 640 641
            if (first) {
                max_debt = w.debt;
                first = false;
            } else {
                if (w.debt > max_debt) {
                    max_debt = w.debt;
                }
            }
642 643 644 645 646 647 648
        } else {
            if (log_flags.debt_debug) {
                msg_printf(p, MSG_INFO,
                    "[debt] %s ineligible; debt %.2f",
                    rsc_name(rsc_type), w.debt
                );
            }
David Anderson's avatar
David Anderson committed
649 650 651
        }
    }

652 653 654 655 656 657 658
    // The net change may be
    // - positive if the resource wasn't fully utilized during the debt interval
    // - negative it was overcommitted (e.g., CPU)
    // We need to keep eligible projects from diverging from non-eligible ones;
    // also, if all the debts are large negative we need to gradually
    // shift them towards zero.
    // To do this, we add an offset as follows:
659 660 661 662 663 664
    // delta_limit is the largest rate at which any project's debt
    // could increase or decrease.
    // If the largest debt is close to zero (relative to delta_limit)
    // than add an offset that will bring it exactly to zero.
    // Otherwise add an offset of 2*delta_limit,
    // which will gradually bring all the debts towards zero
665
    //
666 667 668 669
    // The policy of keeping the max debt at zero is important;
    // it means that new projects will begin in parity with high-debt project,
    // and won't wait for months to get work.
    //
670
    double offset;
671
    double delta_limit = secs_this_debt_interval*ninstances;
David Anderson's avatar
David Anderson committed
672 673
    if (max_debt > -2*delta_limit) {
        offset = max_debt?-max_debt:0;  // avoid -0
674
    } else {
675
        offset = 2*delta_limit;
676
    }
677
    if (log_flags.debt_debug) {
678 679 680
        msg_printf(0, MSG_INFO, "[debt] %s debt: adding offset %.2f",
            rsc_name(rsc_type), offset
        );
David Anderson's avatar
David Anderson committed
681
    }
682 683
    for (i=0; i<gstate.projects.size(); i++) {
        p = gstate.projects[i];
684
        if (p->non_cpu_intensive) continue;
685
        RSC_PROJECT_WORK_FETCH& w = project_state(p);
686
        if (w.debt_eligible(p, *this)) {
687
            w.debt += offset;
688
        }
David Anderson's avatar
David Anderson committed
689
    }
690
}
David Anderson's avatar
David Anderson committed
691

692 693 694 695 696 697 698 699
// find total and per-project resource shares for each resource
//
void WORK_FETCH::compute_shares() {
    unsigned int i;
    PROJECT* p;
    for (i=0; i<gstate.projects.size(); i++) {
        p = gstate.projects[i];
        if (p->non_cpu_intensive) continue;
David Anderson's avatar
David Anderson committed
700
        if (p->cpu_pwf.has_runnable_jobs) {
701
            cpu_work_fetch.total_runnable_share += p->resource_share;
David Anderson's avatar
David Anderson committed
702
        }
David Anderson's avatar
David Anderson committed
703
        if (p->cuda_pwf.has_runnable_jobs) {
704
            cuda_work_fetch.total_runnable_share += p->resource_share;
David Anderson's avatar
David Anderson committed
705
        }
706 707 708
        if (p->ati_pwf.has_runnable_jobs) {
            ati_work_fetch.total_runnable_share += p->resource_share;
        }
709
        if (!p->pwf.can_fetch_work) continue;
710
        if (p->cpu_pwf.may_have_work) {
711 712
            cpu_work_fetch.total_fetchable_share += p->resource_share;
        }
713
        if (coproc_cuda && p->cuda_pwf.may_have_work) {
David Anderson's avatar
David Anderson committed
714
            cuda_work_fetch.total_fetchable_share += p->resource_share;
715
        }
716 717 718
        if (coproc_ati && p->ati_pwf.may_have_work) {
            ati_work_fetch.total_fetchable_share += p->resource_share;
        }
719 720 721 722
    }
    for (i=0; i<gstate.projects.size(); i++) {
        p = gstate.projects[i];
        if (p->non_cpu_intensive) continue;
David Anderson's avatar
David Anderson committed
723
        if (p->cpu_pwf.has_runnable_jobs) {
724
            p->cpu_pwf.runnable_share = p->resource_share/cpu_work_fetch.total_runnable_share;
David Anderson's avatar
David Anderson committed
725
        }
David Anderson's avatar
David Anderson committed
726
        if (p->cuda_pwf.has_runnable_jobs) {
727
            p->cuda_pwf.runnable_share = p->resource_share/cuda_work_fetch.total_runnable_share;
David Anderson's avatar
David Anderson committed
728
        }
729 730 731
        if (p->ati_pwf.has_runnable_jobs) {
            p->ati_pwf.runnable_share = p->resource_share/ati_work_fetch.total_runnable_share;
        }
732
        if (!p->pwf.can_fetch_work) continue;
733
        if (p->cpu_pwf.may_have_work) {
734 735
            p->cpu_pwf.fetchable_share = p->resource_share/cpu_work_fetch.total_fetchable_share;
        }
736
        if (coproc_cuda && p->cuda_pwf.may_have_work) {
737 738
            p->cuda_pwf.fetchable_share = p->resource_share/cuda_work_fetch.total_fetchable_share;
        }
739 740 741
        if (coproc_ati && p->ati_pwf.may_have_work) {
            p->ati_pwf.fetchable_share = p->resource_share/ati_work_fetch.total_fetchable_share;
        }
742 743 744 745 746
    }
}

// should this project be accumulating debt for this resource?
//
747
bool RSC_PROJECT_WORK_FETCH::debt_eligible(PROJECT* p, RSC_WORK_FETCH& rwf) {
    if (p->non_cpu_intensive || p->suspended_via_gui) {
        return false;
    }

    // a project with runnable jobs for this resource is always eligible;
    // this test must precede the dont_request_more_work check
    //
    if (has_runnable_jobs) {
        return true;
    }

    // NOTE: it's critical that all conditions that might prevent
    // us from asking the project for work of this type
    // be included in the list below.
    // Otherwise we might get in a state where debt accumulates,
    // pushing other projects into overworked state
    //
    if (p->dont_request_more_work) {
        return false;
    }
    if (backoff_time > gstate.now) {
        return false;
    }
    if (prefs_prevent_fetch(p, rwf.rsc_type)) {
        return false;
    }

    // The last time we asked for work we didn't get any,
    // but it's been a while since we asked.
    // In this case, accumulate debt until we reach (around) zero, then stop.
    //
    if (backoff_interval == MAX_BACKOFF_INTERVAL
        && debt > -rwf.ninstances*DEBT_ADJUST_PERIOD
    ) {
        return false;
    }

    // communication with the project is currently deferred
    //
    if (p->min_rpc_time > gstate.now) {
        return false;
    }
    return true;
}
David Anderson's avatar
David Anderson committed
774

775 776 777 778 779 780 781 782 783 784 785 786 787 788
// Return true if project p has at least one app version that uses
// the given coprocessor type (RSC_TYPE_CUDA or RSC_TYPE_ATI).
// Any other rsc_type yields false.
// Each case ends with a break, so a CUDA query is not satisfied
// by an app version that only uses ATI GPUs.
//
inline bool has_coproc_app(PROJECT* p, int rsc_type) {
    unsigned int i;
    for (i=0; i<gstate.app_versions.size(); i++) {
        APP_VERSION* avp = gstate.app_versions[i];
        if (avp->project != p) continue;
        switch(rsc_type) {
        case RSC_TYPE_CUDA:
            if (avp->ncudas) return true;
            break;
        case RSC_TYPE_ATI:
            if (avp->natis) return true;
            break;
        }
    }
    return false;
}

void WORK_FETCH::write_request(FILE* f, PROJECT* p) {
789 790 791
    double work_req = cpu_work_fetch.req_secs;

    // if project is anonymous platform, set the overall work req
792
    // to the max of the requests of resource types for which we have versions.
793
    // Otherwise projects with old schedulers won't send us work.
794
    // THIS CAN BE REMOVED AT SOME POINT
795
    //
796 797 798 799 800
    if (p->anonymous_platform) {
        if (has_coproc_app(p, RSC_TYPE_CUDA)) {
            if (cuda_work_fetch.req_secs > work_req) {
                work_req = cuda_work_fetch.req_secs;
            }
801
        }
802 803 804 805
        if (has_coproc_app(p, RSC_TYPE_ATI)) {
            if (ati_work_fetch.req_secs > work_req) {
                work_req = ati_work_fetch.req_secs;
            }
806 807
        }
    }
808
    fprintf(f,
809
        "    <work_req_seconds>%f</work_req_seconds>\n"
810
        "    <cpu_req_secs>%f</cpu_req_secs>\n"
811 812
        "    <cpu_req_instances>%d</cpu_req_instances>\n"
        "    <estimated_delay>%f</estimated_delay>\n",
813
        work_req,
814
        cpu_work_fetch.req_secs,
815
        cpu_work_fetch.req_instances,
816
        cpu_work_fetch.req_secs?cpu_work_fetch.busy_time_estimator.get_busy_time():0
817
    );
818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838
    if (log_flags.work_fetch_debug) {
        char buf[256], buf2[256];
        sprintf(buf,
            "[wfd] request: %.2f sec CPU (%.2f sec, %d)",
            work_req,
            cpu_work_fetch.req_secs, cpu_work_fetch.req_instances
        );
        if (coproc_cuda) {
            sprintf(buf2, " NVIDIA GPU (%.2f sec, %d)",
                cuda_work_fetch.req_secs, cuda_work_fetch.req_instances
            );
            strcat(buf, buf2);
        }
        if (coproc_ati) {
            sprintf(buf2, " ATI GPU (%.2f sec, %d)",
                ati_work_fetch.req_secs, ati_work_fetch.req_instances
            );
            strcat(buf, buf2);
        }
        msg_printf(p, MSG_INFO, buf);
    }
David Anderson's avatar
David Anderson committed
839 840
}

841
// we just got a scheduler reply with the given jobs; update backoffs
David Anderson's avatar
David Anderson committed
842
//
843 844 845
void WORK_FETCH::handle_reply(
    PROJECT* p, SCHEDULER_REPLY* srp, vector<RESULT*> new_results
) {
846
    unsigned int i;
847
    bool got_cpu = false, got_cuda = false, got_ati = false;
David Anderson's avatar
David Anderson committed
848

849 850 851 852 853 854 855 856 857 858