// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC.  If not, see <http://www.gnu.org/licenses/>.

// Work fetch logic for CPU, GPU, and other processing resources.
// See http://boinc.berkeley.edu/trac/wiki/GpuWorkFetch

#ifndef _WORK_FETCH_
#define _WORK_FETCH_

#include <vector>
#include <deque>
// Resource type codes.
// NOTE(review): presumably these are the values carried by the
// "rsc_type" fields and arguments below - confirm against callers.
// The negative value is parenthesized so that it expands as a single
// operand wherever the macro is used.
//
#define RSC_TYPE_ANY    (-1)
#define RSC_TYPE_CPU    0
// reasons for not fetching work
//
// Each code has a short human-readable description,
// returned by cant_fetch_work_string().
//
#define CANT_FETCH_WORK_NON_CPU_INTENSIVE           1
#define CANT_FETCH_WORK_SUSPENDED_VIA_GUI           2
#define CANT_FETCH_WORK_MASTER_URL_FETCH_PENDING    3
#define CANT_FETCH_WORK_MIN_RPC_TIME                4
#define CANT_FETCH_WORK_DONT_REQUEST_MORE_WORK      5
#define CANT_FETCH_WORK_DOWNLOAD_STALLED            6
#define CANT_FETCH_WORK_RESULT_SUSPENDED            7
#define CANT_FETCH_WORK_TOO_MANY_UPLOADS            8
#define CANT_FETCH_WORK_NOT_HIGHEST_PRIORITY        9
#define CANT_FETCH_WORK_DONT_NEED                   10
#define CANT_FETCH_WORK_TOO_MANY_RUNNABLE           11
inline const char* cant_fetch_work_string(int reason) {
    switch (reason) {
    case CANT_FETCH_WORK_NON_CPU_INTENSIVE:
        return "non CPU intensive";
    case CANT_FETCH_WORK_SUSPENDED_VIA_GUI:
        return "suspended via Manager";
    case CANT_FETCH_WORK_MASTER_URL_FETCH_PENDING:
        return "master URL fetch pending";
    case CANT_FETCH_WORK_MIN_RPC_TIME:
        return "scheduler RPC backoff";
    case CANT_FETCH_WORK_DONT_REQUEST_MORE_WORK:
        return "\"no new tasks\" requested via Manager";
    case CANT_FETCH_WORK_DOWNLOAD_STALLED:
        return "some download is stalled";
    case CANT_FETCH_WORK_RESULT_SUSPENDED:
        return "some task is suspended via Manager";
    case CANT_FETCH_WORK_TOO_MANY_UPLOADS:
        return "too many uploads in progress";
David Anderson's avatar
David Anderson committed
62
63
    case CANT_FETCH_WORK_NOT_HIGHEST_PRIORITY:
        return "project is not highest priority";
David Anderson's avatar
David Anderson committed
64
65
    case CANT_FETCH_WORK_DONT_NEED:
        return "don't need";
66
67
    case CANT_FETCH_WORK_TOO_MANY_RUNNABLE:
        return "too many runnable tasks";
68
69
70
71
    }
    return "";
}

// Forward declarations.
// RSC_WORK_FETCH is defined further down in this header;
// the other types are not defined here and are used in this header
// only through pointers or references.
//
struct PROJECT;
struct RESULT;
struct ACTIVE_TASK;
struct RSC_WORK_FETCH;
struct SCHEDULER_REPLY;
struct APP_VERSION;
// state per (resource, project) pair
David Anderson's avatar
 
David Anderson committed
80
81
//
struct RSC_PROJECT_WORK_FETCH {
82
    // the following are persistent (saved in state file)
David Anderson's avatar
 
David Anderson committed
83
84
    double backoff_time;
    double backoff_interval;
85

86
87
88
89
    // the following used by REC accounting
    double secs_this_rec_interval;
    inline void reset_rec_accounting() {
        secs_this_rec_interval = 0;
David Anderson's avatar
 
David Anderson committed
90
    }
91
92
    double queue_est;
        // an estimate of instance-secs of queued work;
93
94
95
    bool anon_skip;
        // set if this project is anonymous platform
        // and it has no app version that uses this resource
96
97
98
99
    double fetchable_share;
        // this project's share relative to projects from which
        // we could probably get work for this resource;
        // determines how many instances this project deserves
100
    int n_runnable_jobs;
David Anderson's avatar
David Anderson committed
101
    double sim_nused;
102
        // # of instances used at this point in the simulation
103
    double nused_total;     // sum of instances over all runnable jobs
104
105
106
107
108
    int ncoprocs_excluded;
        // number of excluded instances
    int non_excluded_instances;
        // bitmap of non-excluded instances
        // (i.e. instances this project's jobs can run on)
109
110
    int deadlines_missed;
    int deadlines_missed_copy;
111
        // copy of the above used during schedule_cpus()
112
113
    std::deque<RESULT*> pending;
    std::deque<RESULT*>::iterator pending_iter;
David Anderson's avatar
 
David Anderson committed
114
115

    RSC_PROJECT_WORK_FETCH() {
116
117
        backoff_time = 0;
        backoff_interval = 0;
118
        secs_this_rec_interval = 0;
119
120
121
        queue_est = 0;
        anon_skip = false;
        fetchable_share = 0;
122
        n_runnable_jobs = 0;
123
124
        sim_nused = 0;
        nused_total = 0;
125
126
        ncoprocs_excluded = 0;
        non_excluded_instances = 0;
127
128
        deadlines_missed = 0;
        deadlines_missed_copy = 0;
David Anderson's avatar
 
David Anderson committed
129
130
    }

131
    inline void reset() {
David Anderson's avatar
 
David Anderson committed
132
133
134
        backoff_time = 0;
        backoff_interval = 0;
    }
135
136

    bool may_have_work;
137
    bool compute_may_have_work(PROJECT*, int rsc_type);
138
    void resource_backoff(PROJECT*, const char*);
139
    void rr_init(PROJECT*, int rsc_type);
140
141
142
143
    void clear_backoff() {
        backoff_time = 0;
        backoff_interval = 0;
    }
David Anderson's avatar
 
David Anderson committed
144
145
};

// estimate the time a resource will be saturated
// with high-priority jobs.
//
// busy_time[i] accumulates the durations assigned to instance i;
// ninstances is the number of instances tracked.
//
struct BUSY_TIME_ESTIMATOR {
    std::vector<double> busy_time;
    int ninstances;

    // start with no instances, so that every method below
    // is safe to call before init()
    //
    BUSY_TIME_ESTIMATOR() {
        ninstances = 0;
    }

    // zero the busy time of every instance
    //
    inline void reset() {
        for (int i=0; i<ninstances; i++) {
            busy_time[i] = 0;
        }
    }

    // track n instances, all with zero busy time.
    // A negative n is treated as 0 rather than being passed on
    // to resize() and to the modulo in update().
    //
    inline void init(int n) {
        if (n < 0) n = 0;
        ninstances = n;
        busy_time.resize(n);
        reset();
    }

    // called for each high-priority job.
    // Find the least-busy instance, and put this job
    // on that and following instances
    // (wrapping around modulo ninstances).
    // dur is added to each instance used;
    // jobs with nused < 1 are ignored.
    //
    inline void update(double dur, double nused) {
        if (ninstances <= 0) return;
        if (nused < 1) return;

        // the lowest index wins ties, since the comparison is strict
        int ibest = 0;
        double best = busy_time[0];
        for (int i=1; i<ninstances; i++) {
            if (busy_time[i] < best) {
                best = busy_time[i];
                ibest = i;
            }
        }
        int inused = (int) nused;     // ignore fractional usage
        for (int i=0; i<inused; i++) {
            int j = (ibest + i) % ninstances;
            busy_time[j] += dur;
        }
    }

    // the overall busy time is the busy time of
    // the least busy instance
    // (0 if there are no instances)
    //
    inline double get_busy_time() {
        if (ninstances <= 0) return 0;
        double best = busy_time[0];
        for (int i=1; i<ninstances; i++) {
            if (busy_time[i] < best) {
                best = busy_time[i];
            }
        }
        return best;
    }
};

// per-resource state
//
struct RSC_WORK_FETCH {
    int rsc_type;
    int ninstances;
204
    double relative_speed;   // total FLOPS relative to CPU total FLOPS
205
    bool has_exclusions;
David Anderson's avatar
 
David Anderson committed
206
207
208
209

    // the following used/set by rr_simulation():
    //
    double shortfall;
210
        // seconds of idle instances between now and now+work_buf_total()
David Anderson's avatar
 
David Anderson committed
211
    double nidle_now;
David Anderson's avatar
David Anderson committed
212
    double sim_nused;
213
214
215
216
217
    int sim_used_instances;
        // bitmap of instances used in simulation,
        // taking into account GPU exclusions
    int sim_excluded_instances;
        // bitmap of instances not used (i.e. starved because of exclusion)
218
219
    double total_fetchable_share;
        // total RS of projects from which we could fetch jobs for this device
220
221
222
    double saturated_time;
        // estimated time until resource is not saturated
        // used to calculate work request
223
224
    double deadline_missed_instances;
        // instance count for jobs that miss deadline
225
    BUSY_TIME_ESTIMATOR busy_time_estimator;
226
227
228
#ifdef SIM
    double estimated_delay;
#endif
229

230
231
232
    void init(int t, int n, double sp) {
        rsc_type = t;
        ninstances = n;
233
        relative_speed = sp;
234
235
        busy_time_estimator.init(n);
    }
236
237
238
    // the following specify the work request for this resource
    //
    double req_secs;
239
    double req_instances;
David Anderson's avatar
 
David Anderson committed
240

241
242
243
244
    // REC accounting
    double secs_this_rec_interval;
    inline void reset_rec_accounting() {
        this->secs_this_rec_interval = 0;
David Anderson's avatar
 
David Anderson committed
245
246
    }

247
248
249
    // temp in choose_project()
    PROJECT* found_project;     // a project able to ask for this work

David Anderson's avatar
 
David Anderson committed
250
    void rr_init();
251
    void update_stats(double sim_now, double dt, double buf_end);
252
    void update_busy_time(double dur, double nused);
253
    void supplement(PROJECT*);
David Anderson's avatar
 
David Anderson committed
254
    RSC_PROJECT_WORK_FETCH& project_state(PROJECT*);
255
    void print_state(const char*);
256
    void clear_request();
257
    void set_request(PROJECT*);
258
    void set_request_excluded(PROJECT*);
259
    bool may_have_work(PROJECT*);
260
    bool can_fetch(PROJECT*);
261
    bool backed_off(PROJECT*);
262
    bool uses_starved_excluded_instances(PROJECT*);
David Anderson's avatar
 
David Anderson committed
263
    RSC_WORK_FETCH() {
264
265
266
267
268
269
270
271
272
        rsc_type = 0;
        ninstances = 0;
        relative_speed = 0;
        shortfall = 0;
        nidle_now = 0;
        sim_nused = 0;
        total_fetchable_share = 0;
        saturated_time = 0;
        deadline_missed_instances = 0;
273
        has_exclusions = false;
David Anderson's avatar
 
David Anderson committed
274
275
276
277
278
279
280
    }
};


// per project state
//
struct PROJECT_WORK_FETCH {
281
282
283
284
    double rec;
        // recent estimated credit
    double rec_time;
        // when it was last updated
285
    double rec_temp;
286
287
288
        // temporary copy used during schedule_cpus() and work fetch
    double rec_temp_save;
        // temporary used during RR simulation
David Anderson's avatar
David Anderson committed
289
    int cant_fetch_work_reason;
David Anderson's avatar
David Anderson committed
290
    int compute_cant_fetch_work_reason(PROJECT*);
291
    int n_runnable_jobs;
David Anderson's avatar
 
David Anderson committed
292
293
294
    PROJECT_WORK_FETCH() {
        memset(this, 0, sizeof(*this));
    }
295
    void reset(PROJECT*);
David Anderson's avatar
 
David Anderson committed
296
297
298
299
300
};

// global work fetch state
//
struct WORK_FETCH {
301
302
    void setup();
    PROJECT* choose_project();
303
        // Find a project to ask for work.
David Anderson's avatar
David Anderson committed
304
    PROJECT* non_cpu_intensive_project_needing_work();
305
    void piggyback_work_request(PROJECT*);
306
        // we're going to contact this project anyway;
307
        // piggyback a work request if appropriate.
David Anderson's avatar
 
David Anderson committed
308
    void accumulate_inst_sec(ACTIVE_TASK*, double dt);
309
    void write_request(FILE*, PROJECT*);
310
311
312
    void handle_reply(
        PROJECT*, SCHEDULER_REPLY*, std::vector<RESULT*>new_results
    );
313
    void set_initial_work_request(PROJECT*);
314
    void set_all_requests(PROJECT*);
315
    void set_all_requests_hyst(PROJECT*, int rsc_type);
David Anderson's avatar
 
David Anderson committed
316
317
    void print_state();
    void init();
318
    void compute_cant_fetch_work_reason();
David Anderson's avatar
 
David Anderson committed
319
    void rr_init();
320
321
    void clear_request();
    void compute_shares();
322
    void clear_backoffs(APP_VERSION&);
323
    void request_string(char*);
324
    bool requested_work();
David Anderson's avatar
 
David Anderson committed
325
326
};

extern RSC_WORK_FETCH rsc_work_fetch[MAX_RSC];
David Anderson's avatar
 
David Anderson committed
328
329
extern WORK_FETCH work_fetch;

330
331
extern void set_no_rsc_config();

332
333
334
335
extern void project_priority_init(bool for_work_fetch);
extern double project_priority(PROJECT*);
extern void adjust_rec_sched(RESULT*);
extern void adjust_rec_work_fetch(RESULT*);
336

337
extern double total_peak_flops();
#endif