From 89b51ea43d5118ad89264dc0dc23c42c9709038c Mon Sep 17 00:00:00 2001 From: David Anderson <davea@ssl.berkeley.edu> Date: Fri, 25 Jul 2014 12:40:35 -0700 Subject: [PATCH] scheduler: preliminary support for generic coprocessors A "generic" coprocessor is one that's reported by the client, but is not of a type that the scheduler knows about (NVIDIA, AMD, Intel). With this commit the following works: - On the client, define a <coproc> in your cc_config.xml with a custom name, say 'miner_asic'. - define a plan class such as <plan_class> <name>foobar</name> <gpu_type>miner_asic</gpu_type> <cpu_frac>0.5</cpu_frac> </plan_class> - App versions of this plan class will be sent only to hosts that report a coproc of type "miner_asic". The <app_version>s in the scheduler reply will include a <coproc> element with the given name and count=1. This will cause the client (at least the current client) to run only one of these jobs at a time, and to schedule the CPU appropriately. Note: there's a lot missing from this: - app version FLOPS will be those of a CPU app; - jobs will be sent only if CPU work is requested ... and many other things. Fixing these issues requires a significant re-architecture of the scheduler, in particular getting rid of the PROC_TYPE_* constants and the associated arrays, which hard-wire the 3 fixed GPU types. 
--- lib/coproc.cpp | 9 ++++++ lib/coproc.h | 10 ++++++- sched/plan_class_spec.cpp | 58 ++++++++++++++++++++++++++++----------- sched/sched_check.cpp | 2 +- sched/sched_send.cpp | 10 +++---- sched/sched_types.cpp | 16 +++++++++-- sched/sched_types.h | 4 +++ sched/sched_version.cpp | 2 +- 8 files changed, 84 insertions(+), 27 deletions(-) diff --git a/lib/coproc.cpp b/lib/coproc.cpp index 3ab7378d76..00884920f9 100644 --- a/lib/coproc.cpp +++ b/lib/coproc.cpp @@ -235,6 +235,15 @@ int COPROCS::parse(XML_PARSER& xp) { } continue; } + if (xp.match_tag("coproc")) { + COPROC cp; + retval = cp.parse(xp); + if (!retval) { + coprocs[n_rsc++] = cp; + } else { + fprintf(stderr, "failed to parse <coproc>: %d\n", retval); + } + } } return ERR_XML_PARSE; } diff --git a/lib/coproc.h b/lib/coproc.h index 283f552042..4e7a75cc84 100644 --- a/lib/coproc.h +++ b/lib/coproc.h @@ -474,7 +474,7 @@ struct COPROCS { coprocs[n_rsc++] = c; return 0; } - COPROC* type_to_coproc(int t) { + COPROC* proc_type_to_coproc(int t) { switch(t) { case PROC_TYPE_NVIDIA_GPU: return &nvidia; case PROC_TYPE_AMD_GPU: return &ati; @@ -482,6 +482,14 @@ struct COPROCS { } return NULL; } + COPROC* lookup_type(const char* t) { + for (int i=1; i<n_rsc; i++) { + if (!strcmp(t, coprocs[i].type)) { + return &coprocs[i]; + } + } + return NULL; + } COPROCS() { n_rsc = 0; nvidia.count = 0; diff --git a/sched/plan_class_spec.cpp b/sched/plan_class_spec.cpp index f04fac73ce..053ffb2126 100644 --- a/sched/plan_class_spec.cpp +++ b/sched/plan_class_spec.cpp @@ -550,14 +550,14 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu) { // Intel GPU // - } else if (strstr(gpu_type, "intel")==gpu_type) { + } else if (strstr(gpu_type, "intel") == gpu_type) { COPROC& cp = sreq.coprocs.intel_gpu; cpp = &cp; if (!cp.count) { if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, - "[version] [version] No Intel GPUs found\n" + "[version] plan_class_spec: No Intel GPUs found\n" ); } return false; @@ -565,6 
+565,24 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu) { if (min_gpu_ram_mb) { gpu_requirements[PROC_TYPE_INTEL_GPU].update(0, min_gpu_ram_mb * MEGA); } + + // custom GPU type + // + } else { + cpp = sreq.coprocs.lookup_type(gpu_type); + if (!cpp) { + if (config.debug_version_select) { + log_messages.printf(MSG_NORMAL, + "[version] plan_class_spec: No %s found\n", gpu_type + ); + } + return false; + } + if (config.debug_version_select) { + log_messages.printf(MSG_NORMAL, + "[version] plan_class_spec: Custom coproc %s found\n", gpu_type + ); + } } if (opencl) { @@ -662,21 +680,29 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu) { gpu_usage = gpu_utilization; } - coproc_perf( - capped_host_fpops(), - gpu_peak_flops_scale * gpu_usage * cpp->peak_flops, - cpu_frac, - hu.projected_flops, - hu.avg_ncpus - ); - if (avg_ncpus) { - hu.avg_ncpus = avg_ncpus; - } - // I believe the first term here is just hu.projected_flops, - // but I'm leaving it spelled out to match GPU scheduling - // code in sched_customize.cpp + // if we don't know GPU peak flops, treat it like a CPU app // - hu.peak_flops = gpu_peak_flops_scale*gpu_usage*cpp->peak_flops + hu.avg_ncpus*capped_host_fpops(); + if (cpp->peak_flops == 0) { + strcpy(hu.custom_coproc_type, gpu_type); + hu.avg_ncpus = cpu_frac; + hu.gpu_usage = gpu_usage; + } else { + coproc_perf( + capped_host_fpops(), + gpu_peak_flops_scale * gpu_usage * cpp->peak_flops, + cpu_frac, + hu.projected_flops, + hu.avg_ncpus + ); + if (avg_ncpus) { + hu.avg_ncpus = avg_ncpus; + } + // I believe the first term here is just hu.projected_flops, + // but I'm leaving it spelled out to match GPU scheduling + // code in sched_customize.cpp + // + hu.peak_flops = gpu_peak_flops_scale*gpu_usage*cpp->peak_flops + hu.avg_ncpus*capped_host_fpops(); + } if (!strcmp(gpu_type, "amd") || !strcmp(gpu_type, "ati")) { hu.proc_type = PROC_TYPE_AMD_GPU; diff --git a/sched/sched_check.cpp b/sched/sched_check.cpp index 
5e4deba503..fef19b3f5c 100644 --- a/sched/sched_check.cpp +++ b/sched/sched_check.cpp @@ -140,7 +140,7 @@ static inline double get_estimated_delay(BEST_APP_VERSION& bav) { if (pt == PROC_TYPE_CPU) { return g_request->cpu_estimated_delay; } - COPROC* cp = g_request->coprocs.type_to_coproc(pt); + COPROC* cp = g_request->coprocs.proc_type_to_coproc(pt); return cp->estimated_delay; } diff --git a/sched/sched_send.cpp b/sched/sched_send.cpp index 63fd5bc88b..aa8c183423 100644 --- a/sched/sched_send.cpp +++ b/sched/sched_send.cpp @@ -151,7 +151,7 @@ void WORK_REQ_BASE::get_job_limits() { for (i=1; i<g_request->coprocs.n_rsc; i++) { COPROC& cp = g_request->coprocs.coprocs[i]; int proc_type = coproc_type_name_to_num(cp.type); - if (!proc_type) continue; + if (proc_type < 0) continue; n = cp.count; if (n > MAX_GPUS) n = MAX_GPUS; ninstances[proc_type] = n; @@ -546,7 +546,7 @@ static inline void update_estimated_delay(BEST_APP_VERSION& bav, double dt) { if (pt == PROC_TYPE_CPU) { g_request->cpu_estimated_delay += dt*bav.host_usage.avg_ncpus/g_request->host.p_ncpus; } else { - COPROC* cp = g_request->coprocs.type_to_coproc(pt); + COPROC* cp = g_request->coprocs.proc_type_to_coproc(pt); cp->estimated_delay += dt*bav.host_usage.gpu_usage/cp->count; } } @@ -1169,7 +1169,7 @@ void send_gpu_messages() { bool usable_gpu = false; bool have_gpu_apps = false; for (int i=1; i<NPROC_TYPES; i++) { - COPROC* cp = g_request->coprocs.type_to_coproc(i); + COPROC* cp = g_request->coprocs.proc_type_to_coproc(i); if (ssp->have_apps_for_proc_type[i]) { have_gpu_apps = true; if (cp->count) { @@ -1429,7 +1429,7 @@ void send_work_setup() { // do sanity checking on GPU scheduling parameters // for (i=1; i<NPROC_TYPES; i++) { - COPROC* cp = g_request->coprocs.type_to_coproc(i); + COPROC* cp = g_request->coprocs.proc_type_to_coproc(i); if (cp->count) { g_wreq->req_secs[i] = clamp_req_sec(cp->req_secs); g_wreq->req_instances[i] = cp->req_instances; @@ -1490,7 +1490,7 @@ void send_work_setup() { 
g_request->cpu_estimated_delay ); for (i=1; i<NPROC_TYPES; i++) { - COPROC* cp = g_request->coprocs.type_to_coproc(i); + COPROC* cp = g_request->coprocs.proc_type_to_coproc(i); if (cp->count) { log_messages.printf(MSG_NORMAL, "[send] %s: req %.2f sec, %.2f instances; est delay %.2f\n", diff --git a/sched/sched_types.cpp b/sched/sched_types.cpp index ad66ed2653..b4d0ee325a 100644 --- a/sched/sched_types.cpp +++ b/sched/sched_types.cpp @@ -71,7 +71,7 @@ int CLIENT_APP_VERSION::parse(XML_PARSER& xp) { double pf = host_usage.avg_ncpus * g_reply->host.p_fpops; if (host_usage.proc_type != PROC_TYPE_CPU) { - COPROC* cp = g_request->coprocs.type_to_coproc(host_usage.proc_type); + COPROC* cp = g_request->coprocs.proc_type_to_coproc(host_usage.proc_type); pf += host_usage.gpu_usage*cp->peak_flops; } host_usage.peak_flops = pf; @@ -94,7 +94,7 @@ int CLIENT_APP_VERSION::parse(XML_PARSER& xp) { int retval = coproc_req.parse(xp); if (!retval) { int rt = coproc_type_name_to_num(coproc_req.type); - if (!rt) { + if (rt <= 0) { log_messages.printf(MSG_NORMAL, "UNKNOWN COPROC TYPE %s\n", coproc_req.type ); @@ -626,7 +626,7 @@ static bool have_apps_for_client() { for (int i=0; i<NPROC_TYPES; i++) { if (ssp->have_apps_for_proc_type[i]) { if (!i) return true; - COPROC* cp = g_request->coprocs.type_to_coproc(i); + COPROC* cp = g_request->coprocs.proc_type_to_coproc(i); if (cp->count) return true; } } @@ -1095,6 +1095,16 @@ int APP_VERSION::write(FILE* fout) { bavp->host_usage.gpu_usage ); } + if (strlen(bavp->host_usage.custom_coproc_type)) { + fprintf(fout, + " <coproc>\n" + " <type>%s</type>\n" + " <count>%f</count>\n" + " </coproc>\n", + bavp->host_usage.custom_coproc_type, + bavp->host_usage.gpu_usage + ); + } if (bavp->host_usage.gpu_ram) { fprintf(fout, " <gpu_ram>%f</gpu_ram>\n", diff --git a/sched/sched_types.h b/sched/sched_types.h index 7efb71cbf0..79743270ea 100644 --- a/sched/sched_types.h +++ b/sched/sched_types.h @@ -73,6 +73,9 @@ struct HOST_USAGE { double peak_flops; // 
stored in result.flops_estimate, and used for credit calculations char cmdline[256]; + char custom_coproc_type[256]; + // if we're using a custom GPU type, its name + // TODO: get rid of PROC_TYPE_*, and this HOST_USAGE() { proc_type = PROC_TYPE_CPU; @@ -83,6 +86,7 @@ struct HOST_USAGE { projected_flops = 0; peak_flops = 0; strcpy(cmdline, ""); + strcpy(custom_coproc_type, ""); } void sequential_app(double flops) { proc_type = PROC_TYPE_CPU; diff --git a/sched/sched_version.cpp b/sched/sched_version.cpp index ee93711560..a72125b23b 100644 --- a/sched/sched_version.cpp +++ b/sched/sched_version.cpp @@ -94,7 +94,7 @@ inline int scaled_max_jobs_per_day(DB_HOST_APP_VERSION& hav, HOST_USAGE& hu) { n *= g_reply->host.p_ncpus; } } else { - COPROC* cp = g_request->coprocs.type_to_coproc(hu.proc_type); + COPROC* cp = g_request->coprocs.proc_type_to_coproc(hu.proc_type); if (cp->count) { n *= cp->count; } -- GitLab