From 89b51ea43d5118ad89264dc0dc23c42c9709038c Mon Sep 17 00:00:00 2001
From: David Anderson <davea@ssl.berkeley.edu>
Date: Fri, 25 Jul 2014 12:40:35 -0700
Subject: [PATCH] scheduler: preliminary support for generic coprocessors

A "generic" coprocessor is one that's reported by the client,
but's not of a type that the scheduler knows about (NVIDIA, AMD, Intel).

With this commit the following works:
- On the client, define a <coproc> in your cc_config.xml
  with a custom name, say 'miner_asic' (first sketch below).
- On the project, define a plan class in plan_class_spec.xml such as
  <plan_class>
    <name>foobar</name>
    <gpu_type>miner_asic</gpu_type>
    <cpu_frac>0.5</cpu_frac>
  </plan_class>
- App versions of this plan class will be sent only to hosts
  that report a coproc of type "miner_asic".
  The <app_version>s in the scheduler reply will include
  a <coproc> element with the given name and count=1
  (second sketch below).
  This will cause the client (at least the current client)
  to run only one of these jobs at a time,
  and to schedule the CPU appropriately.
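
For illustration, the client-side declaration (first sketch) could
look roughly like this; <type> and <count> match the fields this
commit serializes in APP_VERSION::write(), while the optional
<peak_flops> line is an assumption about what COPROC::parse() accepts:
  <coproc>
    <type>miner_asic</type>
    <count>1</count>
    <peak_flops>1e9</peak_flops>
  </coproc>
If peak FLOPS is zero or unknown, the scheduler treats such an app
version like a CPU app (see the plan_class_spec.cpp change below).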
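
The matching <app_version> in the scheduler reply (second sketch)
would then carry, per the new code in APP_VERSION::write():
  <coproc>
    <type>miner_asic</type>
    <count>1.000000</count>
  </coproc>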

Note: there's still a lot missing from this:
- app version FLOPS will be those of a CPU app;
- jobs will be sent only if CPU work is requested;
... and many other things.
Fixing these issues requires a significant re-architecture of the scheduler,
in particular getting rid of the PROC_TYPE_* constants
and the associated arrays,
which hard-wire the 3 fixed GPU types.
---
 lib/coproc.cpp            |  9 ++++++
 lib/coproc.h              | 10 ++++++-
 sched/plan_class_spec.cpp | 58 ++++++++++++++++++++++++++++-----------
 sched/sched_check.cpp     |  2 +-
 sched/sched_send.cpp      | 10 +++----
 sched/sched_types.cpp     | 16 +++++++++--
 sched/sched_types.h       |  4 +++
 sched/sched_version.cpp   |  2 +-
 8 files changed, 84 insertions(+), 27 deletions(-)

diff --git a/lib/coproc.cpp b/lib/coproc.cpp
index 3ab7378d76..00884920f9 100644
--- a/lib/coproc.cpp
+++ b/lib/coproc.cpp
@@ -235,6 +235,15 @@ int COPROCS::parse(XML_PARSER& xp) {
             }
             continue;
         }
+        if (xp.match_tag("coproc")) {
+            COPROC cp;
+            retval = cp.parse(xp);
+            if (!retval) {
+                coprocs[n_rsc++] = cp;
+            } else {
+                fprintf(stderr, "failed to parse <coproc>: %d\n", retval);
+            }
+        }
     }
     return ERR_XML_PARSE;
 }
diff --git a/lib/coproc.h b/lib/coproc.h
index 283f552042..4e7a75cc84 100644
--- a/lib/coproc.h
+++ b/lib/coproc.h
@@ -474,7 +474,7 @@ struct COPROCS {
         coprocs[n_rsc++] = c;
         return 0;
     }
-    COPROC* type_to_coproc(int t) {
+    COPROC* proc_type_to_coproc(int t) {
         switch(t) {
         case PROC_TYPE_NVIDIA_GPU: return &nvidia;
         case PROC_TYPE_AMD_GPU: return &ati;
@@ -482,6 +482,14 @@ struct COPROCS {
         }
         return NULL;
     }
+    COPROC* lookup_type(const char* t) {
+        for (int i=1; i<n_rsc; i++) {
+            if (!strcmp(t, coprocs[i].type)) {
+                return &coprocs[i];
+            }
+        }
+        return NULL;
+    }
     COPROCS() {
         n_rsc = 0;
         nvidia.count = 0;
diff --git a/sched/plan_class_spec.cpp b/sched/plan_class_spec.cpp
index f04fac73ce..053ffb2126 100644
--- a/sched/plan_class_spec.cpp
+++ b/sched/plan_class_spec.cpp
@@ -550,14 +550,14 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu) {
 
     // Intel GPU
     //
-    } else if (strstr(gpu_type, "intel")==gpu_type) {
+    } else if (strstr(gpu_type, "intel") == gpu_type) {
         COPROC& cp = sreq.coprocs.intel_gpu;
         cpp = &cp;
 
         if (!cp.count) {
             if (config.debug_version_select) {
                 log_messages.printf(MSG_NORMAL,
-                    "[version] [version] No Intel GPUs found\n"
+                    "[version] plan_class_spec: No Intel GPUs found\n"
                 );
             }
             return false;
@@ -565,6 +565,24 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu) {
         if (min_gpu_ram_mb) {
             gpu_requirements[PROC_TYPE_INTEL_GPU].update(0, min_gpu_ram_mb * MEGA);
         }
+
+    // custom GPU type
+    //
+    } else {
+        cpp = sreq.coprocs.lookup_type(gpu_type);
+        if (!cpp) {
+            if (config.debug_version_select) {
+                log_messages.printf(MSG_NORMAL,
+                    "[version] plan_class_spec: No %s found\n", gpu_type
+                );
+            }
+            return false;
+        }
+        if (config.debug_version_select) {
+            log_messages.printf(MSG_NORMAL,
+                "[version] plan_class_spec: Custom coproc %s found\n", gpu_type
+            );
+        }
     }
 
     if (opencl) {
@@ -662,21 +680,29 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu) {
             gpu_usage = gpu_utilization;
         }
 
-        coproc_perf(
-            capped_host_fpops(),
-            gpu_peak_flops_scale * gpu_usage * cpp->peak_flops,
-            cpu_frac,
-            hu.projected_flops,
-            hu.avg_ncpus
-        );
-        if (avg_ncpus) {
-            hu.avg_ncpus = avg_ncpus;
-        }
-        // I believe the first term here is just hu.projected_flops,
-        // but I'm leaving it spelled out to match GPU scheduling 
-        // code in sched_customize.cpp
+        // if we don't know GPU peak flops, treat it like a CPU app
         //
-        hu.peak_flops = gpu_peak_flops_scale*gpu_usage*cpp->peak_flops + hu.avg_ncpus*capped_host_fpops();
+        if (cpp->peak_flops == 0) {
+            strcpy(hu.custom_coproc_type, gpu_type);
+            hu.avg_ncpus = cpu_frac;
+            hu.gpu_usage = gpu_usage;
+        } else {
+            coproc_perf(
+                capped_host_fpops(),
+                gpu_peak_flops_scale * gpu_usage * cpp->peak_flops,
+                cpu_frac,
+                hu.projected_flops,
+                hu.avg_ncpus
+            );
+            if (avg_ncpus) {
+                hu.avg_ncpus = avg_ncpus;
+            }
+            // I believe the first term here is just hu.projected_flops,
+            // but I'm leaving it spelled out to match GPU scheduling 
+            // code in sched_customize.cpp
+            //
+            hu.peak_flops = gpu_peak_flops_scale*gpu_usage*cpp->peak_flops + hu.avg_ncpus*capped_host_fpops();
+        }
 
         if (!strcmp(gpu_type, "amd") || !strcmp(gpu_type, "ati")) {
             hu.proc_type = PROC_TYPE_AMD_GPU;
diff --git a/sched/sched_check.cpp b/sched/sched_check.cpp
index 5e4deba503..fef19b3f5c 100644
--- a/sched/sched_check.cpp
+++ b/sched/sched_check.cpp
@@ -140,7 +140,7 @@ static inline double get_estimated_delay(BEST_APP_VERSION& bav) {
     if (pt == PROC_TYPE_CPU) {
         return g_request->cpu_estimated_delay;
     }
-    COPROC* cp = g_request->coprocs.type_to_coproc(pt);
+    COPROC* cp = g_request->coprocs.proc_type_to_coproc(pt);
     return cp->estimated_delay;
 }
 
diff --git a/sched/sched_send.cpp b/sched/sched_send.cpp
index 63fd5bc88b..aa8c183423 100644
--- a/sched/sched_send.cpp
+++ b/sched/sched_send.cpp
@@ -151,7 +151,7 @@ void WORK_REQ_BASE::get_job_limits() {
     for (i=1; i<g_request->coprocs.n_rsc; i++) {
         COPROC& cp = g_request->coprocs.coprocs[i];
         int proc_type = coproc_type_name_to_num(cp.type);
-        if (!proc_type) continue;
+        if (proc_type < 0) continue;
         n = cp.count;
         if (n > MAX_GPUS) n = MAX_GPUS;
         ninstances[proc_type] = n;
@@ -546,7 +546,7 @@ static inline void update_estimated_delay(BEST_APP_VERSION& bav, double dt) {
     if (pt == PROC_TYPE_CPU) {
         g_request->cpu_estimated_delay += dt*bav.host_usage.avg_ncpus/g_request->host.p_ncpus;
     } else {
-        COPROC* cp = g_request->coprocs.type_to_coproc(pt);
+        COPROC* cp = g_request->coprocs.proc_type_to_coproc(pt);
         cp->estimated_delay += dt*bav.host_usage.gpu_usage/cp->count;
     }
 }
@@ -1169,7 +1169,7 @@ void send_gpu_messages() {
     bool usable_gpu = false;
     bool have_gpu_apps = false;
     for (int i=1; i<NPROC_TYPES; i++) {
-        COPROC* cp = g_request->coprocs.type_to_coproc(i);
+        COPROC* cp = g_request->coprocs.proc_type_to_coproc(i);
         if (ssp->have_apps_for_proc_type[i]) {
             have_gpu_apps = true;
             if (cp->count) {
@@ -1429,7 +1429,7 @@ void send_work_setup() {
     // do sanity checking on GPU scheduling parameters
     //
     for (i=1; i<NPROC_TYPES; i++) {
-        COPROC* cp = g_request->coprocs.type_to_coproc(i);
+        COPROC* cp = g_request->coprocs.proc_type_to_coproc(i);
         if (cp->count) {
             g_wreq->req_secs[i] = clamp_req_sec(cp->req_secs);
             g_wreq->req_instances[i] = cp->req_instances;
@@ -1490,7 +1490,7 @@ void send_work_setup() {
             g_request->cpu_estimated_delay
         );
         for (i=1; i<NPROC_TYPES; i++) {
-            COPROC* cp = g_request->coprocs.type_to_coproc(i);
+            COPROC* cp = g_request->coprocs.proc_type_to_coproc(i);
             if (cp->count) {
                 log_messages.printf(MSG_NORMAL,
                     "[send] %s: req %.2f sec, %.2f instances; est delay %.2f\n",
diff --git a/sched/sched_types.cpp b/sched/sched_types.cpp
index ad66ed2653..b4d0ee325a 100644
--- a/sched/sched_types.cpp
+++ b/sched/sched_types.cpp
@@ -71,7 +71,7 @@ int CLIENT_APP_VERSION::parse(XML_PARSER& xp) {
 
             double pf = host_usage.avg_ncpus * g_reply->host.p_fpops;
             if (host_usage.proc_type != PROC_TYPE_CPU) {
-                COPROC* cp = g_request->coprocs.type_to_coproc(host_usage.proc_type);
+                COPROC* cp = g_request->coprocs.proc_type_to_coproc(host_usage.proc_type);
                 pf += host_usage.gpu_usage*cp->peak_flops;
             }
             host_usage.peak_flops = pf;
@@ -94,7 +94,7 @@ int CLIENT_APP_VERSION::parse(XML_PARSER& xp) {
             int retval = coproc_req.parse(xp);
             if (!retval) {
                 int rt = coproc_type_name_to_num(coproc_req.type);
-                if (!rt) {
+                if (rt <= 0) {
                     log_messages.printf(MSG_NORMAL,
                         "UNKNOWN COPROC TYPE %s\n", coproc_req.type
                     );
@@ -626,7 +626,7 @@ static bool have_apps_for_client() {
     for (int i=0; i<NPROC_TYPES; i++) {
         if (ssp->have_apps_for_proc_type[i]) {
             if (!i) return true;
-            COPROC* cp = g_request->coprocs.type_to_coproc(i);
+            COPROC* cp = g_request->coprocs.proc_type_to_coproc(i);
             if (cp->count) return true;
         }
     }
@@ -1095,6 +1095,16 @@ int APP_VERSION::write(FILE* fout) {
             bavp->host_usage.gpu_usage
         );
     }
+    if (strlen(bavp->host_usage.custom_coproc_type)) {
+        fprintf(fout,
+            "    <coproc>\n"
+            "        <type>%s</type>\n"
+            "        <count>%f</count>\n"
+            "    </coproc>\n",
+            bavp->host_usage.custom_coproc_type,
+            bavp->host_usage.gpu_usage
+        );
+    }
     if (bavp->host_usage.gpu_ram) {
         fprintf(fout,
             "    <gpu_ram>%f</gpu_ram>\n",
diff --git a/sched/sched_types.h b/sched/sched_types.h
index 7efb71cbf0..79743270ea 100644
--- a/sched/sched_types.h
+++ b/sched/sched_types.h
@@ -73,6 +73,9 @@ struct HOST_USAGE {
     double peak_flops;
         // stored in result.flops_estimate, and used for credit calculations
     char cmdline[256];
+    char custom_coproc_type[256];
+        // if we're using a custom GPU type, its name
+        // TODO: get rid of PROC_TYPE_*, and this
 
     HOST_USAGE() {
         proc_type = PROC_TYPE_CPU;
@@ -83,6 +86,7 @@ struct HOST_USAGE {
         projected_flops = 0;
         peak_flops = 0;
         strcpy(cmdline, "");
+        strcpy(custom_coproc_type, "");
     }
     void sequential_app(double flops) {
         proc_type = PROC_TYPE_CPU;
diff --git a/sched/sched_version.cpp b/sched/sched_version.cpp
index ee93711560..a72125b23b 100644
--- a/sched/sched_version.cpp
+++ b/sched/sched_version.cpp
@@ -94,7 +94,7 @@ inline int scaled_max_jobs_per_day(DB_HOST_APP_VERSION& hav, HOST_USAGE& hu) {
             n *= g_reply->host.p_ncpus;
         }
     } else {
-        COPROC* cp = g_request->coprocs.type_to_coproc(hu.proc_type);
+        COPROC* cp = g_request->coprocs.proc_type_to_coproc(hu.proc_type);
         if (cp->count) {
             n *= cp->count;
         }
-- 
GitLab