...
 
Commits (4)
......@@ -297,6 +297,12 @@ resources. All of these only work with two types of resources.
dual approximation, heteroprio-based greedy algorithm. Inspired from [Scheduling Data Flow Program in XKaapi: A
New Affinity Based Algorithm for Heterogeneous Architectures](https://hal.inria.fr/hal-01081629v1), with only the
second part of the schedule.
+ `dp3demi`
dual approximation, dynamic-programming-based algorithm. Based
on APPROX-3/2 from
[Scheduling Independent Moldable Tasks on Multi-Cores with GPUs](https://hal.inria.fr/hal-01516752),
but restricted to the non-moldable case. A more general
(2q+1)/(2q) approximation is also to appear in IJFCS.
+ `accel`
Accel algorithm from [Scheduling Independent Tasks on Multi-cores with GPU Accelerators](https://hal.inria.fr/hal-01081625),
Section 4.
......
#ifndef INDEPDP2_H
#define INDEPDP2_H
#include "IndepAllocator.h"
#include "IndepDualGeneric.h"
#include "instance.h"
#include <vector>
extern double lowerBoundTwoResource(Instance& ins, std::vector<int> taskSet,
double CPUload = 0, double GPUload = 0);
class IndepDP2 : public IndepAllocator {
class IndepDP2 : public IndepDualGeneric {
protected:
double tryGuess(Instance &, std::vector<int> taskSet, double maxGPUload, double maxlen,
IndepResult & result, bool getResult);
double epsilon = 0.01;
double tryGuess(Instance &, std::vector<int> taskSet, std::vector<double>& loads,
double maxlen, IndepResult & result, bool getResult);
double discretizationConstant = 3.0;
public:
IndepDP2(const AlgOptions& opt);
IndepResult compute(Instance &, std::vector<int> &taskSet, std::vector<double> &loads);
};
......
#ifndef INDEPDP3DEMI_H
#define INDEPDP3DEMI_H
#include "IndepDualGeneric.h"
#include "instance.h"
#include <vector>
// Independent-task allocator implementing the `dp3demi` algorithm:
// dual approximation with a dynamic-programming feasibility test, based on
// APPROX-3/2 from "Scheduling Independent Moldable Tasks on Multi-Cores
// with GPUs" (hal-01516752), restricted to the non-moldable case.
// The dichotomy on the makespan guess comes from IndepDualGeneric; this
// class only supplies the per-guess feasibility test.
class IndepDP3Demi : public IndepDualGeneric {
protected:
// Tests whether a schedule of length `target` is achievable for taskSet,
// given the existing loads on each resource type (loads[0] = CPU,
// loads[1] = GPU); fills `result` with the computed assignment when
// getResult is true. Returns the minimum CPU load, or -1 if infeasible.
double tryGuess(Instance &, std::vector<int> taskSet, std::vector<double>& loads,
double target, IndepResult & result, bool getResult);
// Discretization constant for GPU loads (overridable via option "disc");
// the DP rounds GPU times with ratio target / (disc * n).
double discretizationConstant = 3.0;
public:
IndepDP3Demi(const AlgOptions& opt);
};
#endif
......@@ -4,7 +4,9 @@ set(SCHED_SRC
PreAllocatedGreedy.cpp
HeteroPrio.cpp
IndepBalanced.cpp
IndepDualGeneric.cpp ../include/IndepDualGeneric.h
IndepDP2.cpp
IndepDP3Demi.cpp
IndepImreh.cpp
IndepAccel.cpp
IndepBased.cpp
......@@ -15,7 +17,9 @@ set(SCHED_SRC
availSequence.cpp
GreedyAlgorithm.cpp
HeftAlgorithm.cpp
OnlineZhang.cpp ../include/OnlineZhang.h GreedyPerType.cpp ../include/GreedyPerType.h TrueHeteroPrio.cpp ../include/TrueHeteroPrio.h OnlineGeneric.cpp ../include/OnlineGeneric.h OnlineQA.cpp ../include/OnlineQA.h OnlineECT.cpp ../include/OnlineECT.h OnlineERLS.cpp ../include/OnlineERLS.h IndepCLB2C.cpp ../include/IndepCLB2C.h)
OnlineZhang.cpp ../include/OnlineZhang.h GreedyPerType.cpp ../include/GreedyPerType.h TrueHeteroPrio.cpp ../include/TrueHeteroPrio.h OnlineGeneric.cpp ../include/OnlineGeneric.h OnlineQA.cpp ../include/OnlineQA.h OnlineECT.cpp ../include/OnlineECT.h OnlineERLS.cpp ../include/OnlineERLS.h IndepCLB2C.cpp ../include/IndepCLB2C.h
)
if(CPLEX_FOUND)
set(SCHED_SRC ${SCHED_SRC} SchedLPIndep.cpp ../include/SchedLPIndep.h AreaRound.cpp ../include/AreaRound.h)
......
......@@ -36,7 +36,7 @@ bool IndepAccel::tryGuess(Instance& ins, std::vector<int> taskSet, double target
// Sets "2" are the short ones
vector<int> setG1, setG2, setC1, setC2;
for(auto t: taskSet) {
if(ins.execType(IA_CPU, t) <= target)
if(cpuTime(ins, t) <= target / 2.0)
setC2.push_back(t);
else
setG1.push_back(t);
......@@ -68,20 +68,16 @@ bool IndepAccel::tryGuess(Instance& ins, std::vector<int> taskSet, double target
cout << "IndepAccel: start. C1; " << setC1 << " C2: " << setC2 << " G1: " << setG1 << " G2: " << setG2 << endl;
// Tasks in G1 with GPU time small enough go to G2: they do not disturb the schedule
// We find them at the end of G1 because of how we sorted G1.
if(!setG1.empty()) {
auto it = setG1.end() - 1;
while (gpuTime(ins, *it) < target / 2.0) {
setG2.push_back(*it);
if(it == setG1.begin()) break;
it--;
}
if(gpuTime(ins, *it) >= target / 2.0) it++;
setG1.erase(it, setG1.end());
vector<int> newSetG1;
for(int& t: setG1) {
if(gpuTime(ins, t) < target / 2.0)
setG2.push_back(t);
else
newSetG1.push_back(t);
}
setG1 = newSetG1;
if(((int) setG1.size()) > ins.nbWorkers[IA_CPU] + ins.nbWorkers[IA_GPU]) {
if((int) setG1.size() > ins.nbWorkers[IA_CPU] + ins.nbWorkers[IA_GPU]) {
if(verbosity >= 6)
cout << "IndepAccel: G1 set too big, not feasible. "<< endl;
return false;
......
......@@ -2,6 +2,7 @@
#include "GreedyAlgorithm.h"
#include "util.h"
#include "IndepDP2.h"
#include "IndepDP3Demi.h"
#include "IndepAccel.h"
#include "IndepDualHP.h"
#include "IndepImreh.h"
......@@ -28,6 +29,8 @@ IndepBased::IndepBased(const AlgOptions& options) : GreedyAlgorithm(options) {
string indepName = options.asString("indep", "dualhp");
if(indepName == "dp2")
indep = new IndepDP2(options);
if(indepName == "dp3demi")
indep = new IndepDP3Demi(options);
if(indepName == "dualhp")
indep = new IndepDualHP(options);
if(indepName == "accel") {
......
......@@ -9,23 +9,24 @@
using namespace std;
IndepDP2::IndepDP2(const AlgOptions& opt): IndepAllocator(opt) {
verbosity = opt.asInt("verb_DP2", verbosity);
discretizationConstant = opt.asDouble("disc", 3.0);
IndepDP2::IndepDP2(const AlgOptions& opt): IndepDualGeneric(opt) {
discretizationConstant = opt.asDouble("disc", discretizationConstant);
}
// Arbitrary convention: CPU times are index 0, GPU times are index 1. Just a naming thing.
// returns minimum CPU load
double IndepDP2::tryGuess(Instance& instance, std::vector<int> taskSet, double maxGPUload, double maxlen,
IndepResult &result, bool getResult) {
// returns minimum CPU load, taking into account existing loads
double IndepDP2::tryGuess(Instance& instance, std::vector<int> taskSet, vector<double>& loads,
double maxlen, IndepResult &result, bool getResult) {
// CPUload(i, g) := smallest load on CPU from the first i tasks,
// with at most g load on GPU
// So need to discretize the GPU load ? Yes. Paper says with a ratio of lambda/3n
// For all tasks in taskSet:
// forall g, CPUload(i, g) = min CPUload(i-1, g-T^G_i) CPUload(i-1, g) + T^C_i
double existingCPUload = loads[0];
double existingGPUload = loads[1];
double maxGPUload = maxlen * instance.nbWorkers[1] - existingGPUload;
if(maxGPUload < 0) maxGPUload = 1;
......@@ -118,162 +119,5 @@ double IndepDP2::tryGuess(Instance& instance, std::vector<int> taskSet, double m
delete[] CPUload[0];
delete[] CPUload;
return value;
}
// Allocates the independent tasks in taskSet to the two resource types,
// taking the existing loads into account (loads[0] = CPU, loads[1] = GPU).
// Dual-approximation scheme: dichotomy on a target makespan, each guess
// being tested with tryGuess(). Returns the partition of taskSet
// (result[0] = tasks on CPU, result[1] = tasks on GPU).
// Only supports instances with exactly 2 worker types; throws otherwise.
IndepResult IndepDP2::compute(Instance& instance, vector<int> &taskSet, vector<double> &loads) {
if(instance.nbWorkerTypes != 2) {
cerr << "IndepDP2: only implemented for instances with 2 worker types" << endl;
throw(1);
}
IndepResult result(2); // 2 because there are two resource types.
int nbCPU = instance.nbWorkers[0];
int nbGPU = instance.nbWorkers[1];
if(verbosity >= 4) {
cout << "IndepDP2: called with TS=" << taskSet << " and loads=" << loads << endl;
cout << " CPU times: ";
for(int i : taskSet) cout << instance.execType(0, i) << " ";
cout << endl;
cout << " GPU times: ";
for(int i : taskSet) cout << instance.execType(1, i) << " ";
cout << endl;
}
if(taskSet.size() == 0)
return result;
// Only the load imbalance matters for the allocation decision: remove the
// part common to both resource types. NOTE(review): this mutates the
// caller's `loads` vector — presumably intended, confirm with callers.
double minload = min(loads[0], loads[1]);
loads[0] -= minload;
loads[1] -= minload;
// Dichotomy bounds: `low` is a proven lower bound on the makespan,
// `up` is the best feasible guess found so far (infinity until one is found).
double low = lowerBoundTwoResource(instance, taskSet, loads[0], loads[1]);
double up = std::numeric_limits<double>::infinity();
// TODO: optim for the case where GPUarea <= min execution time on CPU: result = all on GPU !
/* double firstguess = low;
if(area > low) {
low = area;
firstguess = 1.15*area;
}
bool haveResult = false;
double target = firstguess;
double r = tryGuess(instance, taskSet, target * nbGPU - loads[1], target, result, target == low);
if(verbosity >= 6)
cout << "IndepDP2: firstguess = "<< firstguess << ", result = " << r << " mkspan = " << (r+loads[0])/ nbCPU << endl;
if((r != -1) && (r + loads[0])/nbCPU <= target*(1+ epsilon)) {
up = firstguess;
haveResult = (target == low);
} else {
low = firstguess;
}
*/
double target;
bool haveResult = false;
// Then, dichotomy, as usual
// `haveResult` records whether the last accepted call to tryGuess() was
// made with getResult == true, i.e. whether `result` already holds the
// allocation for the final value of `up`.
while(abs(up - low) > epsilon*low) {
if(up != std::numeric_limits<double>::infinity())
// Standard bisection once a feasible upper bound is known.
target = (up + low) / 2;
else
// No feasible guess yet: grow the target geometrically.
target = 1.15*low;
double r = tryGuess(instance, taskSet, target * nbGPU - loads[1], target, result, abs(target - low) <= 3*epsilon*low);
if(verbosity >= 6)
cout << "IndepDP2: TARGET = "<< target << ", result = " << r << " mkspan= " << (r+loads[0])/ nbCPU << endl;
// Accept the guess if feasible and the resulting CPU makespan stays
// within (1 + epsilon) of the target.
if((r != -1) && (r + loads[0]) /nbCPU <= target * (1+ epsilon)) {
up = target;
haveResult = (abs(target - low) <= 3*epsilon*low);
}
else {
low = target;
}
}
// If the last accepted guess was probed without materializing the
// allocation, re-run tryGuess() at `up` to fill `result`.
if(! haveResult) {
double r = tryGuess(instance, taskSet, up * nbGPU - loads[1], up, result, true);
if(verbosity >= 6)
cout << "IndepDP2: TARGET = "<< up << ", result = " << r << " mkspan= " << (r+loads[0])/ nbCPU << endl;
}
/* if(verbosity >= 4)
cout << "Result of IndepDP2: " << result << endl; */
return result;
}
double lowerBoundTwoResource(Instance& instance, vector<int> taskSet,
double CPUload, double GPUload) {
if(instance.nbWorkerTypes != 2) {
cerr << "lowerBoundTwoResources: only implemented for instances with 2 worker types" << endl;
throw(1);
}
int nbCPU = instance.nbWorkers[0];
int nbGPU = instance.nbWorkers[1];
double longest = 0;
struct TypeData {
int nbTasks;
double CPUtime;
double GPUtime;
double ratio;
};
vector<TypeData> taskTypes(instance.nbTaskTypes);
for(auto &&t: taskSet) {
taskTypes[instance.taskTypes[t]].nbTasks++;
}
for(int i = 0; i < instance.nbTaskTypes; i++) {
TypeData &d = taskTypes[i];
d.CPUtime = instance.execTimes[0][i] / nbCPU;
d.GPUtime = instance.execTimes[1][i] / nbGPU;
d.ratio = instance.execTimes[0][i] / instance.execTimes[1][i];
if(d.nbTasks > 0)
longest = max(longest,
min(instance.execTimes[0][i], instance.execTimes[1][i]));
}
sort(taskTypes.begin(), taskTypes.end(), [&] (TypeData a, TypeData b) {
return (a.ratio > b.ratio);
});
double CPUlen = CPUload / nbCPU, GPUlen = GPUload / nbGPU;
int g = 0; int c = instance.nbTaskTypes - 1;
while(g != c) {
TypeData & d = taskTypes[g]; TypeData &e = taskTypes[c];
if(GPUlen + d.nbTasks * d.GPUtime <= CPUlen + e.nbTasks * e.CPUtime) {
GPUlen += d.nbTasks * d.GPUtime;
g++;
} else {
CPUlen += e.nbTasks * e.CPUtime;
c--;
}
}
TypeData &d = taskTypes[g];
double remCPU = d.nbTasks * d.CPUtime;
double remGPU = d.nbTasks * d.GPUtime;
double area;
if(remCPU + CPUlen <= GPUlen)
area = GPUlen ;
else if(remGPU + GPUlen <= CPUlen)
area = CPUlen;
else
area = (CPUlen*remGPU +remGPU * remCPU + GPUlen * remCPU) / (remCPU + remGPU);
// cout << "LB: " << area << " " << longest << " " << remCPU << " " << remGPU << " " << CPUlen << " " << GPUlen << " " << g << " " << c << endl;
/*for(auto &&t: taskTypes) {
cout << " " << t.nbTasks << " " << t.CPUtime << " " << t.GPUtime <<" " << t.ratio << endl;
} */
return max(area, longest);
return value + existingCPUload;
}
#include "IndepDP3Demi.h"
#include <iostream>
#include <limits>
#include <new>
#include <algorithm>
#include <cmath>
#include "util.h"
using namespace std;
// Constructor: forwards the options to IndepDualGeneric (which drives the
// dual-approximation dichotomy) and reads the optional "disc" parameter,
// overriding the default discretization constant declared in the header.
IndepDP3Demi::IndepDP3Demi(const AlgOptions& opt): IndepDualGeneric(opt) {
discretizationConstant = opt.asDouble("disc", discretizationConstant);
}
// Feasibility test for a dual-approximation guess `target`, using the
// dynamic program of APPROX-3/2 restricted to the non-moldable case.
//
// State: CPUload(i, l, mu, nu) = smallest CPU load achievable with the first
// i tasks of taskSet, where l is the (discretized) GPU load, mu the number
// of "large" tasks (CPU time > target/2) placed on CPUs and nu the number of
// large tasks (GPU time > target/2) placed on GPUs. A schedule of length
// <= target holds at most one such large task per machine, hence mu and nu
// are bounded by the number of CPUs resp. GPUs.
// GPU loads are discretized with a ratio of target / (disc * n), as in the
// paper.
//
// loads[0] / loads[1] are the existing CPU / GPU loads. When getResult is
// true, the chosen assignment is backtracked into `result`
// (result[0] = tasks on CPU, result[1] = tasks on GPU).
// Returns the minimum total CPU load (including loads[0]), or -1 if the
// guess is infeasible.
double IndepDP3Demi::tryGuess(Instance& instance, std::vector<int> taskSet, vector<double>& loads,
                              double target, IndepResult &result, bool getResult) {
    int nbCPU = instance.nbWorkers[0];
    int nbGPU = instance.nbWorkers[1];

    double existingCPUload = loads[0];
    double existingGPUload = loads[1];
    double maxGPUload = target * nbGPU - existingGPUload;
    if(maxGPUload < 0) maxGPUload = 1;

    double ratio = target / (discretizationConstant * taskSet.size());

    vector<int> discreteGPUtimings(instance.nbTaskTypes);
    for(int i = 0; i < instance.nbTaskTypes; i++)
        discreteGPUtimings[i] = ceil(instance.execTimes[1][i] / ratio);

    // Count the "large" tasks (> target/2) on each resource type, and reject
    // the guess right away if some task fits on neither resource. Doing this
    // before allocating the DP table avoids leaking it on the early return.
    int nbJobsWithLargeCPUTime = 0;
    int nbJobsWithLargeGPUTime = 0;
    for(int & t: taskSet) {
        int taskType = instance.taskTypes[t];
        if(instance.execTimes[0][taskType] > target && instance.execTimes[1][taskType] > target)
            return -1; // Task t cannot be placed on any resource.
        if(instance.execTimes[0][taskType] > (target / 2))
            ++ nbJobsWithLargeCPUTime;
        if(instance.execTimes[1][taskType] > (target / 2))
            ++ nbJobsWithLargeGPUTime;
    }

    const int N = ceil(maxGPUload / ratio);
    const int maxMu = min(nbCPU, nbJobsWithLargeCPUTime);
    const int maxNu = min(nbGPU, nbJobsWithLargeGPUTime);
    const int stateSpaceSize = (N+1) * (maxMu + 1) * (maxNu + 1);
    // One DP layer per task when the assignment must be backtracked, a
    // single layer updated in place otherwise.
    const int length = getResult ? taskSet.size() + 1 : 1;

    if(verbosity >= 7) {
        cerr << "DP3demi: N=" << N << " maxMu= " << maxMu << " maxNu = " << maxNu << endl;
        cerr << "DP3demi: allocating " << length << " space= " << stateSpaceSize << ", total= " << length * stateSpaceSize << endl;
    }

    // RAII storage for the DP table: released on every return path,
    // including the "infeasible" exits below.
    vector<double> storage((size_t) length * stateSpaceSize);
    vector<double*> CPUload(length);
    CPUload[0] = storage.data();
    for(int i = 1; i < length; i++)
        CPUload[i] = CPUload[i-1] + stateSpaceSize;

#define getTabValue(tab, l, m, k) (tab[l + (N+1)*m + (N+1)*(maxMu+1)*k])

    if(verbosity >= 7)
        cout << "IndepDP3Demi: maxGLoad = " << maxGPUload << ", ratio = " << ratio << " N= " << N << " gR: " << getResult << " mL " << target << endl;

    int index = 0;
    for(int i = 0; i < stateSpaceSize; ++i)
        CPUload[index][i] = 0;

    for(int t : taskSet) {
        const int taskType = instance.taskTypes[t];
        const int nextIndex = getResult ? index+1 : index;
        const double exec0 = instance.execTimes[0][taskType];
        const double exec1 = instance.execTimes[1][taskType];
        const int discreteGPUtime = discreteGPUtimings[taskType];
        // Placing this task consumes one large-task slot (mu on CPU, nu on
        // GPU) when its time there exceeds target / 2, consistently with the
        // counting of nbJobsWithLarge{CPU,GPU}Time above.
        const int muOffset = exec0 > (target / 2) ? 1 : 0;
        const int nuOffset = exec1 > (target / 2) ? 1 : 0;
        if((exec0 <= target) && (exec1 <= target)) { // Task t can be placed on both resources
            for(int mu = maxMu; mu >= muOffset; --mu) {
                for(int nu = maxNu; nu >= nuOffset; --nu) {
                    for(int l = N; l >= discreteGPUtime; --l) {
                        getTabValue(CPUload[nextIndex], l, mu, nu) = min(getTabValue(CPUload[index], l, mu-muOffset, nu) + exec0,
                                                                         getTabValue(CPUload[index], l - discreteGPUtime, mu, nu-nuOffset));
                    }
                    for(int l = discreteGPUtime - 1; l >= 0; --l) {
                        // Not enough GPU load allowance left: CPU only.
                        getTabValue(CPUload[nextIndex], l, mu, nu) = getTabValue(CPUload[index], l, mu-muOffset, nu) + exec0;
                    }
                }
                if(nuOffset) {
                    // nu == 0: no large-GPU slot left, the task must go on
                    // CPU. The previous layer is read at nu = 0 (placing on
                    // CPU does not change nu), matching the backtracking.
                    for(int l = N; l >= 0; --l) {
                        getTabValue(CPUload[nextIndex], l, mu, 0) = getTabValue(CPUload[index], l, mu-muOffset, 0) + exec0;
                    }
                }
            }
            if(muOffset) {
                // mu == 0: no large-CPU slot left, the task must go on GPU.
                for(int nu = maxNu; nu >= nuOffset; --nu) {
                    for(int l = N; l >= discreteGPUtime; --l) {
                        getTabValue(CPUload[nextIndex], l, 0, nu) = getTabValue(CPUload[index], l - discreteGPUtime, 0, nu-nuOffset);
                    }
                    for(int l = discreteGPUtime - 1; l >= 0; --l) {
                        getTabValue(CPUload[nextIndex], l, 0, nu) = std::numeric_limits<double>::infinity();
                    }
                }
                if(nuOffset) {
                    // Neither a large-CPU nor a large-GPU slot left: infeasible.
                    for(int l = N; l >= 0; --l) {
                        getTabValue(CPUload[nextIndex], l, 0, 0) = std::numeric_limits<double>::infinity();
                    }
                }
            }
        } else if ((exec0 <= target) && (exec1 > target)) { // Task t can only be placed on CPUs
            for(int mu = maxMu; mu >= muOffset; --mu) {
                for(int nu = maxNu; nu >= 0; --nu) {
                    for(int l = N; l >= 0; --l) {
                        getTabValue(CPUload[nextIndex], l, mu, nu) = getTabValue(CPUload[index], l, mu-muOffset, nu) + exec0;
                    }
                }
            }
            if(muOffset) {
                // CPU-only task but no large-CPU slot left: infeasible.
                for(int nu = maxNu; nu >= 0; --nu) {
                    for(int l = N; l >= 0; --l) {
                        getTabValue(CPUload[nextIndex], l, 0, nu) = std::numeric_limits<double>::infinity();
                    }
                }
            }
        } else /* ((exec0 > target) && (exec1 <= target)) */ { // Task t can only be placed on GPUs
            for(int mu = maxMu; mu >= 0; --mu) {
                for(int nu = maxNu; nu >= nuOffset; --nu) {
                    for(int l = N; l >= discreteGPUtime; --l) {
                        getTabValue(CPUload[nextIndex], l, mu, nu) = getTabValue(CPUload[index], l - discreteGPUtime, mu, nu-nuOffset);
                    }
                    for(int l = discreteGPUtime - 1; l >= 0; l--) {
                        getTabValue(CPUload[nextIndex], l, mu, nu) = std::numeric_limits<double>::infinity();
                    }
                }
                if(nuOffset) {
                    // GPU-only task but no large-GPU slot left: infeasible.
                    for(int l = N; l >= 0; l--) {
                        getTabValue(CPUload[nextIndex], l, mu, 0) = std::numeric_limits<double>::infinity();
                    }
                }
            }
        }
        index = nextIndex;
    }

    double value = getTabValue(CPUload[index], N, maxMu, maxNu);
    if(value == std::numeric_limits<double>::infinity()) {
        // Problem not feasible for this guess.
        return -1;
    }
    if(verbosity >= 7)
        cerr << "DP3demi: final value is " << value << endl;

    // Backtrack through the per-task layers to recover the assignment.
    int gLoad = N;
    int mu = maxMu;
    int nu = maxNu;
    if(getResult) {
        result[0].clear();
        result[1].clear();
        for(; index > 0; index--) {
            const int taskType = instance.taskTypes[taskSet[index-1]];
            const double exec0 = instance.execTimes[0][taskType];
            const double exec1 = instance.execTimes[1][taskType];
            const int discreteGPUtime = discreteGPUtimings[taskType];
            const int muOffset = exec0 > (target / 2) ? 1 : 0;
            const int nuOffset = exec1 > (target / 2) ? 1 : 0;
            // The mu >= muOffset guard prevents evaluating the table macro at
            // a negative mu index when no large-CPU slot remains (the task
            // was then necessarily placed on GPU).
            if(mu >= muOffset
               && getTabValue(CPUload[index], gLoad, mu, nu) == getTabValue(CPUload[index-1], gLoad, mu-muOffset, nu) + exec0) {
                mu -= muOffset;
                result[0].push_back(taskSet[index-1]);
            }
            else {
                gLoad -= discreteGPUtime;
                nu -= nuOffset;
                result[1].push_back(taskSet[index-1]);
            }
        }
    }
    return value + existingCPUload;
}