diff --git a/Addons/BenchEfficiency/bordeaux_0116.pdf b/Addons/BenchEfficiency/bordeaux_0116.pdf
deleted file mode 100644
index c7785df796322f7b6779f10deff6bef7ef3bb436..0000000000000000000000000000000000000000
Binary files a/Addons/BenchEfficiency/bordeaux_0116.pdf and /dev/null differ
diff --git a/Addons/BenchEfficiency/execAllHomogeneous.sh b/Addons/BenchEfficiency/execAllHomogeneous.sh
index 483add05b41f306b9bc971e6f943703284e1d028..a4d5899711d3699d8ae5d1d9d223bf67aacf659b 100644
--- a/Addons/BenchEfficiency/execAllHomogeneous.sh
+++ b/Addons/BenchEfficiency/execAllHomogeneous.sh
@@ -13,14 +13,14 @@ cpu=1
 STARPU_NCPUS=$cpu
 STARPU_NCUDA=0
 
-logoutput=`./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_SEQ`
+logoutput=`./Tests/Release/testBlockedUniformBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_SEQ`
 if [[ $VERBOSE ]] ; then
     echo $logoutput
 fi
-$SCALFMM_STARPU_DIR/bin/starpu_fxt_tool -i "/tmp/prof_file_"$USER"_0"
+$TUTORIAL_STARPU_DIR/bin/starpu_fxt_tool -i "/tmp/prof_file_"$USER"_0"
 rec_name="$SCALFMM_RES_DIR/trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_BS_CPU_SEQ-CPU_$cpu.rec"
 mv trace.rec $rec_name
-python $SCALFMM_STARPU_DIR/bin/starpu_trace_state_stats.py -t $rec_name > $rec_name.time
+python $TUTORIAL_STARPU_DIR/bin/starpu_trace_state_stats.py -t $rec_name > $rec_name.time
 
 
 for (( cpu=1 ; cpu<=$SCALFMM_MAX_NB_CPU ; cpu++)) ; do
@@ -29,13 +29,13 @@ for (( cpu=1 ; cpu<=$SCALFMM_MAX_NB_CPU ; cpu++)) ; do
     STARPU_NCPUS=$cpu
     STARPU_NCUDA=0
 
-    logoutput=`./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR`
+    logoutput=`./Tests/Release/testBlockedUniformBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR`
     if [[ $VERBOSE ]] ; then
         echo $logoutput
     fi
-    $SCALFMM_STARPU_DIR/bin/starpu_fxt_tool -i "/tmp/prof_file_"$USER"_0"
+    $TUTORIAL_STARPU_DIR/bin/starpu_fxt_tool -i "/tmp/prof_file_"$USER"_0"
     rec_name="$SCALFMM_RES_DIR/trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_BS_CPU_PAR-CPU_$cpu.rec"
     mv trace.rec $rec_name
-    python $SCALFMM_STARPU_DIR/bin/starpu_trace_state_stats.py -t $rec_name > $rec_name.time
+    python $TUTORIAL_STARPU_DIR/bin/starpu_trace_state_stats.py -t $rec_name > $rec_name.time
 done
 
diff --git a/Addons/BenchEfficiency/global-eff.data b/Addons/BenchEfficiency/global-eff.data
deleted file mode 100644
index b6c8743f9af5b92d32687ae620f4b810a3bdd682..0000000000000000000000000000000000000000
--- a/Addons/BenchEfficiency/global-eff.data
+++ /dev/null
@@ -1,25 +0,0 @@
-0 	granularity-eff 	tasks-eff 	runtime-eff 	pipeline-eff
-1 	1.000000e+00 	1.000000e+00 	1.000000e+00 	1.000000e+00
-2 	9.588832e-01 	9.588832e-01 	9.972215e-01 	9.999844e-01
-3 	9.984195e-01 	9.984195e-01 	9.992539e-01 	9.999840e-01
-4 	9.936055e-01 	9.936055e-01 	9.992505e-01 	9.999843e-01
-5 	9.859209e-01 	9.859209e-01 	9.991938e-01 	9.999840e-01
-6 	9.913540e-01 	9.913540e-01 	9.992224e-01 	9.999848e-01
-7 	9.980442e-01 	9.980442e-01 	9.993216e-01 	9.999841e-01
-8 	9.932070e-01 	9.932070e-01 	9.993356e-01 	9.999844e-01
-9 	9.953908e-01 	9.953908e-01 	9.993136e-01 	9.999852e-01
-10 	9.930517e-01 	9.930517e-01 	9.991280e-01 	9.999848e-01
-11 	9.937148e-01 	9.937148e-01 	9.992802e-01 	9.999838e-01
-12 	9.895039e-01 	9.895039e-01 	9.992958e-01 	9.999842e-01
-13 	9.934571e-01 	9.934571e-01 	9.992770e-01 	9.999845e-01
-14 	9.939346e-01 	9.939346e-01 	9.993242e-01 	9.999845e-01
-15 	9.929928e-01 	9.929928e-01 	9.993077e-01 	9.999849e-01
-16 	9.946804e-01 	9.946804e-01 	9.993051e-01 	9.999838e-01
-17 	9.959137e-01 	9.959137e-01 	9.992893e-01 	9.999839e-01
-18 	9.652375e-01 	9.652375e-01 	9.961152e-01 	9.999832e-01
-19 	9.937258e-01 	9.937258e-01 	9.992987e-01 	9.999845e-01
-20 	9.949256e-01 	9.949256e-01 	9.992757e-01 	9.999843e-01
-21 	9.886613e-01 	9.886613e-01 	9.992616e-01 	9.999838e-01
-22 	9.921982e-01 	9.921982e-01 	9.992499e-01 	9.999842e-01
-23 	1.001717e+00 	1.001717e+00 	9.992881e-01 	9.999846e-01
-24 	9.957642e-01 	9.957642e-01 	9.992461e-01 	9.999836e-01
diff --git a/Addons/BenchEfficiency/global-eff.png b/Addons/BenchEfficiency/global-eff.png
deleted file mode 100644
index fc24c14787e6bbc9276edf022a5f81b405a91268..0000000000000000000000000000000000000000
Binary files a/Addons/BenchEfficiency/global-eff.png and /dev/null differ
diff --git a/Addons/BenchEfficiency/mergetimefile.cpp b/Addons/BenchEfficiency/mergetimefile.cpp
index 8e781fea5cb538651c7c2fd5e89cd5ede55edaa4..6dcdc96f733833ebb39d516c18224030cfe9c570 100644
--- a/Addons/BenchEfficiency/mergetimefile.cpp
+++ b/Addons/BenchEfficiency/mergetimefile.cpp
@@ -68,7 +68,7 @@ struct LineData{
             }
         }
         if(words.size() != 4){
-            printf("Error line is no composed of 4 words\n");
+            printf("Error line is no composed of 4 words, has %zu for %s\n", words.size(), line); // %zu: words.size() is a size_t
             exit(111);
         }
         name = ReduceName(words[0].substr(1, words[0].size() - 2));
@@ -186,30 +186,32 @@ int main(int argc, char** argv){
         }
 
         while((sizeLine = getline((char**)&line, &sizeLine, timeFile)) != -1){
-            LineData dt(line);
-            // Task, Runtime, Other
-            if(dt.type == "Task"){
-                if(dt.name != "execute_on_all_wrapper"){
-                    timeTasks[idxFile][dt.name] += dt.duration;
-                    allTaskNames.insert(dt.name);
-                    times[idxFile].tt += dt.duration;
+            if(strncmp(line, "WARNING", 7) != 0){
+                LineData dt(line);
+                // Task, Runtime, Other
+                if(dt.type == "Task"){
+                    if(dt.name != "execute_on_all_wrapper"){
+                        timeTasks[idxFile][dt.name] += dt.duration;
+                        allTaskNames.insert(dt.name);
+                        times[idxFile].tt += dt.duration;
+                    }
                 }
-            }
-            else if(dt.type == "Runtime"){
-                if(dt.name == "Scheduling"
-                        || dt.name == "FetchingInput"
-                        || dt.name == "PushingOutput"){
-                    times[idxFile].tr += dt.duration;
+                else if(dt.type == "Runtime"){
+                    if(dt.name == "Scheduling"
+                            || dt.name == "FetchingInput"
+                            || dt.name == "PushingOutput"){
+                        times[idxFile].tr += dt.duration;
+                    }
                 }
-            }
-            else if(dt.type == "Other"){
-                if(dt.name == "Idle"){
-                    times[idxFile].ti += dt.duration;
+                else if(dt.type == "Other"){
+                    if(dt.name == "Idle"){
+                        times[idxFile].ti += dt.duration;
+                    }
+                }
+                else {
+                    printf("Arg do not know type %s\n", dt.type.c_str());
+                    //return 3;
                 }
-            }
-            else {
-                printf("Arg do not know type %s\n", dt.type.c_str());
-                return 3;
             }
         }
 
diff --git a/Addons/BenchEfficiency/par-bs-search.png b/Addons/BenchEfficiency/par-bs-search.png
deleted file mode 100644
index 4243937f718415f59047a70a48cb51391dcd223e..0000000000000000000000000000000000000000
Binary files a/Addons/BenchEfficiency/par-bs-search.png and /dev/null differ
diff --git a/Addons/BenchEfficiency/scalfmm.html b/Addons/BenchEfficiency/scalfmm.html
deleted file mode 100644
index 3ac064e14379adf390033d3a6632416598dcf5b0..0000000000000000000000000000000000000000
--- a/Addons/BenchEfficiency/scalfmm.html
+++ /dev/null
@@ -1,594 +0,0 @@
-<!DOCTYPE html>
-<html>
-<head>
-  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
-  <title>scalfmm.html</title>
-  <meta name="generator" content="Haroopad 0.13.1" />
-  <meta name="viewport" content="width=device-width, initial-scale=1.0">
-
-  <style>div.oembedall-githubrepos{border:1px solid #DDD;border-radius:4px;list-style-type:none;margin:0 0 10px;padding:8px 10px 0;font:13.34px/1.4 helvetica,arial,freesans,clean,sans-serif;width:452px;background-color:#fff}div.oembedall-githubrepos .oembedall-body{background:-moz-linear-gradient(center top,#FAFAFA,#EFEFEF);background:-webkit-gradient(linear,left top,left bottom,from(#FAFAFA),to(#EFEFEF));border-bottom-left-radius:4px;border-bottom-right-radius:4px;border-top:1px solid #EEE;margin-left:-10px;margin-top:8px;padding:5px 10px;width:100%}div.oembedall-githubrepos h3{font-size:14px;margin:0;padding-left:18px;white-space:nowrap}div.oembedall-githubrepos p.oembedall-description{color:#444;font-size:12px;margin:0 0 3px}div.oembedall-githubrepos p.oembedall-updated-at{color:#888;font-size:11px;margin:0}div.oembedall-githubrepos ul.oembedall-repo-stats{border:none;float:right;font-size:11px;font-weight:700;padding-left:15px;position:relative;z-index:5;margin:0}div.oembedall-githubrepos ul.oembedall-repo-stats li{border:none;color:#666;display:inline-block;list-style-type:none;margin:0!important}div.oembedall-githubrepos ul.oembedall-repo-stats li a{background-color:transparent;border:none;color:#666!important;background-position:5px -2px;background-repeat:no-repeat;border-left:1px solid #DDD;display:inline-block;height:21px;line-height:21px;padding:0 5px 0 23px}div.oembedall-githubrepos ul.oembedall-repo-stats li:first-child a{border-left:medium none;margin-right:-3px}div.oembedall-githubrepos ul.oembedall-repo-stats li a:hover{background:5px -27px no-repeat #4183C4;color:#FFF!important;text-decoration:none}div.oembedall-githubrepos ul.oembedall-repo-stats li:first-child a:hover{border-bottom-left-radius:3px;border-top-left-radius:3px}ul.oembedall-repo-stats li:last-child 
a:hover{border-bottom-right-radius:3px;border-top-right-radius:3px}span.oembedall-closehide{background-color:#aaa;border-radius:2px;cursor:pointer;margin-right:3px}div.oembedall-container{margin-top:5px;text-align:left}.oembedall-ljuser{font-weight:700}.oembedall-ljuser img{vertical-align:bottom;border:0;padding-right:1px}.oembedall-stoqembed{border-bottom:1px dotted #999;float:left;overflow:hidden;width:730px;line-height:1;background:#FFF;color:#000;font-family:Arial,Liberation Sans,DejaVu Sans,sans-serif;font-size:80%;text-align:left;margin:0;padding:0}.oembedall-stoqembed a{color:#07C;text-decoration:none;margin:0;padding:0}.oembedall-stoqembed a:hover{text-decoration:underline}.oembedall-stoqembed a:visited{color:#4A6B82}.oembedall-stoqembed h3{font-family:Trebuchet MS,Liberation Sans,DejaVu Sans,sans-serif;font-size:130%;font-weight:700;margin:0;padding:0}.oembedall-stoqembed .oembedall-reputation-score{color:#444;font-size:120%;font-weight:700;margin-right:2px}.oembedall-stoqembed .oembedall-user-info{height:35px;width:185px}.oembedall-stoqembed .oembedall-user-info .oembedall-user-gravatar32{float:left;height:32px;width:32px}.oembedall-stoqembed .oembedall-user-info .oembedall-user-details{float:left;margin-left:5px;overflow:hidden;white-space:nowrap;width:145px}.oembedall-stoqembed .oembedall-question-hyperlink{font-weight:700}.oembedall-stoqembed .oembedall-stats{background:#EEE;margin:0 0 0 7px;padding:4px 7px 6px;width:58px}.oembedall-stoqembed .oembedall-statscontainer{float:left;margin-right:8px;width:86px}.oembedall-stoqembed .oembedall-votes{color:#555;padding:0 0 7px;text-align:center}.oembedall-stoqembed .oembedall-vote-count-post{font-size:240%;color:#808185;display:block;font-weight:700}.oembedall-stoqembed .oembedall-views{color:#999;padding-top:4px;text-align:center}.oembedall-stoqembed .oembedall-status{margin-top:-3px;padding:4px 0;text-align:center;background:#75845C;color:#FFF}.oembedall-stoqembed .oembedall-status 
strong{color:#FFF;display:block;font-size:140%}.oembedall-stoqembed .oembedall-summary{float:left;width:635px}.oembedall-stoqembed .oembedall-excerpt{line-height:1.2;margin:0;padding:0 0 5px}.oembedall-stoqembed .oembedall-tags{float:left;line-height:18px}.oembedall-stoqembed .oembedall-tags a:hover{text-decoration:none}.oembedall-stoqembed .oembedall-post-tag{background-color:#E0EAF1;border-bottom:1px solid #3E6D8E;border-right:1px solid #7F9FB6;color:#3E6D8E;font-size:90%;line-height:2.4;margin:2px 2px 2px 0;padding:3px 4px;text-decoration:none;white-space:nowrap}.oembedall-stoqembed .oembedall-post-tag:hover{background-color:#3E6D8E;border-bottom:1px solid #37607D;border-right:1px solid #37607D;color:#E0EAF1}.oembedall-stoqembed .oembedall-fr{float:right}.oembedall-stoqembed .oembedall-statsarrow{background-image:url(http://cdn.sstatic.net/stackoverflow/img/sprites.png?v=3);background-repeat:no-repeat;overflow:hidden;background-position:0 -435px;float:right;height:13px;margin-top:12px;width:7px}.oembedall-facebook1{border:1px solid #1A3C6C;padding:0;font:13.34px/1.4 verdana;width:500px}.oembedall-facebook2{background-color:#627add}.oembedall-facebook2 a{color:#e8e8e8;text-decoration:none}.oembedall-facebookBody{background-color:#fff;vertical-align:top;padding:5px}.oembedall-facebookBody .contents{display:inline-block;width:100%}.oembedall-facebookBody div img{float:left;margin-right:5px}div.oembedall-lanyard{-webkit-box-shadow:none;-webkit-transition-delay:0s;-webkit-transition-duration:.4000000059604645s;-webkit-transition-property:width;-webkit-transition-timing-function:cubic-bezier(0.42,0,.58,1);background-attachment:scroll;background-clip:border-box;background-color:transparent;background-image:none;background-origin:padding-box;border-width:0;box-shadow:none;color:#112644;display:block;float:left;font-family:'Trebuchet MS',Trebuchet,sans-serif;font-size:16px;height:253px;line-height:19px;margin:0;max-width:none;min-height:0;outline:#112644 
0;overflow-x:visible;overflow-y:visible;padding:0;position:relative;text-align:left;vertical-align:baseline;width:804px}div.oembedall-lanyard .tagline{font-size:1.5em}div.oembedall-lanyard .wrapper{overflow:hidden;clear:both}div.oembedall-lanyard .split{float:left;display:inline}div.oembedall-lanyard .prominent-place .flag:active,div.oembedall-lanyard .prominent-place .flag:focus,div.oembedall-lanyard .prominent-place .flag:hover,div.oembedall-lanyard .prominent-place .flag:link,div.oembedall-lanyard .prominent-place .flag:visited{float:left;display:block;width:48px;height:48px;position:relative;top:-5px;margin-right:10px}div.oembedall-lanyard .place-context{font-size:.889em}div.oembedall-lanyard .prominent-place .sub-place{display:block}div.oembedall-lanyard .prominent-place{font-size:1.125em;line-height:1.1em;font-weight:400}div.oembedall-lanyard .main-date{color:#8CB4E0;font-weight:700;line-height:1.1}div.oembedall-lanyard .first{width:48.57%;margin:0 0 0 2.857%}.mermaid .label{color:#333}.node circle,.node polygon,.node rect{fill:#cde498;stroke:#13540c;stroke-width:1px}.edgePath .path{stroke:green;stroke-width:1.5px}.cluster rect{fill:#cdffb2;rx:40;stroke:#6eaa49;stroke-width:1px}.cluster text{fill:#333}.actor{stroke:#13540c;fill:#cde498}text.actor{fill:#000;stroke:none}.actor-line{stroke:grey}.messageLine0{stroke-width:1.5;stroke-dasharray:"2 2";marker-end:"url(#arrowhead)";stroke:#333}.messageLine1{stroke-width:1.5;stroke-dasharray:"2 2";stroke:#333}#arrowhead{fill:#333}#crosshead path{fill:#333!important;stroke:#333!important}.messageText{fill:#333;stroke:none}.labelBox{stroke:#326932;fill:#cde498}.labelText,.loopText{fill:#000;stroke:none}.loopLine{stroke-width:2;stroke-dasharray:"2 2";marker-end:"url(#arrowhead)";stroke:#326932}.note{stroke:#6eaa49;fill:#fff5ad}.noteText{fill:#000;stroke:none;font-family:'trebuchet 
ms',verdana,arial;font-size:14px}.section{stroke:none;opacity:.2}.section0,.section2{fill:#6eaa49}.section1,.section3{fill:#fff;opacity:.2}.sectionTitle0,.sectionTitle1,.sectionTitle2,.sectionTitle3{fill:#333}.sectionTitle{text-anchor:start;font-size:11px;text-height:14px}.grid .tick{stroke:lightgrey;opacity:.3;shape-rendering:crispEdges}.grid path{stroke-width:0}.today{fill:none;stroke:red;stroke-width:2px}.task{stroke-width:2}.taskText{text-anchor:middle;font-size:11px}.taskTextOutsideRight{fill:#000;text-anchor:start;font-size:11px}.taskTextOutsideLeft{fill:#000;text-anchor:end;font-size:11px}.taskText0,.taskText1,.taskText2,.taskText3{fill:#fff}.task0,.task1,.task2,.task3{fill:#487e3a;stroke:#13540c}.taskTextOutside0,.taskTextOutside1,.taskTextOutside2,.taskTextOutside3{fill:#000}.active0,.active1,.active2,.active3{fill:#cde498;stroke:#13540c}.activeText0,.activeText1,.activeText2,.activeText3{fill:#000!important}.done0,.done1,.done2,.done3{stroke:grey;fill:lightgrey;stroke-width:2}.doneText0,.doneText1,.doneText2,.doneText3{fill:#000!important}.crit0,.crit1,.crit2,.crit3{stroke:#f88;fill:red;stroke-width:2}.activeCrit0,.activeCrit1,.activeCrit2,.activeCrit3{stroke:#f88;fill:#cde498;stroke-width:2}.doneCrit0,.doneCrit1,.doneCrit2,.doneCrit3{stroke:#f88;fill:lightgrey;stroke-width:2;cursor:pointer;shape-rendering:crispEdges}.activeCritText0,.activeCritText1,.activeCritText2,.activeCritText3,.doneCritText0,.doneCritText1,.doneCritText2,.doneCritText3{fill:#000!important}.titleText{text-anchor:middle;font-size:18px;fill:#000}text{font-family:'trebuchet ms',verdana,arial;font-size:14px}html{height:100%}body{margin:0!important;padding:5px 20px 26px!important;background-color:#fff;font-family:"Lucida Grande","Segoe UI","Apple SD Gothic Neo","Malgun Gothic","Lucida Sans Unicode",Helvetica,Arial,sans-serif;font-size:.9em;overflow-x:hidden;overflow-y:auto}br,h1,h2,h3,h4,h5,h6{clear:both}hr.page{background:url() 
repeat-x;border:0;height:3px;padding:0}hr.underscore{border-top-style:dashed!important}body >:first-child{margin-top:0!important}img.plugin{box-shadow:0 1px 3px rgba(0,0,0,.1);border-radius:3px}iframe{border:0}figure{-webkit-margin-before:0;-webkit-margin-after:0;-webkit-margin-start:0;-webkit-margin-end:0}kbd{border:1px solid #aaa;-moz-border-radius:2px;-webkit-border-radius:2px;border-radius:2px;-moz-box-shadow:1px 2px 2px #ddd;-webkit-box-shadow:1px 2px 2px #ddd;box-shadow:1px 2px 2px #ddd;background-color:#f9f9f9;background-image:-moz-linear-gradient(top,#eee,#f9f9f9,#eee);background-image:-o-linear-gradient(top,#eee,#f9f9f9,#eee);background-image:-webkit-linear-gradient(top,#eee,#f9f9f9,#eee);background-image:linear-gradient(top,#eee,#f9f9f9,#eee);padding:1px 3px;font-family:inherit;font-size:.85em}.oembeded .oembed_photo{display:inline-block}img[data-echo]{margin:25px 0;width:100px;height:100px;background:url(../img/ajax.gif) center center no-repeat #fff}.spinner{display:inline-block;width:10px;height:10px;margin-bottom:-.1em;border:2px solid rgba(0,0,0,.5);border-top-color:transparent;border-radius:100%;-webkit-animation:spin 1s infinite linear;animation:spin 1s infinite linear}.spinner:after{content:'';display:block;width:0;height:0;position:absolute;top:-6px;left:0;border:4px solid transparent;border-bottom-color:rgba(0,0,0,.5);-webkit-transform:rotate(45deg);transform:rotate(45deg)}@-webkit-keyframes spin{to{-webkit-transform:rotate(360deg)}}@keyframes spin{to{transform:rotate(360deg)}}p.toc{margin:0!important}p.toc ul{padding-left:10px}p.toc>ul{padding:10px;margin:0 10px;display:inline-block;border:1px solid #ededed;border-radius:5px}p.toc li,p.toc ul{list-style-type:none}p.toc li{width:100%;padding:0;overflow:hidden}p.toc li a::after{content:"."}p.toc li a:before{content:"• "}p.toc h5{text-transform:uppercase}p.toc .title{float:left;padding-right:3px}p.toc 
.number{margin:0;float:right;padding-left:3px;background:#fff;display:none}input.task-list-item{margin-left:-1.62em}.markdown{font-family:"Hiragino Sans GB","Microsoft YaHei",STHeiti,SimSun,"Lucida Grande","Lucida Sans Unicode","Lucida Sans",'Segoe UI',AppleSDGothicNeo-Medium,'Malgun Gothic',Verdana,Tahoma,sans-serif;padding:20px}.markdown a{text-decoration:none;vertical-align:baseline}.markdown a:hover{text-decoration:underline}.markdown h1{font-size:2.2em;font-weight:700;margin:1.5em 0 1em}.markdown h2{font-size:1.8em;font-weight:700;margin:1.275em 0 .85em}.markdown h3{font-size:1.6em;font-weight:700;margin:1.125em 0 .75em}.markdown h4{font-size:1.4em;font-weight:700;margin:.99em 0 .66em}.markdown h5{font-size:1.2em;font-weight:700;margin:.855em 0 .57em}.markdown h6{font-size:1em;font-weight:700;margin:.75em 0 .5em}.markdown h1+p,.markdown h1:first-child,.markdown h2+p,.markdown h2:first-child,.markdown h3+p,.markdown h3:first-child,.markdown h4+p,.markdown h4:first-child,.markdown h5+p,.markdown h5:first-child,.markdown h6+p,.markdown h6:first-child{margin-top:0}.markdown hr{border:1px solid #ccc}.markdown p{margin:1em 0;word-wrap:break-word}.markdown ol{list-style-type:decimal}.markdown li{display:list-item;line-height:1.4em}.markdown blockquote{margin:1em 20px}.markdown blockquote>:first-child{margin-top:0}.markdown blockquote>:last-child{margin-bottom:0}.markdown blockquote cite:before{content:'\2014 \00A0'}.markdown .code{border-radius:3px;word-wrap:break-word}.markdown pre{border-radius:3px;word-wrap:break-word;border:1px solid #ccc;overflow:auto;padding:.5em}.markdown pre code{border:0;display:block}.markdown pre>code{font-family:Consolas,Inconsolata,Courier,monospace;font-weight:700;white-space:pre;margin:0}.markdown code{border-radius:3px;word-wrap:break-word;border:1px solid #ccc;padding:0 5px;margin:0 2px}.markdown img{max-width:100%}.markdown mark{color:#000;background-color:#fcf8e3}.markdown 
table{padding:0;border-collapse:collapse;border-spacing:0;margin-bottom:16px}.markdown table tr td,.markdown table tr th{border:1px solid #ccc;margin:0;padding:6px 13px}.markdown table tr th{font-weight:700}.markdown table tr th>:first-child{margin-top:0}.markdown table tr th>:last-child{margin-bottom:0}.markdown table tr td>:first-child{margin-top:0}.markdown table tr td>:last-child{margin-bottom:0}@import url(http://fonts.googleapis.com/css?family=Roboto+Condensed:300italic,400italic,700italic,400,300,700);.haroopad{padding:20px;color:#222;font-size:15px;font-family:"Roboto Condensed",Tauri,"Hiragino Sans GB","Microsoft YaHei",STHeiti,SimSun,"Lucida Grande","Lucida Sans Unicode","Lucida Sans",'Segoe UI',AppleSDGothicNeo-Medium,'Malgun Gothic',Verdana,Tahoma,sans-serif;background:#fff;line-height:1.6;-webkit-font-smoothing:antialiased}.haroopad a{color:#3269a0}.haroopad a:hover{color:#4183c4}.haroopad h2{border-bottom:1px solid #e6e6e6}.haroopad h6{color:#777}.haroopad hr{border:1px solid #e6e6e6}.haroopad blockquote>code,.haroopad h1>code,.haroopad h2>code,.haroopad h3>code,.haroopad h4>code,.haroopad h5>code,.haroopad h6>code,.haroopad li>code,.haroopad p>code,.haroopad td>code{font-family:Consolas,"Liberation Mono",Menlo,Courier,monospace;font-size:85%;background-color:rgba(0,0,0,.02);padding:.2em .5em;border:1px solid #efefef}.haroopad pre>code{font-size:1em;letter-spacing:-1px;font-weight:700}.haroopad blockquote{border-left:4px solid #e6e6e6;padding:0 15px;color:#777}.haroopad table{background-color:#fafafa}.haroopad table tr td,.haroopad table tr th{border:1px solid #e6e6e6}.haroopad table tr:nth-child(2n){background-color:#f2f2f2}.hljs{display:block;overflow-x:auto;padding:.5em;background:#282b2e;-webkit-text-size-adjust:none}.css .hljs-id,.hljs-change,.hljs-flow,.hljs-keyword,.hljs-literal,.hljs-winutils,.nginx .hljs-title,.tex .hljs-special{color:#93c763}.hljs-number{color:#ffcd22}.hljs{color:#e0e2e4}.css .hljs-pseudo,.css .hljs-tag{color:#d0d2b5}.hljs 
.hljs-constant,.hljs-attribute{color:#668bb0}.xml .hljs-attribute{color:#b3b689}.xml .hljs-tag .hljs-value{color:#e8e2b7}.hljs-class .hljs-title,.hljs-code,.hljs-header{color:#fff}.hljs-class,.hljs-hexcolor{color:#93c763}.hljs-regexp{color:#d39745}.hljs-at_rule,.hljs-at_rule .hljs-keyword{color:#a082bd}.hljs-doctype{color:#557182}.apache .hljs-cbracket,.apache .hljs-tag,.django .hljs-filter .hljs-argument,.django .hljs-template_tag,.django .hljs-variable,.hljs-addition,.hljs-attr_selector,.hljs-built_in,.hljs-bullet,.hljs-emphasis,.hljs-envvar,.hljs-javadoc,.hljs-link_url,.hljs-pragma,.hljs-preprocessor,.hljs-prompt,.hljs-pseudo,.hljs-stream,.hljs-subst,.hljs-tag,.hljs-tag .hljs-title,.hljs-type,.ruby .hljs-class .hljs-parent,.smalltalk .hljs-array,.smalltalk .hljs-class,.smalltalk .hljs-localvars,.tex .hljs-command{color:#8cbbad}.hljs-string{color:#ec7600}.apache .hljs-sqbracket,.hljs-annotation,.hljs-blockquote,.hljs-comment,.hljs-decorator,.hljs-deletion,.hljs-horizontal_rule,.hljs-pi,.hljs-shebang,.tex .hljs-formula{color:#818e96}.apache .hljs-tag,.bash .hljs-variable,.css .hljs-id,.diff .hljs-header,.hljs-at_rule .hljs-keyword,.hljs-chunk,.hljs-dartdoc,.hljs-header,.hljs-keyword,.hljs-literal,.hljs-phpdoc,.hljs-request,.hljs-status,.hljs-title,.hljs-type,.hljs-winutils,.rsl .hljs-built_in,.smalltalk .hljs-class,.tex .hljs-special,.vbscript .hljs-built_in{font-weight:700}.coffeescript .javascript,.javascript .xml,.tex .hljs-formula,.xml .css,.xml .hljs-cdata,.xml .javascript,.xml .vbscript{opacity:.5}.MathJax_Hover_Frame{border-radius:.25em;-webkit-border-radius:.25em;-moz-border-radius:.25em;-khtml-border-radius:.25em;box-shadow:0 0 15px #83A;-webkit-box-shadow:0 0 15px #83A;-moz-box-shadow:0 0 15px #83A;-khtml-box-shadow:0 0 15px #83A;border:1px solid #A6D!important;display:inline-block;position:absolute}.MathJax_Hover_Arrow{position:absolute;width:15px;height:11px;cursor:pointer}#MathJax_About{position:fixed;left:50%;width:auto;text-align:center;border:3px 
outset;padding:1em 2em;background-color:#DDD;color:#000;cursor:default;font-family:message-box;font-size:120%;font-style:normal;text-indent:0;text-transform:none;line-height:normal;letter-spacing:normal;word-spacing:normal;word-wrap:normal;white-space:nowrap;float:none;z-index:201;border-radius:15px;-webkit-border-radius:15px;-moz-border-radius:15px;-khtml-border-radius:15px;box-shadow:0 10px 20px gray;-webkit-box-shadow:0 10px 20px gray;-moz-box-shadow:0 10px 20px gray;-khtml-box-shadow:0 10px 20px gray;filter:progid:DXImageTransform.Microsoft.dropshadow(OffX=2, OffY=2, Color='gray', Positive='true')}.MathJax_Menu{position:absolute;background-color:#fff;color:#000;width:auto;padding:5px 0;border:1px solid #CCC;margin:0;cursor:default;font:menu;text-align:left;text-indent:0;text-transform:none;line-height:normal;letter-spacing:normal;word-spacing:normal;word-wrap:normal;white-space:nowrap;float:none;z-index:201;border-radius:5px;-webkit-border-radius:5px;-moz-border-radius:5px;-khtml-border-radius:5px;box-shadow:0 10px 20px gray;-webkit-box-shadow:0 10px 20px gray;-moz-box-shadow:0 10px 20px gray;-khtml-box-shadow:0 10px 20px gray;filter:progid:DXImageTransform.Microsoft.dropshadow(OffX=2, OffY=2, Color='gray', Positive='true')}.MathJax_MenuItem{padding:1px 2em;background:0 0}.MathJax_MenuArrow{position:absolute;right:.5em;color:#666}.MathJax_MenuActive .MathJax_MenuArrow{color:#fff}.MathJax_MenuArrow.RTL{left:.5em;right:auto}.MathJax_MenuCheck{position:absolute;left:.7em}.MathJax_MenuCheck.RTL{right:.7em;left:auto}.MathJax_MenuRadioCheck{position:absolute;left:.7em}.MathJax_MenuRadioCheck.RTL{right:.7em;left:auto}.MathJax_MenuLabel{padding:1px 2em 3px 1.33em;font-style:italic}.MathJax_MenuRule{border-top:1px solid #DDD;margin:4px 
3px}.MathJax_MenuDisabled{color:GrayText}.MathJax_MenuActive{background-color:#606872;color:#fff}.MathJax_Menu_Close{position:absolute;width:31px;height:31px;top:-15px;left:-15px}#MathJax_Zoom{position:absolute;background-color:#F0F0F0;overflow:auto;display:block;z-index:301;padding:.5em;border:1px solid #000;margin:0;font-weight:400;font-style:normal;text-align:left;text-indent:0;text-transform:none;line-height:normal;letter-spacing:normal;word-spacing:normal;word-wrap:normal;white-space:nowrap;float:none;box-shadow:5px 5px 15px #AAA;-webkit-box-shadow:5px 5px 15px #AAA;-moz-box-shadow:5px 5px 15px #AAA;-khtml-box-shadow:5px 5px 15px #AAA;filter:progid:DXImageTransform.Microsoft.dropshadow(OffX=2, OffY=2, Color='gray', Positive='true')}#MathJax_ZoomOverlay{position:absolute;left:0;top:0;z-index:300;display:inline-block;width:100%;height:100%;border:0;padding:0;margin:0;background-color:#fff;opacity:0;filter:alpha(opacity=0)}#MathJax_ZoomFrame{position:relative;display:inline-block;height:0;width:0}#MathJax_ZoomEventTrap{position:absolute;left:0;top:0;z-index:302;display:inline-block;border:0;padding:0;margin:0;background-color:#fff;opacity:0;filter:alpha(opacity=0)}.MathJax_Preview{color:#888}#MathJax_Message{position:fixed;left:1px;bottom:2px;background-color:#E6E6E6;border:1px solid #959595;margin:0;padding:2px 8px;z-index:102;color:#000;font-size:80%;width:auto;white-space:nowrap}#MathJax_MSIE_Frame{position:absolute;top:0;left:0;width:0;z-index:101;border:0;margin:0;padding:0}.MathJax_Error{color:#C00;font-style:italic}footer{position:fixed;font-size:.8em;text-align:right;bottom:0;margin-left:-25px;height:20px;width:100%}</style>
-</head>
-<body class="markdown haroopad">
-<h1 id="scalfmm-with-starpu+cuda"><a name="scalfmm-with-starpu+cuda" href="#scalfmm-with-starpu+cuda"></a>ScalFMM with StarPU+CUDA</h1><p>In this tutorial, we provide the commands to install ScalFMM and the needed tools in order to compute parallel efficiencies.<br>We first show how to obtain the homogeneous efficencies and then the heterogeneous ones (not done yet).</p><h2 id="installing-the-libraries"><a name="installing-the-libraries" href="#installing-the-libraries"></a>Installing the libraries</h2><p>For some installation steps, we provide a “valid-if” test which shows if the previous command has been done correctly or not.<br>In case of success <code>STEP-OK</code> will be print-out.<br>In addition, if a library is already installed on the system, it is possible to set the output variables directly and test with the “valid-if” command if it will work.</p><p>It is possible to follow these steps only to compile ScalFMM above StarPU and so we marked the installation of execution-trace tools as <strong>Optional</strong>.<br>However, we higly recommended to install them and to follow all the steps since they let have the efficiencies.<br>But if one wants to execute without any overhead, it might need to remove the usage of FXT.</p><h3 id="pre-requiste:"><a name="pre-requiste:" href="#pre-requiste:"></a>Pre-requiste:</h3><p>In order to follow this tutorial, it is needed to have the following applications installed:</p><ul>
-<li>autoconf (&gt;= 2.69)</li><li>gawk (Awk &gt;= 4.0.1)</li><li>make (&gt;= 3.81) </li><li>cmake (&gt;= 3.2.2)</li><li>gcc/g<ins> (&gt;= 4.9) and the gcc/g</ins> names should point to the correct binaries</li><li>BLAS/LAPACK (The configure of ScalFMM is different if the MKL is used or not, but with the MKL it is recommended to set environment variable <code>MKLROOT</code>)</li><li>CUDA (&gt;= 7) and <code>CUDA_PATH</code> must be set. In our case, <code>CUDA_PATH=/usr/local/cuda-7.5/</code></li><li><strong>Optional</strong> Vite (from <code>sudo apt-get install vite</code> or see <a href="http://vite.gforge.inria.fr/download.php"></a><a href="http://vite.gforge.inria.fr/download.php">http://vite.gforge.inria.fr/download.php</a>)</li><li><strong>Optional</strong> Qt5 library to be able to change the colors of the execution traces in order to visualize the different FMM operators</li><li>gnuplot to generate the figures</li></ul><blockquote>
-<p>[Remark] Some installations of CUDA does not have libcuda file.<br>In this case, one needs to create a link : <code>sudo ln /usr/local/cuda-7.5/lib64/libcudart.so /usr/local/cuda-7.5/lib64/libcuda.so</code></p>
-<p>[Plafrim-Developers] </p>
-<p>For those who use this tutorial on Plafrim (or a similar cluster), we provide extra informations.</p>
-<p>To allocate an heterogeneous node : <code>salloc -N 1 --time=03:00:00 --exclusive -p court_sirocco -CHaswell --gres=gpu:4 -x sirocco06</code></p>
-<p>Then, find it using <code>squeue</code> and access it by <code>ssh</code>.</p>
-<p>We have run this tutorial with the modules : <code>module load compiler/gcc/4.9.2 cuda75/toolkit/7.5.18 intel/mkl/64/11.2/2016.0.0 build/cmake/3.2.1</code></p>
-</blockquote><h3 id="working-directory"><a name="working-directory" href="#working-directory"></a>Working directory</h3><p>The variable <code>SCALFMM_TEST_DIR</code> is used to specify the working directory where all the tools are going to be installed:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>export SCALFMM_TEST_DIR=~/scalfmm_test   
-cd $SCALFMM_TEST_DIR
-</code></pre>"><span class="hljs-built_in">export</span> SCALFMM_TEST_DIR=~/scalfmm_<span class="hljs-built_in">test</span>   
-<span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span>
-</code></pre><p>In order to be able to stop the tutorial in the middle and restart later, we will register the variables in a file that should be source to restart later:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;># function scalfmmRegisterVariable() { echo &quot;export $1=${!1}&quot; &amp;gt;&amp;gt; &quot;$SCALFMM_TEST_DIR/environment.source&quot;; }
-echo &quot;function scalfmmRegisterVariable() { echo \&quot;export \$1=\${!1}\&quot; &amp;gt;&amp;gt; \&quot;$SCALFMM_TEST_DIR/environment.source\&quot;; }&quot; &amp;gt; &quot;$SCALFMM_TEST_DIR/environment.source&quot;
-source &quot;$SCALFMM_TEST_DIR/environment.source&quot;
-</code></pre>"><span class="hljs-comment"># function scalfmmRegisterVariable() { echo "export $1=${!1}" &gt;&gt; "$SCALFMM_TEST_DIR/environment.source"; }</span>
-<span class="hljs-built_in">echo</span> <span class="hljs-string">"function scalfmmRegisterVariable() { echo \"export \$1=\${!1}\" &gt;&gt; \"<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/environment.source\"; }"</span> &gt; <span class="hljs-string">"<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/environment.source"</span>
-<span class="hljs-built_in">source</span> <span class="hljs-string">"<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/environment.source"</span>
-</code></pre><p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_TEST_DIR</code></p><p>Valid-if</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>if [[ -n $SCALFMM_TEST_DIR ]] &amp;amp;&amp;amp; [[ -d $SCALFMM_TEST_DIR ]] ; then
-   echo “STEP-OK”
-fi
-</code></pre>"><span class="hljs-keyword">if</span> [[ -n <span class="hljs-variable">$SCALFMM_TEST_DIR</span> ]] &amp;&amp; [[ <span class="hljs-operator">-d</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span> ]] ; <span class="hljs-keyword">then</span>
-   <span class="hljs-built_in">echo</span> “STEP-OK”
-<span class="hljs-keyword">fi</span>
-</code></pre><ul>
-<li>Restarting the tutorial</li></ul><p>To restart the tutorial, one needs to re-define the working directory and to source the save file before to resume:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>export SCALFMM_TEST_DIR=~/scalfmm_test
-if [[ ! -d $SCALFMM_TEST_DIR ]] ; then
-    mkdir $SCALFMM_TEST_DIR
-else
-    source &quot;$SCALFMM_TEST_DIR/environment.source&quot;
-fi    
-cd $SCALFMM_TEST_DIR
-</code></pre>"><span class="hljs-built_in">export</span> SCALFMM_TEST_DIR=~/scalfmm_<span class="hljs-built_in">test</span>
-<span class="hljs-keyword">if</span> [[ ! <span class="hljs-operator">-d</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span> ]] ; <span class="hljs-keyword">then</span>
-    mkdir <span class="hljs-variable">$SCALFMM_TEST_DIR</span>
-<span class="hljs-keyword">else</span>
-    <span class="hljs-built_in">source</span> <span class="hljs-string">"<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/environment.source"</span>
-<span class="hljs-keyword">fi</span>    
-<span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span>
-</code></pre><h3 id="downloading-the-packages-(in-advance)"><a name="downloading-the-packages-(in-advance)" href="#downloading-the-packages-(in-advance)"></a>Downloading the Packages (in Advance)</h3><p>If the computational node does not have access to internet, we provide a command to download the needed packages (otherwise the next commands still include just in time download):</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>cd $SCALFMM_TEST_DIR
-wget https://www.open-mpi.org/software/hwloc/v1.11/downloads/hwloc-1.11.2.tar.gz
-wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.11.tar.gz # Optional
-wget http://www.fftw.org/fftw-3.3.4.tar.gz
-svn export svn://scm.gforge.inria.fr/svnroot/starpu/trunk starpu
-git clone --depth=1 https://scm.gforge.inria.fr/anonscm/git/scalfmm-public/scalfmm-public.git
-</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span>
-wget https://www.open-mpi.org/software/hwloc/v1.<span class="hljs-number">11</span>/downloads/hwloc-<span class="hljs-number">1.11</span>.<span class="hljs-number">2</span>.tar.gz
-wget http://download.savannah.gnu.org/releases/fkt/fxt-<span class="hljs-number">0.2</span>.<span class="hljs-number">11</span>.tar.gz <span class="hljs-comment"># Optional</span>
-wget http://www.fftw.org/fftw-<span class="hljs-number">3.3</span>.<span class="hljs-number">4</span>.tar.gz
-svn <span class="hljs-built_in">export</span> svn://scm.gforge.inria.fr/svnroot/starpu/trunk starpu
-git <span class="hljs-built_in">clone</span> --depth=<span class="hljs-number">1</span> https://scm.gforge.inria.fr/anonscm/git/scalfmm-public/scalfmm-public.git
-</code></pre><h3 id="hwloc"><a name="hwloc" href="#hwloc"></a>HWLOC</h3><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>cd $SCALFMM_TEST_DIR
-if [[ ! -f hwloc-1.11.2.tar.gz ]] ; then
-    wget https://www.open-mpi.org/software/hwloc/v1.11/downloads/hwloc-1.11.2.tar.gz
-fi
-tar xvf hwloc-1.11.2.tar.gz
-cd hwloc-1.11.2/
-export SCALFMM_HWLOC_DIR=$SCALFMM_TEST_DIR/hwlocinstall
-./configure --prefix=$SCALFMM_HWLOC_DIR
-make install
-</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span>
-<span class="hljs-keyword">if</span> [[ ! <span class="hljs-operator">-f</span> hwloc-<span class="hljs-number">1.11</span>.<span class="hljs-number">2</span>.tar.gz ]] ; <span class="hljs-keyword">then</span>
-    wget https://www.open-mpi.org/software/hwloc/v1.<span class="hljs-number">11</span>/downloads/hwloc-<span class="hljs-number">1.11</span>.<span class="hljs-number">2</span>.tar.gz
-<span class="hljs-keyword">fi</span>
-tar xvf hwloc-<span class="hljs-number">1.11</span>.<span class="hljs-number">2</span>.tar.gz
-<span class="hljs-built_in">cd</span> hwloc-<span class="hljs-number">1.11</span>.<span class="hljs-number">2</span>/
-<span class="hljs-built_in">export</span> SCALFMM_HWLOC_DIR=<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/hwlocinstall
-./configure --prefix=<span class="hljs-variable">$SCALFMM_HWLOC_DIR</span>
-make install
-</code></pre><p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_HWLOC_DIR</code></p><p>Valid-if:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>if [[ -n $SCALFMM_HWLOC_DIR ]] &amp;amp;&amp;amp; [[ -d $SCALFMM_HWLOC_DIR/lib/ ]] &amp;amp;&amp;amp; [[ -f  $SCALFMM_HWLOC_DIR/lib/libhwloc.so ]]; then
-   echo &quot;STEP-OK&quot;
-fi
-</code></pre>"><span class="hljs-keyword">if</span> [[ -n <span class="hljs-variable">$SCALFMM_HWLOC_DIR</span> ]] &amp;&amp; [[ <span class="hljs-operator">-d</span> <span class="hljs-variable">$SCALFMM_HWLOC_DIR</span>/lib/ ]] &amp;&amp; [[ <span class="hljs-operator">-f</span>  <span class="hljs-variable">$SCALFMM_HWLOC_DIR</span>/lib/libhwloc.so ]]; <span class="hljs-keyword">then</span>
-   <span class="hljs-built_in">echo</span> <span class="hljs-string">"STEP-OK"</span>
-<span class="hljs-keyword">fi</span>
-</code></pre><h3 id="fxt-(__optional__)"><a name="fxt-(__optional__)" href="#fxt-(__optional__)"></a>FXT (<strong>Optional</strong>)</h3><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>cd $SCALFMM_TEST_DIR
-if [[ ! -f fxt-0.2.11.tar.gz ]] ; then
-    wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.11.tar.gz
-fi
-tar xvf fxt-0.2.11.tar.gz
-cd fxt-0.2.11/
-export SCALFMM_FXT_DIR=$SCALFMM_TEST_DIR/fxtinstall
-./configure --prefix=$SCALFMM_FXT_DIR
-make install
-</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span>
-<span class="hljs-keyword">if</span> [[ ! <span class="hljs-operator">-f</span> fxt-<span class="hljs-number">0.2</span>.<span class="hljs-number">11</span>.tar.gz ]] ; <span class="hljs-keyword">then</span>
-    wget http://download.savannah.gnu.org/releases/fkt/fxt-<span class="hljs-number">0.2</span>.<span class="hljs-number">11</span>.tar.gz
-<span class="hljs-keyword">fi</span>
-tar xvf fxt-<span class="hljs-number">0.2</span>.<span class="hljs-number">11</span>.tar.gz
-<span class="hljs-built_in">cd</span> fxt-<span class="hljs-number">0.2</span>.<span class="hljs-number">11</span>/
-<span class="hljs-built_in">export</span> SCALFMM_FXT_DIR=<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/fxtinstall
-./configure --prefix=<span class="hljs-variable">$SCALFMM_FXT_DIR</span>
-make install
-</code></pre><p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_FXT_DIR</code></p><p>Valid-if:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>if [[ -n $SCALFMM_FXT_DIR ]] &amp;amp;&amp;amp; [[ -d $SCALFMM_FXT_DIR/lib/ ]] &amp;amp;&amp;amp; [[ -f  $SCALFMM_FXT_DIR/lib/libfxt.so ]]; then
-   echo &quot;STEP-OK&quot;
-fi
-</code></pre>"><span class="hljs-keyword">if</span> [[ -n <span class="hljs-variable">$SCALFMM_FXT_DIR</span> ]] &amp;&amp; [[ <span class="hljs-operator">-d</span> <span class="hljs-variable">$SCALFMM_FXT_DIR</span>/lib/ ]] &amp;&amp; [[ <span class="hljs-operator">-f</span>  <span class="hljs-variable">$SCALFMM_FXT_DIR</span>/lib/libfxt.so ]]; <span class="hljs-keyword">then</span>
-   <span class="hljs-built_in">echo</span> <span class="hljs-string">"STEP-OK"</span>
-<span class="hljs-keyword">fi</span>
-</code></pre><h3 id="fftw-(if-no-mkl-fft)"><a name="fftw-(if-no-mkl-fft)" href="#fftw-(if-no-mkl-fft)"></a>FFTW (If No MKL-FFT)</h3><p>Those who do not use the MKL FFT interface have to install FFTW (float/double):</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>cd $SCALFMM_TEST_DIR
-if [[ ! -f fftw-3.3.4.tar.gz ]] ; then
-    wget http://www.fftw.org/fftw-3.3.4.tar.gz
-fi    
-tar xvf fftw-3.3.4.tar.gz
-cd fftw-3.3.4/
-export SCALFMM_FFTW_DIR=$SCALFMM_TEST_DIR/fftinstall
-./configure --prefix=$SCALFMM_FFTW_DIR
-make install
-./configure --prefix=$SCALFMM_FFTW_DIR --enable-float
-make install
-</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span>
-<span class="hljs-keyword">if</span> [[ ! <span class="hljs-operator">-f</span> fftw-<span class="hljs-number">3.3</span>.<span class="hljs-number">4</span>.tar.gz ]] ; <span class="hljs-keyword">then</span>
-    wget http://www.fftw.org/fftw-<span class="hljs-number">3.3</span>.<span class="hljs-number">4</span>.tar.gz
-<span class="hljs-keyword">fi</span>    
-tar xvf fftw-<span class="hljs-number">3.3</span>.<span class="hljs-number">4</span>.tar.gz
-<span class="hljs-built_in">cd</span> fftw-<span class="hljs-number">3.3</span>.<span class="hljs-number">4</span>/
-<span class="hljs-built_in">export</span> SCALFMM_FFTW_DIR=<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/fftinstall
-./configure --prefix=<span class="hljs-variable">$SCALFMM_FFTW_DIR</span>
-make install
-./configure --prefix=<span class="hljs-variable">$SCALFMM_FFTW_DIR</span> --enable-float
-make install
-</code></pre><p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_FFTW_DIR</code></p><p>Valid-if:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>if [[ -n $SCALFMM_FFTW_DIR ]] &amp;amp;&amp;amp; [[ -d $SCALFMM_FFTW_DIR/lib/ ]] &amp;amp;&amp;amp; [[ -f  $SCALFMM_FFTW_DIR/lib/libfftw3.a ]] &amp;amp;&amp;amp; [[ -f  $SCALFMM_FFTW_DIR/lib/libfftw3f.a ]]; then
-   echo &quot;STEP-OK&quot;
-fi
-</code></pre>"><span class="hljs-keyword">if</span> [[ -n <span class="hljs-variable">$SCALFMM_FFTW_DIR</span> ]] &amp;&amp; [[ <span class="hljs-operator">-d</span> <span class="hljs-variable">$SCALFMM_FFTW_DIR</span>/lib/ ]] &amp;&amp; [[ <span class="hljs-operator">-f</span>  <span class="hljs-variable">$SCALFMM_FFTW_DIR</span>/lib/libfftw3.a ]] &amp;&amp; [[ <span class="hljs-operator">-f</span>  <span class="hljs-variable">$SCALFMM_FFTW_DIR</span>/lib/libfftw3f.a ]]; <span class="hljs-keyword">then</span>
-   <span class="hljs-built_in">echo</span> <span class="hljs-string">"STEP-OK"</span>
-<span class="hljs-keyword">fi</span>
-</code></pre><h3 id="starpu"><a name="starpu" href="#starpu"></a>StarPU</h3><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>cd $SCALFMM_TEST_DIR
-if [[ ! -d starpu ]] ; then
-    svn export svn://scm.gforge.inria.fr/svnroot/starpu/trunk starpu
-fi    
-cd starpu/
-export SCALFMM_STARPU_DIR=$SCALFMM_TEST_DIR/starpuinstall
-./autogen.sh
-./configure --prefix=$SCALFMM_STARPU_DIR --with-fxt=$SCALFMM_FXT_DIR --with-hwloc=$SCALFMM_HWLOC_DIR --with-cuda-dir=$CUDA_PATH --disable-opencl
-make install
-</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span>
-<span class="hljs-keyword">if</span> [[ ! <span class="hljs-operator">-d</span> starpu ]] ; <span class="hljs-keyword">then</span>
-    svn <span class="hljs-built_in">export</span> svn://scm.gforge.inria.fr/svnroot/starpu/trunk starpu
-<span class="hljs-keyword">fi</span>    
-<span class="hljs-built_in">cd</span> starpu/
-<span class="hljs-built_in">export</span> SCALFMM_STARPU_DIR=<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/starpuinstall
-./autogen.sh
-./configure --prefix=<span class="hljs-variable">$SCALFMM_STARPU_DIR</span> --with-fxt=<span class="hljs-variable">$SCALFMM_FXT_DIR</span> --with-hwloc=<span class="hljs-variable">$SCALFMM_HWLOC_DIR</span> --with-cuda-dir=<span class="hljs-variable">$CUDA_PATH</span> --disable-opencl
-make install
-</code></pre><blockquote>
-<p><strong>Optional</strong> In case you do not want to use trace (FXT) please remove the <code>--with-fxt=$SCALFMM_FXT_DIR</code> parameter from the command</p>
-</blockquote><p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_STARPU_DIR</code></p><p>Valid-if:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>if [[ -n $SCALFMM_STARPU_DIR ]] &amp;amp;&amp;amp; [[ -d $SCALFMM_STARPU_DIR/lib/ ]] &amp;amp;&amp;amp; [[ -f  $SCALFMM_STARPU_DIR/lib/libstarpu.so ]] ; then
-   echo &quot;STEP-OK&quot;
-fi
-</code></pre>"><span class="hljs-keyword">if</span> [[ -n <span class="hljs-variable">$SCALFMM_STARPU_DIR</span> ]] &amp;&amp; [[ <span class="hljs-operator">-d</span> <span class="hljs-variable">$SCALFMM_STARPU_DIR</span>/lib/ ]] &amp;&amp; [[ <span class="hljs-operator">-f</span>  <span class="hljs-variable">$SCALFMM_STARPU_DIR</span>/lib/libstarpu.so ]] ; <span class="hljs-keyword">then</span>
-   <span class="hljs-built_in">echo</span> <span class="hljs-string">"STEP-OK"</span>
-<span class="hljs-keyword">fi</span>
-</code></pre><h3 id="scalfmm"><a name="scalfmm" href="#scalfmm"></a>ScalFMM</h3><h4 id="configure"><a name="configure" href="#configure"></a>Configure</h4><ul>
-<li>Getting the source from the last commit:<pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>cd $SCALFMM_TEST_DIR
-if [[ ! -d scalfmm-public ]] ; then
-  git clone --depth=1 https://scm.gforge.inria.fr/anonscm/git/scalfmm-public/scalfmm-public.git
-fi    
-cd scalfmm-public/
-export SCALFMM_SOURCE_DIR=`pwd`
-cd Build/
-export SCALFMM_BUILD_DIR=`pwd`
-</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span>
-<span class="hljs-keyword">if</span> [[ ! <span class="hljs-operator">-d</span> scalfmm-public ]] ; <span class="hljs-keyword">then</span>
-  git <span class="hljs-built_in">clone</span> --depth=<span class="hljs-number">1</span> https://scm.gforge.inria.fr/anonscm/git/scalfmm-public/scalfmm-public.git
-<span class="hljs-keyword">fi</span>    
-<span class="hljs-built_in">cd</span> scalfmm-public/
-<span class="hljs-built_in">export</span> SCALFMM_SOURCE_DIR=`<span class="hljs-built_in">pwd</span>`
-<span class="hljs-built_in">cd</span> Build/
-<span class="hljs-built_in">export</span> SCALFMM_BUILD_DIR=`<span class="hljs-built_in">pwd</span>`
-</code></pre>
-</li></ul><p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_BUILD_DIR</code> <code>scalfmmRegisterVariable SCALFMM_SOURCE_DIR</code></p><ul>
-<li>Configure (No MKL):<pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \
-             -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \
-             -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=OFF \
-             -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \
-             -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \
-             -DHWLOC_DIR=$SCALFMM_HWLOC_DIR -DSTARPU_DIR=$SCALFMM_STARPU_DIR \
-             -DSCALFMM_USE_FFT=ON -DFFT_DIR=$SCALFMM_FFT_DIR
-</code></pre>">cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \
-             -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \
-             -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=OFF \
-             -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \
-             -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \
-             -DHWLOC_DIR=<span class="hljs-variable">$SCALFMM_HWLOC_DIR</span> -DSTARPU_DIR=<span class="hljs-variable">$SCALFMM_STARPU_DIR</span> \
-             -DSCALFMM_USE_FFT=ON -DFFT_DIR=<span class="hljs-variable">$SCALFMM_FFT_DIR</span>
-</code></pre>
-</li><li>Configure (MKL BLAS/LAPACK and FFTW):<pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \
-             -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \
-             -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=ON \
-             -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \
-             -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \
-             -DHWLOC_DIR=$SCALFMM_HWLOC_DIR -DSTARPU_DIR=$SCALFMM_STARPU_DIR \
-             -DSCALFMM_USE_FFT=ON -DFFT_DIR=$SCALFMM_FFT_DIR
-</code></pre>">cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \
-             -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \
-             -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=ON \
-             -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \
-             -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \
-             -DHWLOC_DIR=<span class="hljs-variable">$SCALFMM_HWLOC_DIR</span> -DSTARPU_DIR=<span class="hljs-variable">$SCALFMM_STARPU_DIR</span> \
-             -DSCALFMM_USE_FFT=ON -DFFT_DIR=<span class="hljs-variable">$SCALFMM_FFT_DIR</span>
-</code></pre>
-</li><li>Configure (MKL BLAS/LAPACK/FFT and No FFTW):</li></ul><blockquote>
-<p>[Plafrim-Developers] Should use that one</p>
-</blockquote><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \
-               -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \
-               -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=ON \
-               -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \
-               -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \
-               -DHWLOC_DIR=$SCALFMM_HWLOC_DIR -DSTARPU_DIR=$SCALFMM_STARPU_DIR \
-               -DSCALFMM_USE_FFT=ON -DSCALFMM_USE_MKL_AS_FFTW=ON
-</code></pre>">cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \
-               -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \
-               -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=ON \
-               -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \
-               -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \
-               -DHWLOC_DIR=<span class="hljs-variable">$SCALFMM_HWLOC_DIR</span> -DSTARPU_DIR=<span class="hljs-variable">$SCALFMM_STARPU_DIR</span> \
-               -DSCALFMM_USE_FFT=ON -DSCALFMM_USE_MKL_AS_FFTW=ON
-</code></pre><p>Valid-if:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>cmake .. ; if [[ &quot;$?&quot; == &quot;0&quot; ]] ; then echo &quot;STEP-OK&quot; ; fi
-</code></pre>">cmake .. ; <span class="hljs-keyword">if</span> [[ <span class="hljs-string">"$?"</span> == <span class="hljs-string">"0"</span> ]] ; <span class="hljs-keyword">then</span> <span class="hljs-built_in">echo</span> <span class="hljs-string">"STEP-OK"</span> ; <span class="hljs-keyword">fi</span>
-</code></pre><h4 id="build"><a name="build" href="#build"></a>Build</h4><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>cd $SCALFMM_BUILD_DIR
-make testBlockedUnifCudaBench
-</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_BUILD_DIR</span>
-make <span class="hljs-built_in">test</span>BlockedUnifCudaBench
-</code></pre><p>Valid-if:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>ls ./Tests/Release/testBlockedUnifCudaBench ; if [[ &quot;$?&quot; == &quot;0&quot; ]] ; then echo &quot;STEP-OK&quot; ; fi
-</code></pre>">ls ./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench ; <span class="hljs-keyword">if</span> [[ <span class="hljs-string">"$?"</span> == <span class="hljs-string">"0"</span> ]] ; <span class="hljs-keyword">then</span> <span class="hljs-built_in">echo</span> <span class="hljs-string">"STEP-OK"</span> ; <span class="hljs-keyword">fi</span>
-</code></pre><h4 id="first-execution"><a name="first-execution" href="#first-execution"></a>First Execution</h4><p>In this section we compute a simulation and look at the resulting trace.<br>ScalFMM binary parameters and descriptions:</p><ul>
-<li>Passing <code>--help</code> as a parameter prints the possible/valid parameters</li><li>Simulation properties are chosen by :<ul>
-<li><code>-h</code> : height of the tree</li><li><code>-bs</code> : granularity/size of the group</li><li><code>-nb</code> : number of particles generated</li></ul>
-</li><li>Execution properties are chosen by the StarPU environment variables :<ul>
-<li><code>STARPU_NCPUS</code> : the number of CPU workers</li><li><code>STARPU_NCUDA</code> : the number of GPU workers (for heterogeneous binary)</li></ul>
-</li><li>By default the application will not compare the FMM interactions against the direct method (which is N^2) and so it is recommended to avoid the validation for large test cases. But to get the accuracy one must pass the parameter <code>-validation</code></li><li><code>-p2p-m2l-cuda-only</code> : to compute the P2P and the M2L only on GPU (the rest on the CPU)</li></ul><p>Examples:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>export STARPU_NCPUS=12
-export STARPU_NCUDA=2
-./Tests/Release/testBlockedUnifCudaBench -nb 30000000 -h 7 -bs 800
-</code></pre>"><span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-number">12</span>
-<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">2</span>
-./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench -nb <span class="hljs-number">30000000</span> -h <span class="hljs-number">7</span> -bs <span class="hljs-number">800</span>
-</code></pre><p>Last part of the output should be:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>    Start FGroupTaskStarPUAlgorithm
-         directPass in 0.0406482s
-             inblock  in 0.000780428s
-             outblock in 0.0398674s
-         bottomPass in 0.00586269s
-         upwardPass in 0.00265723s
-         transferPass in 0.00323571s
-             inblock in  0.000124817s
-             outblock in 0.00298331s
-         downardPass in 0.00257975s
-         transferPass in 0.0652285s
-             inblock in  0.00164774s
-             outblock in 0.0635799s
-         L2P in 0.0115733s
-         Submitting the tasks took 0.139101s
-         Moving data to the host took 0.0578765s
-@EXEC TIME = 14.6321s
-</code></pre>">    Start FGroupTaskStarPUAlgorithm
-         directPass <span class="hljs-keyword">in</span> <span class="hljs-number">0.0406482</span>s
-             inblock  <span class="hljs-keyword">in</span> <span class="hljs-number">0.000780428</span>s
-             outblock <span class="hljs-keyword">in</span> <span class="hljs-number">0.0398674</span>s
-         bottomPass <span class="hljs-keyword">in</span> <span class="hljs-number">0.00586269</span>s
-         upwardPass <span class="hljs-keyword">in</span> <span class="hljs-number">0.00265723</span>s
-         transferPass <span class="hljs-keyword">in</span> <span class="hljs-number">0.00323571</span>s
-             inblock <span class="hljs-keyword">in</span>  <span class="hljs-number">0.000124817</span>s
-             outblock <span class="hljs-keyword">in</span> <span class="hljs-number">0.00298331</span>s
-         downardPass <span class="hljs-keyword">in</span> <span class="hljs-number">0.00257975</span>s
-         transferPass <span class="hljs-keyword">in</span> <span class="hljs-number">0.0652285</span>s
-             inblock <span class="hljs-keyword">in</span>  <span class="hljs-number">0.00164774</span>s
-             outblock <span class="hljs-keyword">in</span> <span class="hljs-number">0.0635799</span>s
-         L2P <span class="hljs-keyword">in</span> <span class="hljs-number">0.0115733</span>s
-         Submitting the tasks took <span class="hljs-number">0.139101</span>s
-         Moving data to the host took <span class="hljs-number">0.0578765</span>s
-@EXEC TIME = <span class="hljs-number">14.6321</span>s
-</code></pre><ul>
-<li>Visualize the execution trace (<strong>Optional</strong>)</li></ul><p>Convert the fxt file</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>$SCALFMM_STARPU_DIR/bin/starpu_fxt_tool -i &quot;/tmp/prof_file_&quot;$USER&quot;_0&quot;
-</code></pre>"><span class="hljs-variable">$SCALFMM_STARPU_DIR</span>/bin/starpu_fxt_tool -i <span class="hljs-string">"/tmp/prof_file_"</span><span class="hljs-variable">$USER</span><span class="hljs-string">"_0"</span>
-</code></pre><p>Then visualize the output with <code>vite</code> (maybe by copying the paje.trace file locally)</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>vite ./paje.trace
-</code></pre>">vite ./paje.trace
-</code></pre><p>Should be like:<br><img src="/home/bbramas/Projects/ScalfmmGit/scalfmm/Addons/BenchEfficiency/trace-example.png" alt="Trace"></p><p>We can change the colors of the trace with (requires the Qt5 library):</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>$SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/pajecolor paje.trace $SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/paintmodel.fmm.colors
-vite ./paje.trace.painted
-</code></pre>"><span class="hljs-variable">$SCALFMM_SOURCE_DIR</span>/Addons/BenchEfficiency/pajecolor paje.trace <span class="hljs-variable">$SCALFMM_SOURCE_DIR</span>/Addons/BenchEfficiency/paintmodel.fmm.colors
-vite ./paje.trace.painted
-</code></pre><p>Should be like:<br><img src="/home/bbramas/Projects/ScalfmmGit/scalfmm/Addons/BenchEfficiency/trace-example-colors.png" alt="Trace"></p><ul>
-<li>Get execution times</li></ul><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>python $SCALFMM_STARPU_DIR/bin/starpu_trace_state_stats.py -t trace.rec
-</code></pre>">python <span class="hljs-variable">$SCALFMM_STARPU_DIR</span>/bin/starpu_trace_state_stats.py -t trace.rec
-</code></pre><p>Should give something like:</p><pre><code data-origin="<pre><code>&quot;Name&quot;,&quot;Count&quot;,&quot;Type&quot;,&quot;Duration&quot;
-&quot;Initializing&quot;,14,&quot;Runtime&quot;,7153.096196
-&quot;Overhead&quot;,57010,&quot;Runtime&quot;,376.473463
-&quot;Idle&quot;,14355,&quot;Other&quot;,12.815899
-&quot;Scheduling&quot;,28441,&quot;Runtime&quot;,238.367394
-&quot;Sleeping&quot;,610,&quot;Other&quot;,13786.513208
-&quot;FetchingInput&quot;,14341,&quot;Runtime&quot;,13918.805814
-&quot;execute_on_all_wrapper&quot;,30,&quot;Task&quot;,21.288802
-&quot;Executing&quot;,414,&quot;Runtime&quot;,26852.864578
-&quot;PushingOutput&quot;,14341,&quot;Runtime&quot;,284.96123
-&quot;P2P-out&quot;,3846,&quot;Task&quot;,60378.266619
-&quot;Callback&quot;,13559,&quot;Runtime&quot;,4.210633
-&quot;P2P&quot;,328,&quot;Task&quot;,15383.426991
-&quot;M2L-level-5&quot;,41,&quot;Task&quot;,2354.702554
-&quot;M2L-level-6&quot;,328,&quot;Task&quot;,18349.915495
-&quot;Deinitializing&quot;,14,&quot;Runtime&quot;,109.87483
-&quot;M2L-level-4&quot;,6,&quot;Task&quot;,275.088295
-&quot;P2M&quot;,328,&quot;Task&quot;,11312.022842
-&quot;M2M-level-5&quot;,328,&quot;Task&quot;,829.9055
-&quot;M2M-level-4&quot;,41,&quot;Task&quot;,93.130498
-&quot;M2L-out-level-5&quot;,638,&quot;Task&quot;,1914.900053
-&quot;M2M-level-3&quot;,6,&quot;Task&quot;,11.053067
-&quot;M2M-level-2&quot;,1,&quot;Task&quot;,1.363157
-&quot;M2L-out-level-4&quot;,22,&quot;Task&quot;,159.580457
-&quot;L2L-level-4&quot;,41,&quot;Task&quot;,84.554065
-&quot;L2L-level-5&quot;,328,&quot;Task&quot;,1087.717767
-&quot;M2L-out-level-6&quot;,7692,&quot;Task&quot;,18322.518045
-&quot;L2P&quot;,328,&quot;Task&quot;,27146.256793
-&quot;M2L-level-2&quot;,1,&quot;Task&quot;,2.661235
-&quot;L2L-level-3&quot;,6,&quot;Task&quot;,11.346978
-&quot;M2L-level-3&quot;,1,&quot;Task&quot;,47.612555
-&quot;L2L-level-2&quot;,1,&quot;Task&quot;,1.471873
-</code></pre>">"Name","Count","Type","Duration"
-"Initializing",14,"Runtime",7153.096196
-"Overhead",57010,"Runtime",376.473463
-"Idle",14355,"Other",12.815899
-"Scheduling",28441,"Runtime",238.367394
-"Sleeping",610,"Other",13786.513208
-"FetchingInput",14341,"Runtime",13918.805814
-"execute_on_all_wrapper",30,"Task",21.288802
-"Executing",414,"Runtime",26852.864578
-"PushingOutput",14341,"Runtime",284.96123
-"P2P-out",3846,"Task",60378.266619
-"Callback",13559,"Runtime",4.210633
-"P2P",328,"Task",15383.426991
-"M2L-level-5",41,"Task",2354.702554
-"M2L-level-6",328,"Task",18349.915495
-"Deinitializing",14,"Runtime",109.87483
-"M2L-level-4",6,"Task",275.088295
-"P2M",328,"Task",11312.022842
-"M2M-level-5",328,"Task",829.9055
-"M2M-level-4",41,"Task",93.130498
-"M2L-out-level-5",638,"Task",1914.900053
-"M2M-level-3",6,"Task",11.053067
-"M2M-level-2",1,"Task",1.363157
-"M2L-out-level-4",22,"Task",159.580457
-"L2L-level-4",41,"Task",84.554065
-"L2L-level-5",328,"Task",1087.717767
-"M2L-out-level-6",7692,"Task",18322.518045
-"L2P",328,"Task",27146.256793
-"M2L-level-2",1,"Task",2.661235
-"L2L-level-3",6,"Task",11.346978
-"M2L-level-3",1,"Task",47.612555
-"L2L-level-2",1,"Task",1.471873
-</code></pre><p>Most of the scripts are in the addon directory</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>export SCALFMM_AB=$SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/
-</code></pre>"><span class="hljs-built_in">export</span> SCALFMM_AB=<span class="hljs-variable">$SCALFMM_SOURCE_DIR</span>/Addons/BenchEfficiency/
-</code></pre><p><em>Output variable:</em> <code>scalfmmRegisterVariable SCALFMM_AB</code></p><h2 id="homogeneous-efficiencies"><a name="homogeneous-efficiencies" href="#homogeneous-efficiencies"></a>Homogeneous Efficiencies</h2><p>Here we compute the efficiencies for a given test case on CPU only.</p><p>Go in the build dir and create output dir</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>cd $SCALFMM_BUILD_DIR
-export SCALFMM_RES_DIR=$SCALFMM_BUILD_DIR/homogeneous
-mkdir $SCALFMM_RES_DIR
-</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_BUILD_DIR</span>
-<span class="hljs-built_in">export</span> SCALFMM_RES_DIR=<span class="hljs-variable">$SCALFMM_BUILD_DIR</span>/homogeneous
-mkdir <span class="hljs-variable">$SCALFMM_RES_DIR</span>
-</code></pre><p><em>Output variable:</em> <code>scalfmmRegisterVariable SCALFMM_RES_DIR</code> </p><p>Set up the configuration variables:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>export SCALFMM_NB=10000000
-export SCALFMM_H=7
-export SCALFMM_MIN_BS=100
-export SCALFMM_MAX_BS=10000
-export SCALFMM_MAX_NB_CPU=24
-</code></pre>"><span class="hljs-built_in">export</span> SCALFMM_NB=<span class="hljs-number">10000000</span>
-<span class="hljs-built_in">export</span> SCALFMM_H=<span class="hljs-number">7</span>
-<span class="hljs-built_in">export</span> SCALFMM_MIN_BS=<span class="hljs-number">100</span>
-<span class="hljs-built_in">export</span> SCALFMM_MAX_BS=<span class="hljs-number">10000</span>
-<span class="hljs-built_in">export</span> SCALFMM_MAX_NB_CPU=<span class="hljs-number">24</span>
-</code></pre><p>Find best granularity in sequential and in parallel:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>export STARPU_NCPUS=1
-export STARPU_NCUDA=0
-export SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh &quot;./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs&quot; $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmmExtractKey.sh &quot;@BEST BS&quot; `
-if [[ `which gnuplot | wc -l` == &quot;1&quot; ]] ;  then
-    gnuplot -e &quot;filename='seq-bs-search'&quot; $SCALFMM_AB/scalfmmFindBs.gplot
-fi
-
-export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU
-export STARPU_NCUDA=0
-export SCALFMM_BS_CPU_PAR=`$SCALFMM_AB/scalfmmFindBs.sh &quot;./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs&quot; $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key &quot;@BEST BS&quot; `
-if [[ `which gnuplot | wc -l` == &quot;1&quot; ]] ;  then
-    gnuplot -e &quot;filename='par-bs-search'&quot; $SCALFMM_AB/scalfmmFindBs.gplot
-fi
-</code></pre>"><span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-number">1</span>
-<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span>
-<span class="hljs-built_in">export</span> SCALFMM_BS_CPU_SEQ=`<span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.sh <span class="hljs-string">"./Tests/Release/testBlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs"</span> <span class="hljs-variable">$SCALFMM_MIN_BS</span> <span class="hljs-variable">$SCALFMM_MAX_BS</span> | <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmExtractKey.sh <span class="hljs-string">"@BEST BS"</span> `
-<span class="hljs-keyword">if</span> [[ `<span class="hljs-built_in">which</span> gnuplot | wc <span class="hljs-operator">-l</span>` == <span class="hljs-string">"1"</span> ]] ;  <span class="hljs-keyword">then</span>
-    gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='seq-bs-search'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.gplot
-<span class="hljs-keyword">fi</span>
-
-<span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-variable">$SCALFMM_MAX_NB_CPU</span>
-<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span>
-<span class="hljs-built_in">export</span> SCALFMM_BS_CPU_PAR=`<span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.sh <span class="hljs-string">"./Tests/Release/testBlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs"</span> <span class="hljs-variable">$SCALFMM_MIN_BS</span> <span class="hljs-variable">$SCALFMM_MAX_BS</span> | <span class="hljs-variable">$SCALFMM_AB</span>/scalfmm_extract_key <span class="hljs-string">"@BEST BS"</span> `
-<span class="hljs-keyword">if</span> [[ `<span class="hljs-built_in">which</span> gnuplot | wc <span class="hljs-operator">-l</span>` == <span class="hljs-string">"1"</span> ]] ;  <span class="hljs-keyword">then</span>
-    gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='par-bs-search'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.gplot
-<span class="hljs-keyword">fi</span>
-</code></pre><p>In our case we get 9710  and 5385.</p><p><em>Output variable:</em> <code>scalfmmRegisterVariable SCALFMM_BS_CPU_SEQ</code>  <code>scalfmmRegisterVariable SCALFMM_BS_CPU_PAR</code></p><p>We can look at the work that has been done to find the best granularity:<br><img src="/home/bbramas/Projects/ScalfmmGit/scalfmm/Addons/BenchEfficiency/seq-bs-search.png" alt="In sequential"><br><img src="/home/bbramas/Projects/ScalfmmGit/scalfmm/Addons/BenchEfficiency/par-bs-search.png" alt="In parallel"></p><p>Then we compute the efficiency using both granularities and keep the .rec files:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>export SCALFMM_MAX_NB_CPU=24
-export STARPU_NCUDA=0
-source &quot;$SCALFMM_AB/execAllHomogeneous.sh&quot;
-</code></pre>"><span class="hljs-built_in">export</span> SCALFMM_MAX_NB_CPU=<span class="hljs-number">24</span>
-<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span>
-<span class="hljs-built_in">source</span> <span class="hljs-string">"<span class="hljs-variable">$SCALFMM_AB</span>/execAllHomogeneous.sh"</span>
-</code></pre><p>We should end with all the .rec files and their corresponding time files and <code>ls "$SCALFMM_RES_DIR"</code> should return something like:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>trace-nb_10000000-h_7-bs_5385-CPU_10.rec       trace-nb_10000000-h_7-bs_5385-CPU_16.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_22.rec       trace-nb_10000000-h_7-bs_5385-CPU_5.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_10.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_17.rec       trace-nb_10000000-h_7-bs_5385-CPU_22.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_6.rec
-trace-nb_10000000-h_7-bs_5385-CPU_11.rec       trace-nb_10000000-h_7-bs_5385-CPU_17.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_23.rec       trace-nb_10000000-h_7-bs_5385-CPU_6.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_11.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_18.rec       trace-nb_10000000-h_7-bs_5385-CPU_23.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_7.rec
-trace-nb_10000000-h_7-bs_5385-CPU_12.rec       trace-nb_10000000-h_7-bs_5385-CPU_18.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_24.rec       trace-nb_10000000-h_7-bs_5385-CPU_7.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_12.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_19.rec       trace-nb_10000000-h_7-bs_5385-CPU_24.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_8.rec
-trace-nb_10000000-h_7-bs_5385-CPU_13.rec       trace-nb_10000000-h_7-bs_5385-CPU_19.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_2.rec        trace-nb_10000000-h_7-bs_5385-CPU_8.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_13.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_1.rec        trace-nb_10000000-h_7-bs_5385-CPU_2.rec.time   trace-nb_10000000-h_7-bs_5385-CPU_9.rec
-trace-nb_10000000-h_7-bs_5385-CPU_14.rec       trace-nb_10000000-h_7-bs_5385-CPU_1.rec.time   trace-nb_10000000-h_7-bs_5385-CPU_3.rec        trace-nb_10000000-h_7-bs_5385-CPU_9.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_14.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_20.rec       trace-nb_10000000-h_7-bs_5385-CPU_3.rec.time   trace-nb_10000000-h_7-bs_9710-CPU_1.rec
-trace-nb_10000000-h_7-bs_5385-CPU_15.rec       trace-nb_10000000-h_7-bs_5385-CPU_20.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_4.rec        trace-nb_10000000-h_7-bs_9710-CPU_1.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_15.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_21.rec       trace-nb_10000000-h_7-bs_5385-CPU_4.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_16.rec       trace-nb_10000000-h_7-bs_5385-CPU_21.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_5.rec
-</code></pre>">trace-nb_10000000-h_7-bs_5385-CPU_10.rec       trace-nb_10000000-h_7-bs_5385-CPU_16.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_22.rec       trace-nb_10000000-h_7-bs_5385-CPU_5.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_10.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_17.rec       trace-nb_10000000-h_7-bs_5385-CPU_22.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_6.rec
-trace-nb_10000000-h_7-bs_5385-CPU_11.rec       trace-nb_10000000-h_7-bs_5385-CPU_17.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_23.rec       trace-nb_10000000-h_7-bs_5385-CPU_6.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_11.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_18.rec       trace-nb_10000000-h_7-bs_5385-CPU_23.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_7.rec
-trace-nb_10000000-h_7-bs_5385-CPU_12.rec       trace-nb_10000000-h_7-bs_5385-CPU_18.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_24.rec       trace-nb_10000000-h_7-bs_5385-CPU_7.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_12.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_19.rec       trace-nb_10000000-h_7-bs_5385-CPU_24.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_8.rec
-trace-nb_10000000-h_7-bs_5385-CPU_13.rec       trace-nb_10000000-h_7-bs_5385-CPU_19.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_2.rec        trace-nb_10000000-h_7-bs_5385-CPU_8.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_13.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_1.rec        trace-nb_10000000-h_7-bs_5385-CPU_2.rec.time   trace-nb_10000000-h_7-bs_5385-CPU_9.rec
-trace-nb_10000000-h_7-bs_5385-CPU_14.rec       trace-nb_10000000-h_7-bs_5385-CPU_1.rec.time   trace-nb_10000000-h_7-bs_5385-CPU_3.rec        trace-nb_10000000-h_7-bs_5385-CPU_9.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_14.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_20.rec       trace-nb_10000000-h_7-bs_5385-CPU_3.rec.time   trace-nb_10000000-h_7-bs_9710-CPU_1.rec
-trace-nb_10000000-h_7-bs_5385-CPU_15.rec       trace-nb_10000000-h_7-bs_5385-CPU_20.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_4.rec        trace-nb_10000000-h_7-bs_9710-CPU_1.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_15.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_21.rec       trace-nb_10000000-h_7-bs_5385-CPU_4.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_16.rec       trace-nb_10000000-h_7-bs_5385-CPU_21.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_5.rec
-</code></pre><p>We then compute the efficiencies from these files</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>g++ -std=c++11 $SCALFMM_AB/mergetimefile.cpp -o $SCALFMM_AB/mergetimefile.exe
-$SCALFMM_AB/mergetimefile.exe \
-        &quot;$SCALFMM_RES_DIR/trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_BS_CPU_SEQ-CPU_1.rec.time&quot; \
-        &quot;$SCALFMM_RES_DIR/trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_BS_CPU_PAR-CPU_%d.rec.time&quot;\
-         $SCALFMM_MAX_NB_CPU
-</code></pre>">g++ -std=c++<span class="hljs-number">11</span> <span class="hljs-variable">$SCALFMM_AB</span>/mergetimefile.cpp -o <span class="hljs-variable">$SCALFMM_AB</span>/mergetimefile.exe
-<span class="hljs-variable">$SCALFMM_AB</span>/mergetimefile.exe \
-        <span class="hljs-string">"<span class="hljs-variable">$SCALFMM_RES_DIR</span>/trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_BS_CPU_SEQ</span>-CPU_1.rec.time"</span> \
-        <span class="hljs-string">"<span class="hljs-variable">$SCALFMM_RES_DIR</span>/trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_BS_CPU_PAR</span>-CPU_%d.rec.time"</span>\
-         <span class="hljs-variable">$SCALFMM_MAX_NB_CPU</span>
-</code></pre><p>We end-up with the global efficiencies (for the application) but also for the different operators.</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>Create global-eff.data
-Create task-eff.data
-Create task-gr-eff.dat
-</code></pre>">Create global-eff.data
-Create task-eff.data
-Create task-gr-eff.dat
-</code></pre><p>We can plot each of them</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>gnuplot -e &quot;filename='global-eff'&quot; $SCALFMM_AB/scalfmmPlotAll.gplot
-gnuplot -e &quot;filename='task-eff'&quot; $SCALFMM_AB/scalfmmPlotAll.gplot
-gnuplot -e &quot;filename='task-gr-eff'&quot; $SCALFMM_AB/scalfmmPlotAll.gplot
-</code></pre>">gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='global-eff'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmPlotAll.gplot
-gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='task-eff'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmPlotAll.gplot
-gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='task-gr-eff'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmPlotAll.gplot
-</code></pre><p>In our case it gives:<br><img src="/home/bbramas/Projects/ScalfmmGit/scalfmm/Addons/BenchEfficiency/global-eff.png" alt="global-eff"><br><img src="/home/bbramas/Projects/ScalfmmGit/scalfmm/Addons/BenchEfficiency/task-eff.png" alt="task-eff"><br><img src="/home/bbramas/Projects/ScalfmmGit/scalfmm/Addons/BenchEfficiency/task-gr-eff.png" alt="task-gr-eff"></p><h2 id="heterogeneous"><a name="heterogeneous" href="#heterogeneous"></a>Heterogeneous</h2><p><strong>NOT FINISHED!!!!</strong></p><p>For test case <code>-nb 10000000</code> (10 million) and <code>-h 6</code> (height of the tree equal to 6),<br>we first want to know the best granularity <code>-bs</code>.</p><p>This parameter will certainly not be the same for sequential/parallel/heterogenous configurations.</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>export SCALFMM_NB=10000000
-export SCALFMM_H=7
-export SCALFMM_MIN_BS=100
-export SCALFMM_MAX_BS=3000
-export SCALFMM_MAX_NB_CPU=24
-export SCALFMM_MAX_NB_GPU=4
-</code></pre>"><span class="hljs-built_in">export</span> SCALFMM_NB=<span class="hljs-number">10000000</span>
-<span class="hljs-built_in">export</span> SCALFMM_H=<span class="hljs-number">7</span>
-<span class="hljs-built_in">export</span> SCALFMM_MIN_BS=<span class="hljs-number">100</span>
-<span class="hljs-built_in">export</span> SCALFMM_MAX_BS=<span class="hljs-number">3000</span>
-<span class="hljs-built_in">export</span> SCALFMM_MAX_NB_CPU=<span class="hljs-number">24</span>
-<span class="hljs-built_in">export</span> SCALFMM_MAX_NB_GPU=<span class="hljs-number">4</span>
-</code></pre><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>export STARPU_NCPUS=1
-export STARPU_NCUDA=0
-export SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key &quot;@BEST BS&quot; `
-if [[ `which gnuplot | wc -l` == &quot;1&quot; ]] ;  then
-    gnuplot -e &quot;filename='seq-bs-search'&quot; $SCALFMM_AB/scalfmmFindBs.gplot
-fi
-
-export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU
-export STARPU_NCUDA=0
-export SCALFMM_BS_CPU_PAR=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key &quot;@BEST BS&quot; `
-if [[ `which gnuplot | wc -l` == &quot;1&quot; ]] ;  then
-    gnuplot -e &quot;filename='par-bs-search'&quot; $SCALFMM_AB/scalfmmFindBs.gplot
-fi
-
-export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU
-export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU
-export SCALFMM_BS_CPU_GPU=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key &quot;@BEST BS&quot; `
-if [[ `which gnuplot | wc -l` == &quot;1&quot; ]] ;  then
-    gnuplot -e &quot;filename='cpugpu-bs-search'&quot; $SCALFMM_AB/scalfmmFindBs.gplot
-fi
-</code></pre>"><span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-number">1</span>
-<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span>
-<span class="hljs-built_in">export</span> SCALFMM_BS_CPU_SEQ=`<span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.sh -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> <span class="hljs-variable">$SCALFMM_MIN_BS</span> <span class="hljs-variable">$SCALFMM_MAX_BS</span> | <span class="hljs-variable">$SCALFMM_AB</span>/scalfmm_extract_key <span class="hljs-string">"@BEST BS"</span> `
-<span class="hljs-keyword">if</span> [[ `<span class="hljs-built_in">which</span> gnuplot | wc <span class="hljs-operator">-l</span>` == <span class="hljs-string">"1"</span> ]] ;  <span class="hljs-keyword">then</span>
-    gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='seq-bs-search'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.gplot
-<span class="hljs-keyword">fi</span>
-
-<span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-variable">$SCALFMM_MAX_NB_CPU</span>
-<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span>
-<span class="hljs-built_in">export</span> SCALFMM_BS_CPU_PAR=`<span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.sh -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> <span class="hljs-variable">$SCALFMM_MIN_BS</span> <span class="hljs-variable">$SCALFMM_MAX_BS</span> | <span class="hljs-variable">$SCALFMM_AB</span>/scalfmm_extract_key <span class="hljs-string">"@BEST BS"</span> `
-<span class="hljs-keyword">if</span> [[ `<span class="hljs-built_in">which</span> gnuplot | wc <span class="hljs-operator">-l</span>` == <span class="hljs-string">"1"</span> ]] ;  <span class="hljs-keyword">then</span>
-    gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='par-bs-search'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.gplot
-<span class="hljs-keyword">fi</span>
-
-<span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-variable">$SCALFMM_MAX_NB_CPU</span>
-<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-variable">$SCALFMM_MAX_NB_GPU</span>
-<span class="hljs-built_in">export</span> SCALFMM_BS_CPU_GPU=`<span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.sh -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> <span class="hljs-variable">$SCALFMM_MIN_BS</span> <span class="hljs-variable">$SCALFMM_MAX_BS</span> | <span class="hljs-variable">$SCALFMM_AB</span>/scalfmm_extract_key <span class="hljs-string">"@BEST BS"</span> `
-<span class="hljs-keyword">if</span> [[ `<span class="hljs-built_in">which</span> gnuplot | wc <span class="hljs-operator">-l</span>` == <span class="hljs-string">"1"</span> ]] ;  <span class="hljs-keyword">then</span>
-    gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='cpugpu-bs-search'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.gplot
-<span class="hljs-keyword">fi</span>
-</code></pre><p>Then, we can execute three best configurations, and keep .rec for each of them:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>export STARPU_NCPUS=1
-export STARPU_NCUDA=0
-./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_CPU_SEQ
-export SCALFMM_SEQ_REC=&quot;trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec&quot;
-mv trace.rec $SCALFMM_SEQ_REC
-
-export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU
-export STARPU_NCUDA=0
-./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR
-export SCALFMM_PAR_REC=&quot;trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec&quot;
-mv trace.rec $SCALFMM_PAR_REC
-
-export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU
-export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU
-./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU
-export SCALFMM_PAR_CPU_GPU_REC=&quot;trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec&quot;
-mv trace.rec $SCALFMM_PAR_CPU_GPU_REC
-</code></pre>"><span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-number">1</span>
-<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span>
-./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs <span class="hljs-variable">$SCALFMM_CPU_SEQ</span>
-<span class="hljs-built_in">export</span> SCALFMM_SEQ_REC=<span class="hljs-string">"trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_CPU_SEQ</span>-CPU_<span class="hljs-variable">$STARPU_NCPUS</span>-GPU_<span class="hljs-variable">$STARPU_NCUDA</span>.rec"</span>
-mv trace.rec <span class="hljs-variable">$SCALFMM_SEQ_REC</span>
-
-<span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-variable">$SCALFMM_MAX_NB_CPU</span>
-<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span>
-./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs <span class="hljs-variable">$SCALFMM_BS_CPU_PAR</span>
-<span class="hljs-built_in">export</span> SCALFMM_PAR_REC=<span class="hljs-string">"trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_CPU_SEQ</span>-CPU_<span class="hljs-variable">$STARPU_NCPUS</span>-GPU_<span class="hljs-variable">$STARPU_NCUDA</span>.rec"</span>
-mv trace.rec <span class="hljs-variable">$SCALFMM_PAR_REC</span>
-
-<span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-variable">$SCALFMM_MAX_NB_CPU</span>
-<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-variable">$SCALFMM_MAX_NB_GPU</span>
-./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs <span class="hljs-variable">$SCALFMM_BS_CPU_GPU</span>
-<span class="hljs-built_in">export</span> SCALFMM_PAR_CPU_GPU_REC=<span class="hljs-string">"trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_CPU_SEQ</span>-CPU_<span class="hljs-variable">$STARPU_NCPUS</span>-GPU_<span class="hljs-variable">$STARPU_NCUDA</span>.rec"</span>
-mv trace.rec <span class="hljs-variable">$SCALFMM_PAR_CPU_GPU_REC</span>
-</code></pre><p>And we also want the GPU tasks only on GPU</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU
-export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU
-./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU -p2p-m2l-cuda-only
-export SCALFMM_PAR_GPU_REC=&quot;trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA-GPUONLY.rec&quot;
-mv trace.rec $SCALFMM_PAR_GPU_REC
-</code></pre>"><span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-variable">$SCALFMM_MAX_NB_CPU</span>
-<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-variable">$SCALFMM_MAX_NB_GPU</span>
-./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs <span class="hljs-variable">$SCALFMM_BS_CPU_GPU</span> -p2p-m2l-cuda-only
-<span class="hljs-built_in">export</span> SCALFMM_PAR_GPU_REC=<span class="hljs-string">"trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_CPU_SEQ</span>-CPU_<span class="hljs-variable">$STARPU_NCPUS</span>-GPU_<span class="hljs-variable">$STARPU_NCUDA</span>-GPUONLY.rec"</span>
-mv trace.rec <span class="hljs-variable">$SCALFMM_PAR_GPU_REC</span>
-</code></pre><p>And we want the sequential version with parallel granularity:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>export STARPU_NCPUS=1
-export STARPU_NCUDA=0
-
-./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR
-SCALFMM_SEQ_CPU_BS_REC=&quot;trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec&quot;
-mv trace.rec $SCALFMM_SEQ_CPU_BS_REC
-
-./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU
-SCALFMM_SEQ_GPU_BS_REC=&quot;trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec&quot;
-mv trace.rec $SCALFMM_SEQ_GPU_BS_REC
-</code></pre>"><span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-number">1</span>
-<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span>
-
-./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs <span class="hljs-variable">$SCALFMM_BS_CPU_PAR</span>
-SCALFMM_SEQ_CPU_BS_REC=<span class="hljs-string">"trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_CPU_SEQ</span>-CPU_<span class="hljs-variable">$STARPU_NCPUS</span>-GPU_<span class="hljs-variable">$STARPU_NCUDA</span>.rec"</span>
-mv trace.rec <span class="hljs-variable">$SCALFMM_SEQ_CPU_BS_REC</span>
-
-./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs <span class="hljs-variable">$SCALFMM_BS_CPU_GPU</span>
-SCALFMM_SEQ_GPU_BS_REC=<span class="hljs-string">"trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_CPU_SEQ</span>-CPU_<span class="hljs-variable">$STARPU_NCPUS</span>-GPU_<span class="hljs-variable">$STARPU_NCUDA</span>.rec"</span>
-mv trace.rec <span class="hljs-variable">$SCALFMM_SEQ_GPU_BS_REC</span>
-</code></pre><p>From these files, we are able to get the different efficencies.</p><h2 id="post-processing-and-plot"><a name="post-processing-and-plot" href="#post-processing-and-plot"></a>Post-processing and Plot</h2><p>From the file:</p><ul>
-<li><code>$SCALFMM_SEQ_REC</code> : the resulting file from the sequential execution with best sequential granularity</li><li><code>$SCALFMM_PAR_REC</code> : the resulting file from a parallel execution (no GPU) with best parallel granularity</li><li><code>$SCALFMM_PAR_CPU_GPU_REC</code> : the resulting file from a parallel execution (hybrid) with best parallel-hybrid granularity</li><li><code>$SCALFMM_PAR_GPU_REC</code> : the resulting file with all possible tasks on GPU with best parallel-hybrid granularity</li><li><code>$SCALFMM_SEQ_CPU_BS_REC</code> : the resulting file from sequential execution with best parallel granularity</li><li><code>$SCALFMM_SEQ_GPU_BS_REC</code> : the resulting file from sequential execution with best parallel-hybrid granularity</li></ul><p>Getting all the efficency<br>Solving the linear programming problem</p><p>Plotting the results</p><h2 id="automatization"><a name="automatization" href="#automatization"></a>Automatization</h2><pre class="bash hljs"><code class="bash" data-origin="<pre><code class=&quot;bash&quot;>SCALFMM_NB=10000000
-SCALFMM_H=7
-SCALFMM_MIN_BS=100
-SCALFMM_MAX_BS=3000
-SCALFMM_MAX_NB_CPU=24
-SCALFMM_MAX_NB_GPU=4
-
-scalfmm_generate_efficiency -nb $SCALFMM_NB -h $SCALFMM_H -start $SCALFMM_MIN_BS -end $SCALFMM_MAX_BS
-</code></pre>">SCALFMM_NB=<span class="hljs-number">10000000</span>
-SCALFMM_H=<span class="hljs-number">7</span>
-SCALFMM_MIN_BS=<span class="hljs-number">100</span>
-SCALFMM_MAX_BS=<span class="hljs-number">3000</span>
-SCALFMM_MAX_NB_CPU=<span class="hljs-number">24</span>
-SCALFMM_MAX_NB_GPU=<span class="hljs-number">4</span>
-
-scalfmm_generate_efficiency -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -start <span class="hljs-variable">$SCALFMM_MIN_BS</span> -end <span class="hljs-variable">$SCALFMM_MAX_BS</span>
-</code></pre>
-
-<footer style="position:fixed; font-size:.8em; text-align:right; bottom:0px; margin-left:-25px; height:20px; width:100%;">generated by <a href="http://pad.haroopress.com" target="_blank">haroopad</a></footer>
-</body>
-</html>
diff --git a/Addons/BenchEfficiency/scalfmm.md b/Addons/BenchEfficiency/scalfmm.md
deleted file mode 100644
index 0cf00f00e179fba2cfd4d579d4aa97712394876d..0000000000000000000000000000000000000000
--- a/Addons/BenchEfficiency/scalfmm.md
+++ /dev/null
@@ -1,585 +0,0 @@
-ScalFMM with StarPU+CUDA
-========================
-
-In this tutorial, we provide the commands to install ScalFMM and the needed tools in order to compute parallel efficiencies.
-We first show how to obtain the homogeneous efficiencies and then the heterogeneous ones (not done yet).
-
-## Installing the libraries
-
-For some installation steps, we provide a "valid-if" test which shows if the previous command has been done correctly or not.
-In case of success, `STEP-OK` will be printed.
-In addition, if a library is already installed on the system, it is possible to set the output variables directly and test with the "valid-if" command if it will work.
-
-It is possible to follow these steps only to compile ScalFMM on top of StarPU, which is why the installation of the execution-trace tools is marked as __Optional__.
-However, we highly recommend installing them and following all the steps, since they make it possible to obtain the efficiencies.
-Conversely, to execute without any tracing overhead, one may need to remove the usage of FXT.
-
-### Prerequisites:
-To follow this tutorial, the following applications must be installed:
-
-* autoconf (>= 2.69)
-* gawk (Awk >= 4.0.1)
-* make (>= 3.81) 
-* cmake (>= 3.2.2)
-* gcc/g++ (>= 4.9) and the gcc/g++ names should point to the correct binaries
-* BLAS/LAPACK (The configure of ScalFMM is different if the MKL is used or not, but with the MKL it is recommended to set environment variable `MKLROOT`)
-* CUDA (>= 7) and `CUDA_PATH` must be set. In our case, `CUDA_PATH=/usr/local/cuda-7.5/`
-* __Optional__ Vite (from `sudo apt-get install vite` or see [http://vite.gforge.inria.fr/download.php](http://vite.gforge.inria.fr/download.php))
-*  __Optional__ Qt5 library to be able to change the colors of the execution traces in order to visualize the different FMM operators
-* gnuplot to generate the figures
-
-> [Remark] Some installations of CUDA do not provide a libcuda file.
-> In this case, one needs to create a link : `sudo ln /usr/local/cuda-7.5/lib64/libcudart.so /usr/local/cuda-7.5/lib64/libcuda.so`
-
-> [Plafrim-Developers] 
->
-> For those who use this tutorial on Plafrim (or a similar cluster), we provide extra information.
->
-> To allocate a heterogeneous node : `salloc -N 1 --time=03:00:00 --exclusive -p court_sirocco -CHaswell --gres=gpu:4 -x sirocco06`
-> 
-> Then, find it using `squeue` and access it by `ssh`.
->
-> We have run this tutorial with the modules : `module load compiler/gcc/4.9.2 cuda75/toolkit/7.5.18 intel/mkl/64/11.2/2016.0.0 build/cmake/3.2.1`
-
-### Working directory
-
-The variable `SCALFMM_TEST_DIR` is used to specify the working directory where all the tools are going to be installed:
-```bash
-export SCALFMM_TEST_DIR=~/scalfmm_test   
-cd $SCALFMM_TEST_DIR
-```
-
-In order to be able to stop the tutorial in the middle and restart later, we will register the variables in a file that should be sourced when restarting:
-```bash
-# function scalfmmRegisterVariable() { echo "export $1=${!1}" >> "$SCALFMM_TEST_DIR/environment.source"; }
-echo "function scalfmmRegisterVariable() { echo \"export \$1=\${!1}\" >> \"$SCALFMM_TEST_DIR/environment.source\"; }" > "$SCALFMM_TEST_DIR/environment.source"
-source "$SCALFMM_TEST_DIR/environment.source"
-```
-
-*Output variables:* `scalfmmRegisterVariable SCALFMM_TEST_DIR`
-
-Valid-if
-```bash
-if [[ -n $SCALFMM_TEST_DIR ]] && [[ -d $SCALFMM_TEST_DIR ]] ; then
-   echo "STEP-OK"
-fi
-```
-
-- Restarting the tutorial
-
-To restart the tutorial, one needs to re-define the working directory and to source the saved file before resuming:
-```bash
-export SCALFMM_TEST_DIR=~/scalfmm_test
-if [[ ! -d $SCALFMM_TEST_DIR ]] ; then
-	mkdir $SCALFMM_TEST_DIR
-else
-	source "$SCALFMM_TEST_DIR/environment.source"
-fi    
-cd $SCALFMM_TEST_DIR
-```
-
-### Downloading the Packages (in Advance)
-
-If the computational node does not have access to the internet, the following commands download the needed packages in advance (otherwise, the commands of the next sections still download them just in time):
-```bash
-cd $SCALFMM_TEST_DIR
-wget https://www.open-mpi.org/software/hwloc/v1.11/downloads/hwloc-1.11.2.tar.gz
-wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.11.tar.gz # Optional
-wget http://www.fftw.org/fftw-3.3.4.tar.gz
-svn export svn://scm.gforge.inria.fr/svnroot/starpu/trunk starpu
-git clone --depth=1 https://scm.gforge.inria.fr/anonscm/git/scalfmm-public/scalfmm-public.git
-```
-
-### HWLOC
-```bash
-cd $SCALFMM_TEST_DIR
-if [[ ! -f hwloc-1.11.2.tar.gz ]] ; then
-    wget https://www.open-mpi.org/software/hwloc/v1.11/downloads/hwloc-1.11.2.tar.gz
-fi
-tar xvf hwloc-1.11.2.tar.gz
-cd hwloc-1.11.2/
-export SCALFMM_HWLOC_DIR=$SCALFMM_TEST_DIR/hwlocinstall
-./configure --prefix=$SCALFMM_HWLOC_DIR
-make install
-```
-
-*Output variables:* `scalfmmRegisterVariable SCALFMM_HWLOC_DIR`
-
-Valid-if:
-```bash
-if [[ -n $SCALFMM_HWLOC_DIR ]] && [[ -d $SCALFMM_HWLOC_DIR/lib/ ]] && [[ -f  $SCALFMM_HWLOC_DIR/lib/libhwloc.so ]]; then
-   echo "STEP-OK"
-fi
-```
-
-### FXT (__Optional__)
-```bash
-cd $SCALFMM_TEST_DIR
-if [[ ! -f fxt-0.2.11.tar.gz ]] ; then
-    wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.11.tar.gz
-fi
-tar xvf fxt-0.2.11.tar.gz
-cd fxt-0.2.11/
-export SCALFMM_FXT_DIR=$SCALFMM_TEST_DIR/fxtinstall
-./configure --prefix=$SCALFMM_FXT_DIR
-make install
-```
-
-*Output variables:* `scalfmmRegisterVariable SCALFMM_FXT_DIR`
-
-Valid-if:
-```bash
-if [[ -n $SCALFMM_FXT_DIR ]] && [[ -d $SCALFMM_FXT_DIR/lib/ ]] && [[ -f  $SCALFMM_FXT_DIR/lib/libfxt.so ]]; then
-   echo "STEP-OK"
-fi
-```
-
-### FFTW (If No MKL-FFT)
-Those who do not use the MKL FFT interface have to install FFTW (in both double and single precision):
-```bash
-cd $SCALFMM_TEST_DIR
-if [[ ! -f fftw-3.3.4.tar.gz ]] ; then
-    wget http://www.fftw.org/fftw-3.3.4.tar.gz
-fi    
-tar xvf fftw-3.3.4.tar.gz
-cd fftw-3.3.4/
-export SCALFMM_FFTW_DIR=$SCALFMM_TEST_DIR/fftinstall
-./configure --prefix=$SCALFMM_FFTW_DIR
-make install
-./configure --prefix=$SCALFMM_FFTW_DIR --enable-float
-make install
-```
-
-*Output variables:* `scalfmmRegisterVariable SCALFMM_FFTW_DIR`
-
-Valid-if:
-```bash
-if [[ -n $SCALFMM_FFTW_DIR ]] && [[ -d $SCALFMM_FFTW_DIR/lib/ ]] && [[ -f  $SCALFMM_FFTW_DIR/lib/libfftw3.a ]] && [[ -f  $SCALFMM_FFTW_DIR/lib/libfftw3f.a ]]; then
-   echo "STEP-OK"
-fi
-```
-
-### StarPU
-```bash
-cd $SCALFMM_TEST_DIR
-if [[ ! -d starpu ]] ; then
-	svn export svn://scm.gforge.inria.fr/svnroot/starpu/trunk starpu
-fi    
-cd starpu/
-export SCALFMM_STARPU_DIR=$SCALFMM_TEST_DIR/starpuinstall
-./autogen.sh
-./configure --prefix=$SCALFMM_STARPU_DIR --with-fxt=$SCALFMM_FXT_DIR --with-hwloc=$SCALFMM_HWLOC_DIR --with-cuda-dir=$CUDA_PATH --disable-opencl
-make install
-```
-> __Optional__ In case you do not want to use trace (FXT) please remove the `--with-fxt=$SCALFMM_FXT_DIR` parameter from the command
-
-*Output variables:* `scalfmmRegisterVariable SCALFMM_STARPU_DIR`
-
-Valid-if:
-```bash
-if [[ -n $SCALFMM_STARPU_DIR ]] && [[ -d $SCALFMM_STARPU_DIR/lib/ ]] && [[ -f  $SCALFMM_STARPU_DIR/lib/libstarpu.so ]] ; then
-   echo "STEP-OK"
-fi
-```
-
-### ScalFMM
-
-#### Configure
-+ Getting the source from the last commit:
-```bash
-cd $SCALFMM_TEST_DIR
-if [[ ! -d scalfmm-public ]] ; then
-    git clone --depth=1 https://scm.gforge.inria.fr/anonscm/git/scalfmm-public/scalfmm-public.git
-fi    
-cd scalfmm-public/
-export SCALFMM_SOURCE_DIR=`pwd`
-cd Build/
-export SCALFMM_BUILD_DIR=`pwd`
-```
-
-*Output variables:* `scalfmmRegisterVariable SCALFMM_BUILD_DIR` `scalfmmRegisterVariable SCALFMM_SOURCE_DIR`
-
-+ Configure (No MKL):
-```bash
-cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \
-               -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \
-               -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=OFF \
-               -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \
-               -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \
-               -DHWLOC_DIR=$SCALFMM_HWLOC_DIR -DSTARPU_DIR=$SCALFMM_STARPU_DIR \
-               -DSCALFMM_USE_FFT=ON -DFFT_DIR=$SCALFMM_FFTW_DIR
-```
-+ Configure (MKL BLAS/LAPACK and FFTW):
-```bash
-cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \
-               -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \
-               -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=ON \
-               -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \
-               -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \
-               -DHWLOC_DIR=$SCALFMM_HWLOC_DIR -DSTARPU_DIR=$SCALFMM_STARPU_DIR \
-               -DSCALFMM_USE_FFT=ON -DFFT_DIR=$SCALFMM_FFTW_DIR
-```
-+ Configure (MKL BLAS/LAPACK/FFT and No FFTW):
-
-> [Plafrim-Developers] Should use that one
-
-```bash
-cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \
-               -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \
-               -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=ON \
-               -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \
-               -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \
-               -DHWLOC_DIR=$SCALFMM_HWLOC_DIR -DSTARPU_DIR=$SCALFMM_STARPU_DIR \
-               -DSCALFMM_USE_FFT=ON -DSCALFMM_USE_MKL_AS_FFTW=ON
-```
-
-Valid-if:
-```bash
-cmake .. ; if [[ "$?" == "0" ]] ; then echo "STEP-OK" ; fi
-```
-
-#### Build
-
-```bash
-cd $SCALFMM_BUILD_DIR
-make testBlockedUnifCudaBench
-```
-
-Valid-if:
-```bash
-ls ./Tests/Release/testBlockedUnifCudaBench ; if [[ "$?" == "0" ]] ; then echo "STEP-OK" ; fi
-```
-
-#### First Execution
-
-In this section we compute a simulation and look at the resulting trace.
-ScalFMM binary parameters and descriptions:
-
-* Passing `--help` as parameter provides the list of possible/valid parameters
-* Simulation properties are chosen by :
-  * `-h` : height of the tree
-  * `-bs` : granularity/size of the group
-  * `-nb` : number of particles generated
-* Execution properties are chosen by the StarPU environment variables :
-  * `STARPU_NCPUS` : the number of CPU workers
-  * `STARPU_NCUDA` : the number of GPU workers (for heterogeneous binary)
-* By default the application will not compare the FMM interactions against the direct method (which is N^2) and so it is recommended to avoid the validation for large test cases. But to get the accuracy one must pass the parameter `-validation`
-* `-p2p-m2l-cuda-only` : to compute the P2P and the M2L only on GPU (the rest on the CPU)
-
-Examples:
-
-```bash
-export STARPU_NCPUS=12
-export STARPU_NCUDA=2
-./Tests/Release/testBlockedUnifCudaBench -nb 30000000 -h 7 -bs 800
-```
-
-Last part of the output should be:
-```bash
-	Start FGroupTaskStarPUAlgorithm
-		 directPass in 0.0406482s
-			 inblock  in 0.000780428s
-			 outblock in 0.0398674s
-		 bottomPass in 0.00586269s
-		 upwardPass in 0.00265723s
-		 transferPass in 0.00323571s
-			 inblock in  0.000124817s
-			 outblock in 0.00298331s
-		 downardPass in 0.00257975s
-		 transferPass in 0.0652285s
-			 inblock in  0.00164774s
-			 outblock in 0.0635799s
-		 L2P in 0.0115733s
-		 Submitting the tasks took 0.139101s
-		 Moving data to the host took 0.0578765s
-@EXEC TIME = 14.6321s
-```
-
-+ Visualize the execution trace (__Optional__)
-
-Convert the fxt file
-```bash
-$SCALFMM_STARPU_DIR/bin/starpu_fxt_tool -i "/tmp/prof_file_"$USER"_0"
-```
-Then visualize the output with `vite` (maybe by copying the paje.trace file locally)
-```bash
-vite ./paje.trace
-```
-
-Should be like:
-![Trace](trace-example.png)
-
-We can convert the color of the trace by (requiere Qt5 library):
-
-```bash
-$SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/pajecolor paje.trace $SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/paintmodel.fmm.colors
-vite ./paje.trace.painted
-```
-
-Should be like: 
-![Trace](trace-example-colors.png)
-
-+ Get execution times
-
-```bash
-python $SCALFMM_STARPU_DIR/bin/starpu_trace_state_stats.py -t trace.rec
-```
-
-Should give something like:
-```
-"Name","Count","Type","Duration"
-"Initializing",14,"Runtime",7153.096196
-"Overhead",57010,"Runtime",376.473463
-"Idle",14355,"Other",12.815899
-"Scheduling",28441,"Runtime",238.367394
-"Sleeping",610,"Other",13786.513208
-"FetchingInput",14341,"Runtime",13918.805814
-"execute_on_all_wrapper",30,"Task",21.288802
-"Executing",414,"Runtime",26852.864578
-"PushingOutput",14341,"Runtime",284.96123
-"P2P-out",3846,"Task",60378.266619
-"Callback",13559,"Runtime",4.210633
-"P2P",328,"Task",15383.426991
-"M2L-level-5",41,"Task",2354.702554
-"M2L-level-6",328,"Task",18349.915495
-"Deinitializing",14,"Runtime",109.87483
-"M2L-level-4",6,"Task",275.088295
-"P2M",328,"Task",11312.022842
-"M2M-level-5",328,"Task",829.9055
-"M2M-level-4",41,"Task",93.130498
-"M2L-out-level-5",638,"Task",1914.900053
-"M2M-level-3",6,"Task",11.053067
-"M2M-level-2",1,"Task",1.363157
-"M2L-out-level-4",22,"Task",159.580457
-"L2L-level-4",41,"Task",84.554065
-"L2L-level-5",328,"Task",1087.717767
-"M2L-out-level-6",7692,"Task",18322.518045
-"L2P",328,"Task",27146.256793
-"M2L-level-2",1,"Task",2.661235
-"L2L-level-3",6,"Task",11.346978
-"M2L-level-3",1,"Task",47.612555
-"L2L-level-2",1,"Task",1.471873
-```
-
-Most of the script are in the addon directories
-```bash
-export SCALFMM_AB=$SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/
-```
-
-*Output variable:* `scalfmmRegisterVariable SCALFMM_AB`
-
-## Homogeneous Efficiencies
-
-Here we compute the efficiencies for a given test case on CPU only.
-
-Go in the build dir and create output dir
-```bash
-cd $SCALFMM_BUILD_DIR
-export SCALFMM_RES_DIR=$SCALFMM_BUILD_DIR/homogeneous
-mkdir $SCALFMM_RES_DIR
-```
-*Output variable:* `scalfmmRegisterVariable SCALFMM_RES_DIR` 
-
-Set up the configuration variables:
-```bash
-export SCALFMM_NB=10000000
-export SCALFMM_H=7
-export SCALFMM_MIN_BS=100
-export SCALFMM_MAX_BS=10000
-export SCALFMM_MAX_NB_CPU=24
-```
-
-Find best granularity in sequential and in parallel:
-```bash
-export STARPU_NCPUS=1
-export STARPU_NCUDA=0
-export SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh "./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs" $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmmExtractKey.sh "@BEST BS" `
-if [[ `which gnuplot | wc -l` == "1" ]] ;  then
-    gnuplot -e "filename='seq-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot
-fi
-
-export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU
-export STARPU_NCUDA=0
-export SCALFMM_BS_CPU_PAR=`$SCALFMM_AB/scalfmmFindBs.sh "./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs" $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" `
-if [[ `which gnuplot | wc -l` == "1" ]] ;  then
-    gnuplot -e "filename='par-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot
-fi
-```
-In our case we get 9710  and 5385.
-
-*Output variable:* `scalfmmRegisterVariable SCALFMM_BS_CPU_SEQ`  `scalfmmRegisterVariable SCALFMM_BS_CPU_PAR`
-
-We can look to the work that has been done to find the best granularity:
-![In sequential](seq-bs-search.png)
-![In parallel](par-bs-search.png)
-
-
-Then we compute the efficiency using both granulirities and keep the .rec files:
-```bash
-export SCALFMM_MAX_NB_CPU=24
-export STARPU_NCUDA=0
-source "$SCALFMM_AB/execAllHomogeneous.sh"
-```
-
-We should end with all the .rec files and their corresponding time files and `ls "$SCALFMM_RES_DIR"` should return something like:
-```bash
-trace-nb_10000000-h_7-bs_5385-CPU_10.rec       trace-nb_10000000-h_7-bs_5385-CPU_16.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_22.rec       trace-nb_10000000-h_7-bs_5385-CPU_5.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_10.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_17.rec       trace-nb_10000000-h_7-bs_5385-CPU_22.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_6.rec
-trace-nb_10000000-h_7-bs_5385-CPU_11.rec       trace-nb_10000000-h_7-bs_5385-CPU_17.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_23.rec       trace-nb_10000000-h_7-bs_5385-CPU_6.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_11.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_18.rec       trace-nb_10000000-h_7-bs_5385-CPU_23.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_7.rec
-trace-nb_10000000-h_7-bs_5385-CPU_12.rec       trace-nb_10000000-h_7-bs_5385-CPU_18.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_24.rec       trace-nb_10000000-h_7-bs_5385-CPU_7.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_12.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_19.rec       trace-nb_10000000-h_7-bs_5385-CPU_24.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_8.rec
-trace-nb_10000000-h_7-bs_5385-CPU_13.rec       trace-nb_10000000-h_7-bs_5385-CPU_19.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_2.rec        trace-nb_10000000-h_7-bs_5385-CPU_8.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_13.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_1.rec        trace-nb_10000000-h_7-bs_5385-CPU_2.rec.time   trace-nb_10000000-h_7-bs_5385-CPU_9.rec
-trace-nb_10000000-h_7-bs_5385-CPU_14.rec       trace-nb_10000000-h_7-bs_5385-CPU_1.rec.time   trace-nb_10000000-h_7-bs_5385-CPU_3.rec        trace-nb_10000000-h_7-bs_5385-CPU_9.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_14.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_20.rec       trace-nb_10000000-h_7-bs_5385-CPU_3.rec.time   trace-nb_10000000-h_7-bs_9710-CPU_1.rec
-trace-nb_10000000-h_7-bs_5385-CPU_15.rec       trace-nb_10000000-h_7-bs_5385-CPU_20.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_4.rec        trace-nb_10000000-h_7-bs_9710-CPU_1.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_15.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_21.rec       trace-nb_10000000-h_7-bs_5385-CPU_4.rec.time
-trace-nb_10000000-h_7-bs_5385-CPU_16.rec       trace-nb_10000000-h_7-bs_5385-CPU_21.rec.time  trace-nb_10000000-h_7-bs_5385-CPU_5.rec
-```
-
-We then compute the efficiencies from these files
-```bash
-g++ -std=c++11 $SCALFMM_AB/mergetimefile.cpp -o $SCALFMM_AB/mergetimefile.exe
-$SCALFMM_AB/mergetimefile.exe \
-        "$SCALFMM_RES_DIR/trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_BS_CPU_SEQ-CPU_1.rec.time" \
-        "$SCALFMM_RES_DIR/trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_BS_CPU_PAR-CPU_%d.rec.time"\
-         $SCALFMM_MAX_NB_CPU
-```
-
-We end-up with the global efficiencies (for the application) but also for the different operators.
-```bash
-Create global-eff.data
-Create task-eff.data
-Create task-gr-eff.dat
-```
-
-We can plot each of them
-```bash
-gnuplot -e "filename='global-eff'" $SCALFMM_AB/scalfmmPlotAll.gplot
-gnuplot -e "filename='task-eff'" $SCALFMM_AB/scalfmmPlotAll.gplot
-gnuplot -e "filename='task-gr-eff'" $SCALFMM_AB/scalfmmPlotAll.gplot
-```
-
-In our case it gives:
-![global-eff](global-eff.png)
-![task-eff](task-eff.png)
-![task-gr-eff](task-gr-eff.png)
-
-
-## Heterogeneous
-
-__NOT FINISHED!!!!__
-
-For test case `-nb 10000000` (10 million) and `-h 6` (height of the tree equal to 6),
-we first want to know the best granularity `-bs`.
-
-This parameter will certainly not be the same for sequential/parallel/heterogenous configurations.
-
-```bash
-export SCALFMM_NB=10000000
-export SCALFMM_H=7
-export SCALFMM_MIN_BS=100
-export SCALFMM_MAX_BS=3000
-export SCALFMM_MAX_NB_CPU=24
-export SCALFMM_MAX_NB_GPU=4
-```
-
-```bash
-export STARPU_NCPUS=1
-export STARPU_NCUDA=0
-export SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" `
-if [[ `which gnuplot | wc -l` == "1" ]] ;  then
-    gnuplot -e "filename='seq-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot
-fi
-
-export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU
-export STARPU_NCUDA=0
-export SCALFMM_BS_CPU_PAR=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" `
-if [[ `which gnuplot | wc -l` == "1" ]] ;  then
-    gnuplot -e "filename='par-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot
-fi
-
-export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU
-export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU
-export SCALFMM_BS_CPU_GPU=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" `
-if [[ `which gnuplot | wc -l` == "1" ]] ;  then
-    gnuplot -e "filename='cpugpu-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot
-fi
-```
-
-Then, we can execute three best configurations, and keep .rec for each of them:
-```bash
-export STARPU_NCPUS=1
-export STARPU_NCUDA=0
-./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_CPU_SEQ
-export SCALFMM_SEQ_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec"
-mv trace.rec $SCALFMM_SEQ_REC
-
-export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU
-export STARPU_NCUDA=0
-./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR
-export SCALFMM_PAR_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec"
-mv trace.rec $SCALFMM_PAR_REC
-
-export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU
-export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU
-./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU
-export SCALFMM_PAR_CPU_GPU_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec"
-mv trace.rec $SCALFMM_PAR_CPU_GPU_REC
-```
-
-And we also want the GPU tasks only on GPU
-```bash
-export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU
-export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU
-./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU -p2p-m2l-cuda-only
-export SCALFMM_PAR_GPU_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA-GPUONLY.rec"
-mv trace.rec $SCALFMM_PAR_GPU_REC
-```
-
-And we want the sequential version with parallel granularity:
-```bash
-export STARPU_NCPUS=1
-export STARPU_NCUDA=0
-
-./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR
-SCALFMM_SEQ_CPU_BS_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec"
-mv trace.rec $SCALFMM_SEQ_CPU_BS_REC
-
-./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU
-SCALFMM_SEQ_GPU_BS_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec"
-mv trace.rec $SCALFMM_SEQ_GPU_BS_REC
-```
-
-From these files, we are able to get the different efficencies.
-
-## Post-processing and Plot
-
-From the file:
-
-+ `$SCALFMM_SEQ_REC` : the resulting file from the sequential execution with best sequential granularity
-+ `$SCALFMM_PAR_REC` : the resulting file from a parallel execution (no GPU) with best parallel granularity
-+ `$SCALFMM_PAR_CPU_GPU_REC` : the resulting file from a parallel execution (hybrid) with best parallel-hybrid granularity
-+ `$SCALFMM_PAR_GPU_REC` : the resulting file with all possible tasks on GPU with best parallel-hybrid granularity
-+ `$SCALFMM_SEQ_CPU_BS_REC` : the resulting file from sequential execution with best parallel granularity
-+ `$SCALFMM_SEQ_GPU_BS_REC` : the resulting file from sequential execution with best parallel-hybrid granularity
-
-Getting all the efficency
-Solving the linear programming problem
-
-Plotting the results
-
-
-## Automatization
-
-```bash
-SCALFMM_NB=10000000
-SCALFMM_H=7
-SCALFMM_MIN_BS=100
-SCALFMM_MAX_BS=3000
-SCALFMM_MAX_NB_CPU=24
-SCALFMM_MAX_NB_GPU=4
-
-scalfmm_generate_efficiency -nb $SCALFMM_NB -h $SCALFMM_H -start $SCALFMM_MIN_BS -end $SCALFMM_MAX_BS
-```
\ No newline at end of file
diff --git a/Addons/BenchEfficiency/scalfmmExtractKey.sh b/Addons/BenchEfficiency/scalfmmExtractKey.sh
index a1909bcb4db4a77e7238db47dba78721c8045043..dbdc7f0e20309561a11cf009161fc305bbb18765 100644
--- a/Addons/BenchEfficiency/scalfmmExtractKey.sh
+++ b/Addons/BenchEfficiency/scalfmmExtractKey.sh
@@ -6,5 +6,5 @@ if [[ $# -ne 1 ]] ; then
 fi
 
 input=$(cat)
-res=`echo "$input" | grep "$3" | cut -d'=' -f2 | cut -d's' -f1`
+res=`echo "$input" | grep "$1" | cut -d'=' -f2 | cut -d' ' -f2`
 echo $res
diff --git a/Addons/BenchEfficiency/seq-bs-search.png b/Addons/BenchEfficiency/seq-bs-search.png
deleted file mode 100644
index 3bc1ece1cf46f09a17f2b95f59040589fdd91bd9..0000000000000000000000000000000000000000
Binary files a/Addons/BenchEfficiency/seq-bs-search.png and /dev/null differ
diff --git a/Addons/BenchEfficiency/task-eff.data b/Addons/BenchEfficiency/task-eff.data
deleted file mode 100644
index d921d6a94df131fea43c5d9e12cdb61f290322a2..0000000000000000000000000000000000000000
--- a/Addons/BenchEfficiency/task-eff.data
+++ /dev/null
@@ -1,25 +0,0 @@
-0 	L2L 	M2M 	P2M 	L2P 	M2L-out 	M2L 	P2P-out 	P2P 
-1	1.000000e+00 	1.000000e+00 	1.000000e+00 	1.000000e+00 	1.000000e+00 	1.000000e+00 	1.000000e+00 	1.000000e+00 
-2	9.565659e-01 	9.665736e-01 	1.031103e+00 	1.004286e+00 	9.715094e-01 	9.208541e-01 	9.697996e-01 	9.763831e-01 
-3	9.263068e-01 	1.024516e+00 	1.029574e+00 	9.889095e-01 	9.937418e-01 	9.954310e-01 	1.001689e+00 	1.000994e+00 
-4	1.005226e+00 	1.006333e+00 	1.033745e+00 	1.010624e+00 	9.534195e-01 	9.864280e-01 	9.895790e-01 	9.995851e-01 
-5	8.615300e-01 	9.844517e-01 	9.938413e-01 	1.009990e+00 	9.569465e-01 	9.791331e-01 	9.887700e-01 	9.975625e-01 
-6	8.535893e-01 	9.410083e-01 	1.014109e+00 	1.018876e+00 	9.739749e-01 	9.860534e-01 	9.782539e-01 	9.964238e-01 
-7	1.046813e+00 	9.975072e-01 	1.037954e+00 	1.003486e+00 	9.786087e-01 	9.933857e-01 	1.004895e+00 	9.965736e-01 
-8	9.995985e-01 	1.013025e+00 	9.895591e-01 	1.013030e+00 	9.652670e-01 	9.907845e-01 	1.000561e+00 	9.971405e-01 
-9	1.039365e+00 	1.013929e+00 	1.047827e+00 	9.852421e-01 	9.711139e-01 	9.898517e-01 	9.980679e-01 	9.993222e-01 
-10	9.181035e-01 	9.952685e-01 	1.031850e+00 	1.012496e+00 	9.670203e-01 	9.852214e-01 	9.859215e-01 	9.985014e-01 
-11	8.717502e-01 	9.889525e-01 	1.028373e+00 	1.011922e+00 	9.699808e-01 	9.888136e-01 	9.826419e-01 	9.981512e-01 
-12	9.452144e-01 	1.040015e+00 	1.013514e+00 	9.762884e-01 	9.389195e-01 	9.915452e-01 	9.996240e-01 	9.998256e-01 
-13	1.022490e+00 	1.021529e+00 	1.014210e+00 	9.896566e-01 	9.668669e-01 	9.898209e-01 	1.011145e+00 	9.991000e-01 
-14	9.383201e-01 	9.923898e-01 	1.030084e+00 	1.009296e+00 	9.748870e-01 	9.858361e-01 	1.005721e+00 	9.971995e-01 
-15	9.387378e-01 	9.986737e-01 	1.032522e+00 	9.967096e-01 	9.675984e-01 	9.877332e-01 	1.003181e+00 	9.974178e-01 
-16	9.377196e-01 	9.853747e-01 	1.043778e+00 	1.003874e+00 	9.786853e-01 	9.873092e-01 	1.003464e+00 	9.958178e-01 
-17	9.293735e-01 	1.034251e+00 	1.038271e+00 	1.003177e+00 	9.700248e-01 	9.915540e-01 	9.899480e-01 	9.984129e-01 
-18	9.081814e-01 	9.992797e-01 	1.018655e+00 	9.982681e-01 	9.627375e-01 	9.752319e-01 	9.739917e-01 	9.297086e-01 
-19	9.471672e-01 	9.763513e-01 	1.026148e+00 	1.013503e+00 	9.656781e-01 	9.868543e-01 	9.891711e-01 	9.992051e-01 
-20	9.376034e-01 	1.008523e+00 	1.015422e+00 	9.988900e-01 	9.763451e-01 	9.917410e-01 	1.016855e+00 	9.974959e-01 
-21	9.649789e-01 	9.941223e-01 	1.023371e+00 	9.720318e-01 	9.427889e-01 	9.864717e-01 	1.011408e+00 	1.001528e+00 
-22	8.085859e-01 	1.003002e+00 	1.024132e+00 	1.015483e+00 	9.586926e-01 	9.888563e-01 	9.829068e-01 	9.982469e-01 
-23	9.843031e-01 	1.009513e+00 	1.041257e+00 	1.012564e+00 	1.009160e+00 	9.949415e-01 	9.970272e-01 	9.964763e-01 
-24	9.408696e-01 	9.847445e-01 	1.030481e+00 	9.726508e-01 	9.691133e-01 	9.975819e-01 	1.022271e+00 	1.000680e+00 
diff --git a/Addons/BenchEfficiency/task-eff.png b/Addons/BenchEfficiency/task-eff.png
deleted file mode 100644
index d8e15fae86dc95d544ca3ca3f1fe85b624b5b32a..0000000000000000000000000000000000000000
Binary files a/Addons/BenchEfficiency/task-eff.png and /dev/null differ
diff --git a/Addons/BenchEfficiency/task-gr-eff.data b/Addons/BenchEfficiency/task-gr-eff.data
deleted file mode 100644
index d921d6a94df131fea43c5d9e12cdb61f290322a2..0000000000000000000000000000000000000000
--- a/Addons/BenchEfficiency/task-gr-eff.data
+++ /dev/null
@@ -1,25 +0,0 @@
-0 	L2L 	M2M 	P2M 	L2P 	M2L-out 	M2L 	P2P-out 	P2P 
-1	1.000000e+00 	1.000000e+00 	1.000000e+00 	1.000000e+00 	1.000000e+00 	1.000000e+00 	1.000000e+00 	1.000000e+00 
-2	9.565659e-01 	9.665736e-01 	1.031103e+00 	1.004286e+00 	9.715094e-01 	9.208541e-01 	9.697996e-01 	9.763831e-01 
-3	9.263068e-01 	1.024516e+00 	1.029574e+00 	9.889095e-01 	9.937418e-01 	9.954310e-01 	1.001689e+00 	1.000994e+00 
-4	1.005226e+00 	1.006333e+00 	1.033745e+00 	1.010624e+00 	9.534195e-01 	9.864280e-01 	9.895790e-01 	9.995851e-01 
-5	8.615300e-01 	9.844517e-01 	9.938413e-01 	1.009990e+00 	9.569465e-01 	9.791331e-01 	9.887700e-01 	9.975625e-01 
-6	8.535893e-01 	9.410083e-01 	1.014109e+00 	1.018876e+00 	9.739749e-01 	9.860534e-01 	9.782539e-01 	9.964238e-01 
-7	1.046813e+00 	9.975072e-01 	1.037954e+00 	1.003486e+00 	9.786087e-01 	9.933857e-01 	1.004895e+00 	9.965736e-01 
-8	9.995985e-01 	1.013025e+00 	9.895591e-01 	1.013030e+00 	9.652670e-01 	9.907845e-01 	1.000561e+00 	9.971405e-01 
-9	1.039365e+00 	1.013929e+00 	1.047827e+00 	9.852421e-01 	9.711139e-01 	9.898517e-01 	9.980679e-01 	9.993222e-01 
-10	9.181035e-01 	9.952685e-01 	1.031850e+00 	1.012496e+00 	9.670203e-01 	9.852214e-01 	9.859215e-01 	9.985014e-01 
-11	8.717502e-01 	9.889525e-01 	1.028373e+00 	1.011922e+00 	9.699808e-01 	9.888136e-01 	9.826419e-01 	9.981512e-01 
-12	9.452144e-01 	1.040015e+00 	1.013514e+00 	9.762884e-01 	9.389195e-01 	9.915452e-01 	9.996240e-01 	9.998256e-01 
-13	1.022490e+00 	1.021529e+00 	1.014210e+00 	9.896566e-01 	9.668669e-01 	9.898209e-01 	1.011145e+00 	9.991000e-01 
-14	9.383201e-01 	9.923898e-01 	1.030084e+00 	1.009296e+00 	9.748870e-01 	9.858361e-01 	1.005721e+00 	9.971995e-01 
-15	9.387378e-01 	9.986737e-01 	1.032522e+00 	9.967096e-01 	9.675984e-01 	9.877332e-01 	1.003181e+00 	9.974178e-01 
-16	9.377196e-01 	9.853747e-01 	1.043778e+00 	1.003874e+00 	9.786853e-01 	9.873092e-01 	1.003464e+00 	9.958178e-01 
-17	9.293735e-01 	1.034251e+00 	1.038271e+00 	1.003177e+00 	9.700248e-01 	9.915540e-01 	9.899480e-01 	9.984129e-01 
-18	9.081814e-01 	9.992797e-01 	1.018655e+00 	9.982681e-01 	9.627375e-01 	9.752319e-01 	9.739917e-01 	9.297086e-01 
-19	9.471672e-01 	9.763513e-01 	1.026148e+00 	1.013503e+00 	9.656781e-01 	9.868543e-01 	9.891711e-01 	9.992051e-01 
-20	9.376034e-01 	1.008523e+00 	1.015422e+00 	9.988900e-01 	9.763451e-01 	9.917410e-01 	1.016855e+00 	9.974959e-01 
-21	9.649789e-01 	9.941223e-01 	1.023371e+00 	9.720318e-01 	9.427889e-01 	9.864717e-01 	1.011408e+00 	1.001528e+00 
-22	8.085859e-01 	1.003002e+00 	1.024132e+00 	1.015483e+00 	9.586926e-01 	9.888563e-01 	9.829068e-01 	9.982469e-01 
-23	9.843031e-01 	1.009513e+00 	1.041257e+00 	1.012564e+00 	1.009160e+00 	9.949415e-01 	9.970272e-01 	9.964763e-01 
-24	9.408696e-01 	9.847445e-01 	1.030481e+00 	9.726508e-01 	9.691133e-01 	9.975819e-01 	1.022271e+00 	1.000680e+00 
diff --git a/Addons/BenchEfficiency/task-gr-eff.png b/Addons/BenchEfficiency/task-gr-eff.png
deleted file mode 100644
index c748b76826f08ea7f7f09769eba322f0dce0cf3e..0000000000000000000000000000000000000000
Binary files a/Addons/BenchEfficiency/task-gr-eff.png and /dev/null differ
diff --git a/Addons/BenchEfficiency/trace-example-colors.png b/Addons/BenchEfficiency/trace-example-colors.png
deleted file mode 100644
index dcefa9fb53660927f1509d64f89254ee03e60dec..0000000000000000000000000000000000000000
Binary files a/Addons/BenchEfficiency/trace-example-colors.png and /dev/null differ
diff --git a/Addons/BenchEfficiency/trace-example.png b/Addons/BenchEfficiency/trace-example.png
deleted file mode 100644
index 5e466b94ed15a4d0905484425a75de9d390f45d7..0000000000000000000000000000000000000000
Binary files a/Addons/BenchEfficiency/trace-example.png and /dev/null differ
diff --git a/Addons/CKernelApi/Src/FInterEngine.hpp b/Addons/CKernelApi/Src/FInterEngine.hpp
index 499e74e1a6fb4942799b36ccc58fd7574c601d95..6d304220cc07a5a4bc16228e7b65704a3d6e136e 100644
--- a/Addons/CKernelApi/Src/FInterEngine.hpp
+++ b/Addons/CKernelApi/Src/FInterEngine.hpp
@@ -115,12 +115,12 @@ public:
         }else{
             if(type==SOURCE){
                 for(FSize idPart = 0; idPart<NbPositions ; ++idPart){
-                    octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleTypeSource,idPart);
+                    octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleType::FParticleTypeSource,idPart);
                 }
                 FScalFMMEngine<FReal>::nbPart += NbPositions;
             }else{
                 for(FSize idPart = 0; idPart<NbPositions ; ++idPart){
-                    octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleTypeTarget,idPart);
+                    octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleType::FParticleTypeTarget,idPart);
                 }
                 FScalFMMEngine<FReal>::nbPart += NbPositions;
             }
@@ -138,12 +138,12 @@ public:
         }else{
             if(type==SOURCE){
                 for(FSize idPart = 0; idPart<NbPositions ; ++idPart){
-                    octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleTypeSource,idPart);
+                    octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleType::FParticleTypeSource,idPart);
                 }
                 FScalFMMEngine<FReal>::nbPart += NbPositions;
             }else{
                 for(FSize idPart = 0; idPart<NbPositions ; ++idPart){
-                    octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleTypeTarget,idPart);
+                    octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleType::FParticleTypeTarget,idPart);
                 }
                 FScalFMMEngine<FReal>::nbPart += NbPositions;
             }
diff --git a/Addons/CKernelApi/Src/FUserKernelEngine.hpp b/Addons/CKernelApi/Src/FUserKernelEngine.hpp
index 27a0243bdbd2865d1174ec1018720c931bd95959..a4fb90b9164d67205c035fede02f1c4868edf28b 100644
--- a/Addons/CKernelApi/Src/FUserKernelEngine.hpp
+++ b/Addons/CKernelApi/Src/FUserKernelEngine.hpp
@@ -388,12 +388,12 @@ public:
         }else{
             if(type==SOURCE){
                 for(FSize idPart = 0; idPart<NbPositions ; ++idPart){
-                    octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleTypeSource,idPart);
+                    octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleType::FParticleTypeSource,idPart);
                 }
                 FScalFMMEngine<FReal>::nbPart += NbPositions;
             }else{
                 for(FSize idPart = 0; idPart<NbPositions ; ++idPart){
-                    octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleTypeTarget,idPart);
+                    octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleType::FParticleTypeTarget,idPart);
                 }
                 FScalFMMEngine<FReal>::nbPart += NbPositions;
             }
@@ -411,12 +411,12 @@ public:
         }else{
             if(type==SOURCE){
                 for(FSize idPart = 0; idPart<NbPositions ; ++idPart){
-                    octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleTypeSource,idPart);
+                    octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleType::FParticleTypeSource,idPart);
                 }
                 FScalFMMEngine<FReal>::nbPart += NbPositions;
             }else{
                 for(FSize idPart = 0; idPart<NbPositions ; ++idPart){
-                    octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleTypeTarget,idPart);
+                    octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleType::FParticleTypeTarget,idPart);
                 }
                 FScalFMMEngine<FReal>::nbPart += NbPositions;
             }
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b71fb71acdf6b68151022d9a22248b680088781..ecba3000c7fafc93b3927d9d19135ce5268bb6b9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -321,10 +321,16 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/
   ##############################################################################
   #
   if( SCALFMM_USE_BLAS )
+    #   include(FortranCInterface)
+    #    # Define a Fortran interface file (FCMangle.hpp)
+    #   FortranCInterface_HEADER( ${CMAKE_CURRENT_SOURCE_DIR}/Src/FCMangle.hpp
+    #                            MACRO_NAMESPACE "PM_"
+    #                           SYMBOL_NAMESPACE "PM_"
+    #                           SYMBOLS init testPPM:init)
     message(STATUS "CMAKE_CXX_COMPILER_ID STREQUAL  ${CMAKE_CXX_COMPILER_ID}")
-
+    
     option( SCALFMM_USE_MKL_AS_BLAS "Set to ON to use MKL CBLAS" OFF )
-
+    
     if( SCALFMM_USE_MKL_AS_BLAS )
       set(BLA_VENDOR "Intel10_64lp_seq")
       find_package(BLASEXT QUIET) # not REQUIRED
@@ -346,7 +352,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/
         list(APPEND BLASLAPACK_LIBRARIES "${BLAS_LIBRARIES}")
       endif()
     endif()
-
+    
     if(BLAS_FOUND)
       set(SCALFMM_LIBRARIES "${SCALFMM_LIBRARIES};${BLASLAPACK_LIBRARIES}")
       if(BLAS_LIBRARY_DIRS)
@@ -357,6 +363,35 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/
         # the RPATH to be used when installing
         list(APPEND CMAKE_INSTALL_RPATH "${LAPACK_LIBRARY_DIRS}")
       endif()
+      # Detect the BLAS Fortran symbol mangling by probing for dgemv_, then DGEMV, then dgemv
+      set(CMAKE_REQUIRED_LIBRARIES "${BLAS_LIBRARIES}")
+      check_function_exists(dgemv_ DGEMV_ADD_)
+      set (SCALFMM_BLAS_UPCASE OFF)
+      set (SCALFMM_BLAS_NOCHANGE OFF)
+      message (STATUS "BLAS dgemv_ symbol check result: '${DGEMV_ADD_}'") # empty means dgemv_ was not found
+      if (DGEMV_ADD_)
+        set (SCALFMM_BLAS_ADD_ ON)
+        message (STATUS "BLAS dgemv_ symbol found, SCALFMM_BLAS_ADD_ is ON")
+      else (DGEMV_ADD_)
+        set (SCALFMM_BLAS_ADD_ OFF)
+	check_function_exists(DGEMV  DGEMV_UPCASE)
+	if (DGEMV_UPCASE)
+	  set (SCALFMM_BLAS_UPCASE ON)
+	  message (STATUS "BLAS DGEMV symbol found, SCALFMM_BLAS_UPCASE is ON")
+	else (DGEMV_UPCASE)
+	  #          set (SCALFMM_BLAS_UPCASE OFF)
+          check_function_exists(dgemv  DGEMV_NOCHANGE)
+          if (DGEMV_NOCHANGE) 
+            set (SCALFMM_BLAS_NOCHANGE ON)
+            message (STATUS "BLAS dgemv symbol found, SCALFMM_BLAS_NOCHANGE is ON")
+	    #	  else   (DGEMV_NOCHANGE)
+	    #            set (SCALFMM_BLAS_NOCHANGE OFF)
+	  endif (DGEMV_NOCHANGE)
+        endif (DGEMV_UPCASE)
+      endif (DGEMV_ADD_)
+      if ( (NOT DGEMV_ADD_) AND (NOT DGEMV_UPCASE) AND (NOT DGEMV_NOCHANGE) )
+        message(FATAL_ERROR "BLAS Fortran mangling cannot be properly detected")
+      endif ()
     else()
       message(WARNING "BLAS has not been found, SCALFMM will continue to compile but some applications will be disabled.")
       message(WARNING "If you have BLAS set BLAS_LIBDIR, BLAS_INCDIR or BLAS_DIR (CMake variables using -D or environment variables).")        
@@ -612,7 +647,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/
       OUTPUT_VARIABLE COMPILE_AVX_OUTPUT)
     if(${COMPILE_AVX})
       message(STATUS "%%%%%%%%%%%% COMPILE_AVX               = ${COMPILE_AVX}  %%%%<    ${AVX_FLAGS}")
-      
+
       set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS}   ${AVX_FLAGS}")
       message(STATUS "%%%%%%%%%%%% SCALFMM_CXX_FLAGS               = ${SCALFMM_CXX_FLAGS}")
       #set( SCALFMM_USE_SSE   OFF   FORCE) # ne marche pas
@@ -725,7 +760,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/
     endif(PKG_CONFIG_FOUND)
 
   endif(SCALFMM_USE_EZTRACE)
-  
+
   ##################################################################
   #
   #   To catch signals
@@ -738,7 +773,6 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/
         IF( NOT APPLE)
         SET(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} -rdynamic")
         ENDIF()
-        
     endif()
   ##################################################################
   #                                                                #
@@ -806,6 +840,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/
   ##################################################################
   #                           Add - doc                            #
   ##################################################################
+  message(STATUS "SCALFMM_BUILD_DOC           =  ${SCALFMM_BUILD_DOC}" ) 
   if(SCALFMM_BUILD_DOC)
     add_subdirectory(Doc)
   endif()
diff --git a/CMakeModules/morse/find/FindBLAS.cmake b/CMakeModules/morse/find/FindBLAS.cmake
index cbf7769442aeb7f72f5cf59b83e106a0ba0fb177..073e2c1134ebf2b26e798996e24d616785d20ef2 100644
--- a/CMakeModules/morse/find/FindBLAS.cmake
+++ b/CMakeModules/morse/find/FindBLAS.cmake
@@ -279,6 +279,7 @@ macro(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list _thread)
             find_library(${_prefix}_${_library}_LIBRARY
                 NAMES ${_library}
                 HINTS ${_libdir}
+                NO_DEFAULT_PATH
               )
             mark_as_advanced(${_prefix}_${_library}_LIBRARY)
             # Print status if not found
@@ -293,6 +294,10 @@ macro(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list _thread)
 
     if(_libraries_work)
         # Test this combination of libraries.
+        if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND BLA_STATIC)
+            list(INSERT ${LIBRARIES} 0 "-Wl,--start-group")
+            list(APPEND ${LIBRARIES} "-Wl,--end-group")
+        endif()
         set(CMAKE_REQUIRED_LIBRARIES "${_flags};${${LIBRARIES}};${_thread}")
         set(CMAKE_REQUIRED_FLAGS "${BLAS_COMPILER_FLAGS}")
         if (BLAS_VERBOSE)
@@ -901,7 +906,7 @@ if (BLA_VENDOR STREQUAL "IBMESSL" OR BLA_VENDOR STREQUAL "All")
         BLAS
         sgemm
         ""
-        "essl;blas"
+        "essl"
         ""
         )
     endif()
diff --git a/CMakeModules/morse/find/FindBLASEXT.cmake b/CMakeModules/morse/find/FindBLASEXT.cmake
index f13b6c9fc64cd3be0c4bf13eb5d29ba52474e83a..86330f4224b9d706f35760cc03d810cf222b6cba 100644
--- a/CMakeModules/morse/find/FindBLASEXT.cmake
+++ b/CMakeModules/morse/find/FindBLASEXT.cmake
@@ -259,9 +259,17 @@ endif()
 # extract libs paths
 # remark: because it is not given by find_package(BLAS)
 set(BLAS_LIBRARY_DIRS "")
+string(REPLACE " " ";" BLAS_LIBRARIES "${BLAS_LIBRARIES}")
 foreach(blas_lib ${BLAS_LIBRARIES})
-    get_filename_component(a_blas_lib_dir "${blas_lib}" PATH)
-    list(APPEND BLAS_LIBRARY_DIRS "${a_blas_lib_dir}" )
+    string(REPLACE "-L" "" blas_lib "${blas_lib}")
+    if (EXISTS "${blas_lib}")
+        list(APPEND BLAS_LIBRARY_DIRS "${blas_lib}" )
+    else()
+        get_filename_component(a_blas_lib_dir "${blas_lib}" PATH)
+        if (EXISTS "${a_blas_lib_dir}")
+            list(APPEND BLAS_LIBRARY_DIRS "${a_blas_lib_dir}" )
+        endif()
+    endif()
 endforeach()
 if (BLAS_LIBRARY_DIRS)
     list(REMOVE_DUPLICATES BLAS_LIBRARY_DIRS)
diff --git a/CMakeModules/morse/find/FindFFTW.cmake b/CMakeModules/morse/find/FindFFTW.cmake
index f187b7c80d788a03a9d4df2be2f337884346501a..f259c58feb78c7f1a576456b5e68f90d807d379e 100644
--- a/CMakeModules/morse/find/FindFFTW.cmake
+++ b/CMakeModules/morse/find/FindFFTW.cmake
@@ -172,13 +172,16 @@ find_package(PkgConfig QUIET)
 if( PKG_CONFIG_EXECUTABLE AND NOT FFTW_GIVEN_BY_USER )
 
   if(FFTW_LOOK_FOR_FFTW_SIMPLE)
-    pkg_search_module(FFTW fftw3f)
+    pkg_search_module(FFTW3F fftw3f)
+    pkg_search_module(FFTW3 fftw3)
   elseif(FFTW_LOOK_FOR_FFTW_LONG)
-    pkg_search_module(FFTW fftw3)
+	pkg_search_module(FFTW3L fftw3l)
+    pkg_search_module(FFTW3 fftw3)
   elseif(FFTW_LOOK_FOR_FFTW_QUAD)
-    pkg_search_module(FFTW fftw3q)
+    pkg_search_module(FFTW3Q fftw3q)
+    pkg_search_module(FFTW3 fftw3)
   else()
-    pkg_search_module(FFTW fftw3)
+    pkg_search_module(FFTW3 fftw3)
   endif()
 
   if (NOT FFTW_FIND_QUIETLY)
@@ -198,7 +201,19 @@ if( PKG_CONFIG_EXECUTABLE AND NOT FFTW_GIVEN_BY_USER )
 
   set(FFTW_INCLUDE_DIRS_DEP "${FFTW_INCLUDE_DIRS}")
   set(FFTW_LIBRARY_DIRS_DEP "${FFTW_LIBRARY_DIRS}")
-  set(FFTW_LIBRARIES_DEP "${FFTW_LIBRARIES}")
+  set(FFTW_LIBRARIES_DEP)
+  if( FFTW3Q_LIBRARIES )
+	list(APPEND FFTW_LIBRARIES_DEP "${FFTW3Q_LIBRARIES}")
+  endif()
+  if( FFTW3L_LIBRARIES )
+	list(APPEND FFTW_LIBRARIES_DEP "${FFTW3L_LIBRARIES}")
+  endif()
+  if( FFTW3F_LIBRARIES )
+	list(APPEND FFTW_LIBRARIES_DEP "${FFTW3F_LIBRARIES}")
+  endif()
+if( FFTW3_LIBRARIES )
+	list(APPEND FFTW_LIBRARIES_DEP "${FFTW3_LIBRARIES}")
+endif()
   set(FFTW_WORKS TRUE)
 
 endif( PKG_CONFIG_EXECUTABLE AND NOT FFTW_GIVEN_BY_USER )
@@ -551,7 +566,13 @@ endif()
 # check that FFTW has been found
 # -------------------------------
 include(FindPackageHandleStandardArgs)
+if( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT FFTW_FOUND) OR (FFTW_GIVEN_BY_USER) )
 find_package_handle_standard_args(FFTW DEFAULT_MSG
   FFTW_LIBRARIES
   FFTW_INCLUDE_DIRS
   FFTW_WORKS)
+else()
+find_package_handle_standard_args(FFTW DEFAULT_MSG
+  FFTW_LIBRARIES
+  FFTW_WORKS)
+endif()
diff --git a/CMakeModules/morse/find/FindLAPACK.cmake b/CMakeModules/morse/find/FindLAPACK.cmake
index 81e3869f731bc7c096a17b2d8749ad64a2332a50..668453dab6eb413fe24cf3a1ea90ee4b37dae8fd 100644
--- a/CMakeModules/morse/find/FindLAPACK.cmake
+++ b/CMakeModules/morse/find/FindLAPACK.cmake
@@ -154,7 +154,7 @@ macro(Check_Lapack_Libraries LIBRARIES _prefix _name _flags _list _blas _threads
 
 # N.B. _prefix is the prefix applied to the names of all cached variables that
 # are generated internally and marked advanced by this macro.
-
+set(_libdir ${ARGN})
 set(_libraries_work TRUE)
 set(${LIBRARIES})
 set(_combined_name)
@@ -263,6 +263,7 @@ foreach(_library ${_list})
     find_library(${_prefix}_${_library}_LIBRARY
       NAMES ${_library}
       HINTS ${_libdir}
+      NO_DEFAULT_PATH
       )
     mark_as_advanced(${_prefix}_${_library}_LIBRARY)
     # Print status if not found
@@ -277,6 +278,10 @@ endforeach(_library ${_list})
 
 if(_libraries_work)
   # Test this combination of libraries.
+  if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND BLA_STATIC)
+    list(INSERT ${LIBRARIES} 0 "-Wl,--start-group")
+    list(APPEND ${LIBRARIES} "-Wl,--end-group")
+  endif()
   if(UNIX AND BLA_STATIC)
     set(CMAKE_REQUIRED_LIBRARIES ${_flags} "-Wl,--start-group" ${${LIBRARIES}} ${_blas} "-Wl,--end-group" ${_threads})
   else(UNIX AND BLA_STATIC)
diff --git a/CMakeModules/morse/find/FindLAPACKEXT.cmake b/CMakeModules/morse/find/FindLAPACKEXT.cmake
index dc608cc741221f20010cf11b5f42839ac7e3b0db..420b898d9f10b9c2f8588ca2ea700f639c4e14cf 100644
--- a/CMakeModules/morse/find/FindLAPACKEXT.cmake
+++ b/CMakeModules/morse/find/FindLAPACKEXT.cmake
@@ -211,9 +211,17 @@ endif()
 # extract libs paths
 # remark: because it is not given by find_package(LAPACK)
 set(LAPACK_LIBRARY_DIRS "")
+string(REPLACE " " ";" LAPACK_LIBRARIES "${LAPACK_LIBRARIES}")
 foreach(lapack_lib ${LAPACK_LIBRARIES})
-    get_filename_component(a_lapack_lib_dir "${lapack_lib}" PATH)
-    list(APPEND LAPACK_LIBRARY_DIRS "${a_lapack_lib_dir}" )
+    string(REPLACE "-L" "" lapack_lib "${lapack_lib}")
+    if (EXISTS "${lapack_lib}")
+        list(APPEND LAPACK_LIBRARY_DIRS "${lapack_lib}" )
+    else()
+        get_filename_component(a_lapack_lib_dir "${lapack_lib}" PATH)
+        if (EXISTS "${a_lapack_lib_dir}")
+            list(APPEND LAPACK_LIBRARY_DIRS "${a_lapack_lib_dir}" )
+        endif()
+    endif()
 endforeach()
 if (LAPACK_LIBRARY_DIRS)
     list(REMOVE_DUPLICATES LAPACK_LIBRARY_DIRS)
diff --git a/CMakeModules/morse/find/FindPASTIX.cmake b/CMakeModules/morse/find/FindPASTIX.cmake
index a4c6e742dc9af5c5f58bf6d6d8e9f2ff60f0042f..f6f4c9573a51669bfc70bab976b60b6a343ced76 100644
--- a/CMakeModules/morse/find/FindPASTIX.cmake
+++ b/CMakeModules/morse/find/FindPASTIX.cmake
@@ -17,6 +17,7 @@
 #
 #  PASTIX depends on the following libraries:
 #   - Threads, m, rt
+#   - MPI
 #   - HWLOC
 #   - BLAS
 #
diff --git a/CMakeModules/morse/find/FindSTARPU.cmake b/CMakeModules/morse/find/FindSTARPU.cmake
index 0e8d82382f6b4401df0441bc823ab674e8d38765..a2b1ed209b7444fc4c10f31d248cd2d78972795e 100644
--- a/CMakeModules/morse/find/FindSTARPU.cmake
+++ b/CMakeModules/morse/find/FindSTARPU.cmake
@@ -225,6 +225,10 @@ if(PKG_CONFIG_EXECUTABLE AND NOT STARPU_GIVEN_BY_USER)
             #        "Perhaps the path to starpu headers is already present in your"
             #        "C(PLUS)_INCLUDE_PATH environment variable.${ColourReset}")
             #endif()
+            set(STARPU_VERSION_STRING "${STARPU_SHM_VERSION}")
+            string(REPLACE "." ";" STARPU_VERSION_STRING_LIST ${STARPU_VERSION_STRING})
+            list(GET STARPU_VERSION_STRING_LIST 0 STARPU_VERSION_MAJOR)
+            list(GET STARPU_VERSION_STRING_LIST 1 STARPU_VERSION_MINOR)
         else()
             message("${Magenta}Looking for STARPU - not found using PkgConfig."
                 "Perhaps you should add the directory containing libstarpu.pc"
@@ -461,14 +465,23 @@ if( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT STARPU_FOUND)
                 find_path(STARPU_${starpu_hdr}_INCLUDE_DIRS
                           NAMES ${starpu_hdr}
                           HINTS ${STARPU_DIR}
-                          PATH_SUFFIXES "include/starpu/${STARPU_VERSION_STRING}")
+                          PATH_SUFFIXES "include"
+                          "include/starpu/1.0"
+                          "include/starpu/1.1"
+                          "include/starpu/1.2"
+                          "include/starpu/1.3")
             endforeach()
         else()
             foreach(starpu_hdr ${STARPU_hdrs_to_find})
                 set(STARPU_${starpu_hdr}_INCLUDE_DIRS "STARPU_${starpu_hdr}_INCLUDE_DIRS-NOTFOUND")
                 find_path(STARPU_${starpu_hdr}_INCLUDE_DIRS
                           NAMES ${starpu_hdr}
-                          HINTS ${_inc_env})
+                          HINTS ${_inc_env}
+                          PATH_SUFFIXES
+                          "starpu/1.0"
+                          "starpu/1.1"
+                          "starpu/1.2"
+                          "starpu/1.3")
             endforeach()
         endif()
     endif()
diff --git a/Data/test20k.tsm.fma b/Data/test20k.tsm.fma
index e46c371875748ab60d2704a31554d912d54f1ab5..5c0b2ea3b365b43fa316114fe37199b59c60ecb7 100644
--- a/Data/test20k.tsm.fma
+++ b/Data/test20k.tsm.fma
@@ -1,6 +1,4 @@
-8 4
-20000
-0.5	0.5	0.5	0.5
+20000 1 0.5	0.5	0.5
 0.840188	0.394383	0.783099	0.01	1
 0.911647	0.197551	0.335223	0.01	1
 0.277775	0.55397	0.477397	0.01	1
@@ -20000,4 +19998,4 @@
 0.00448784	0.00539908	0.182474	0.01	0
 0.0237434	0.139661	0.412617	0.01	1
 0.514349	0.627817	0.0209046	0.01	1
-0.56572	0.990817	0.904442	0.01	0
\ No newline at end of file
+0.56572	0.990817	0.904442	0.01	0
diff --git a/Doc/CMakeLists.txt b/Doc/CMakeLists.txt
index 0c71df2038f54e13e54647e176a721fe19355158..d0634f5264b2b1b9cce465f10b2374ec5e4b02c4 100644
--- a/Doc/CMakeLists.txt
+++ b/Doc/CMakeLists.txt
@@ -1,8 +1,9 @@
 # add a target to generate API documentation with Doxygen
 find_package(Doxygen)
 if(DOXYGEN_FOUND)
-    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY)
-    add_custom_target(
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in
+                 ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY)
+  add_custom_target(
         doc
         ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
@@ -11,5 +12,5 @@ if(DOXYGEN_FOUND)
     # INSTALL(FILES   ${SCALFMM_BINARY_DIR}/Doc/scalfmm.tag DESTINATION doc/  )
     # INSTALL(DIRECTORY   ${SCALFMM_BINARY_DIR}/Doc/html  DESTINATION doc/  )
 else()
-    message( WARNING "You ask to enable the doc generation but Doxygen cannot be found." )    
+    message( FATAL_ERROR "You ask to enable the doc generation but Doxygen cannot be found." )    
 endif(DOXYGEN_FOUND)
diff --git a/Doc/Site_dox/FInterpolationFMM.dox b/Doc/Site_dox/FInterpolationFMM.dox
new file mode 100644
index 0000000000000000000000000000000000000000..8cd90f2ac5ff93cab811338fd62c2ebad583efbe
--- /dev/null
+++ b/Doc/Site_dox/FInterpolationFMM.dox
@@ -0,0 +1,25 @@
+/*! \page interFMM Kernel Independent FMM
+
+ * In this section, we briefly discuss the kernel-independent (interpolation-based) FMM and its matrix kernels.
+
+ * \section general
+
+ * \section MatrixKernel
+ *
+  * \subsection AddKernel How to add a new matrix kernel
+
+   * \subsection predKernel Predefined matrix kernels
+   * Different kernels are predefined in ScalFMM. The kernels are located
+   * in FInterpMatrixKernel.hpp
+   *<ul>
+   * <li> Laplacian kernel K(x,y)= 1/r with r=|x-y|          <--> class FInterpMatrixKernelR
+   * <li> Laplacian kernel K(x,y)=1/rh with rh=sqrt(L_i*(x_i-y_i)^2)   <--> class FInterpMatrixKernelRH
+   * <li>                  K(x,y)=1/r^2 with r=|x-y|     <--> class FInterpMatrixKernelRR
+   * <li> Lennard Jones    K(x,y)=1/r^12 - 1/r^6 with r=|x-y|   <--> class  FInterpMatrixKernelLJ
+   * <li>Modified  Laplacian kernel  K(x,y)=1/r exp(-lambda r) with r=|x-y| <--> FInterpMatrixKernelML
+   * <li> K(x,y)=1/(r^2 + coreWidth) with r=|x-y|  <-->  FInterpMatrixKernelAPLUSRR
+   * </ul>
+ 
+ 
+
+*/
diff --git a/Doc/noDist/Notes/distribution.pdf b/Doc/noDist/Notes/distribution.pdf
index 2f70d6c5854372d66fc3bd9b6a60a9363b8e2e39..3a94147abfbf8b57536bb69d7677c7ef8e247a71 100644
Binary files a/Doc/noDist/Notes/distribution.pdf and b/Doc/noDist/Notes/distribution.pdf differ
diff --git a/Doc/noDist/Notes/distribution.tex b/Doc/noDist/Notes/distribution.tex
index 04ec7836a1887d65629e7b2cec29f50ee2c184b1..c822fe6cd37083d4c6204803f11f5bf0d75a45a8 100644
--- a/Doc/noDist/Notes/distribution.tex
+++ b/Doc/noDist/Notes/distribution.tex
@@ -107,9 +107,9 @@ If you consider the
 \subsection{Plummer Model}
 This is a hard test case in astrophysics problem, and it models a globular cluster of stars, which is highly non uniform.  It is called   the plummer distribution. To construct such distribution, first we construct a uniform points distribution on the unit sphere. Second, the radius is chosen according to the plummer distribution (double power law in astrophysics). We consider $u$ a random number between 0 and 1, then the associated radius is given by
 \begin{equation*}
-r = \sqrt{\frac{u^{2/3}}{u^{2/3}-1}}
+r = 1.0/\sqrt{u^{-2/3}-1},
 \end{equation*}
-
+and the total mass is one, so each of the $N_{pt}$ particles has mass $m_i = 1/N_{pt}$.
 \begin{figure}[h]
   \centering
   \begin{minipage}{0.45\textwidth}%
@@ -140,6 +140,8 @@ The corresponding potential is
 \begin{equation}
 \Phi_P(r) = - \frac{G M}{\sqrt{r^2+a^2}}
 \end{equation}
+
+In N-body units, $G = M = 1$ and $a = 3\pi/16 \approx 0.589$.
 \subsection{Diagonal Model}
 
 %, shape end size=.5cm},decoration={shape start size=.5cm, shape end size=.125cm
diff --git a/Examples/LagrangeInterpolationFMM.cpp b/Examples/LagrangeInterpolationFMM.cpp
index efb3d1b19361729c12650c5b48a586ea6636a7cf..2562741529393cf234c8ecfc2dec0792bc01834d 100755
--- a/Examples/LagrangeInterpolationFMM.cpp
+++ b/Examples/LagrangeInterpolationFMM.cpp
@@ -28,17 +28,21 @@
 #include <string>
 
 #include "ScalFmmConfig.h"
+#include "Utils/FGlobal.hpp"
 
-#include "Files/FFmaGenericLoader.hpp"
+#include "Utils/FParameters.hpp"
+#include "Utils/FParameterNames.hpp"
 
+#include "Files/FFmaGenericLoader.hpp"
+// UFMM
 #include "Kernels/Uniform/FUnifCell.hpp"
 #include "Kernels/Interpolation/FInterpMatrixKernel.hpp"
 #include "Kernels/Uniform/FUnifKernel.hpp"
-
+// Leaves 
 #include "Components/FSimpleLeaf.hpp"
 #include "Kernels/P2P/FP2PParticleContainerIndexed.hpp"
 
-#include "Utils/FParameters.hpp"
+
 
 #include "Containers/FOctree.hpp"
 
@@ -48,7 +52,6 @@
 #include "Core/FFmmAlgorithm.hpp"
 #endif
 
-#include "Utils/FParameterNames.hpp"
 
 #include <memory>
 
diff --git a/Examples/changeFmaFormat.cpp b/Examples/changeFmaFormat.cpp
index 800555c886b77d12de6b3a73304ed984d477916f..45932744e733e6d5090c0bd19c549872cf1868eb 100644
--- a/Examples/changeFmaFormat.cpp
+++ b/Examples/changeFmaFormat.cpp
@@ -11,16 +11,15 @@
 #include <string>
 #include <cstdlib>
 //
-#include "Files/FFmaGenericLoader.hpp"
-#include "Files/FDlpolyLoader.hpp"
 //
 #include "Utils/FGlobal.hpp"
-#include "Utils/FPoint.hpp"
+
 #include "Utils/FParameters.hpp"
-#include "Files/FGenerateDistribution.hpp"
+#include "Utils/FParameterNames.hpp"
+
+#include "Files/FFmaGenericLoader.hpp"
 #include "Files/FExportWriter.hpp"
 
-#include "Utils/FParameterNames.hpp"
 
 //
 /// \file  changeFmaFormat.cpp
@@ -88,7 +87,7 @@ int main(int argc, char ** argv){
     //   Generate file for visualization purpose
     //
     if(FParameters::existParameter(argc, argv, FParameterDefinitions::OutputVisuFile.options)){
-        std::string outfilename(FParameters::getStr(argc,argv,FParameterDefinitions::OutputFile.options,   "output.vtp"));
+        std::string outfilename(FParameters::getStr(argc,argv,FParameterDefinitions::OutputVisuFile.options,   "output.vtp"));
         driverExportData(outfilename, particles , NbPoints,loader.getNbRecordPerline() );
     }
     //
diff --git a/Examples/fuseDistributions.cpp b/Examples/fuseDistributions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d5a6535cc3aa4e249836e23ace2db64b856aed39
--- /dev/null
+++ b/Examples/fuseDistributions.cpp
@@ -0,0 +1,287 @@
+/**
+ * \file
+ * \brief Fuses FMA files to create a new distribution
+ *
+ * \author Quentin Khan
+ * \copyright ScalFmm 2016 INRIA
+ * \copyright [CeCILL-C licence](http://www.cecill.info)
+ *
+ *
+ */
+
+#include <algorithm>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+
+#include "Files/FFmaGenericLoader.hpp"
+
+
+/// Prints the command-line help on stdout; progname is argv[0], shown without its directory part.
+void usage(const std::string& progname) {
+    const std::string name = progname.substr(progname.find_last_of('/') + 1);
+    std::cout <<
+        "usage: " << name <<
+        " --file [[-s scale] [-c cx:cy:cz] [-g gx:gy:gz] [-r rx:ry:rz]] filename"
+        " -fout output_file"
+        " [--extra-length length]"
+        "\n"
+        "\n"
+        "Fuses multiple particle distributions into a bigger one."
+        "\n"
+        "\n"
+        "Options:\n"
+        "  -fout output_file\n"
+        "    The output file name, must have .bfma or .fma extension\n"
+        "\n"
+        "  --file [opts] filename [opts]\n"
+        "    Add a .fma or .bfma distribution file. Multiple files may be specified by\n"
+        "    adding more --file options. 'opts' is a combination of:\n"
+        "      -s scale\n"
+        "        Scale the distribution by 'scale' factor.\n"
+        "      -c cx:cy:cz\n"
+        "        Center the distribution at given coordinates. cx, cy and cz are\n"
+        "        floating point numbers.\n"
+        "      -g gx:gy:gz\n"
+        "        Duplicate the distribution inside a grid of gx by gy by gz dimensions.\n"
+        "        gx, gy and gz are integers. The grid center is governed by the -c \n"
+        "        option.\n"
+        "      -r rx:ry:rz\n"
+        "        Rotate the distribution around its x, y and z axes. The rotation \n"
+        "        center is the distribution center. rx, ry and rz are in radians.\n"
+        "\n"
+        "  --extra-length length\n"
+        "    Length to be added to the final box width.\n"
+        "\n"
+        "  --help\n"
+        "    Print this message."
+        "\n"
+        ;
+}
+
+
+using FReal = double;
+
+struct Particle { // One record as filled by the loader and written back to the output
+    FPoint<FReal> pos; ///< Position (x, y, z)
+    FReal val;         ///< Value stored with the particle; written as the 4th field of each output record
+};
+
+/// Placement options of one input distribution, filled from a --file group
+struct distribution {
+    /// Distribution filename; empty until one is parsed
+    std::string filename = "";
+    /// Position given to the distribution center in the fused output (-c, shifted per cell by -g)
+    FPoint<FReal> offset = {0,0,0};
+    /// Rotation angles around the x, y and z axes, in radians per the help text (-r)
+    FPoint<FReal> rot = {0,0,0};
+    /// Scale factor applied to the centered positions (-s)
+    FReal scale = 1;
+};
+
+
+
+struct parameters { // Result of parsing the whole command line
+    std::string output_filename;             ///< Output file name (-fout)
+    std::vector<distribution> distributions; ///< One entry per --file, or one per grid cell when -g is used
+    FReal extra_length = 0;                  ///< --extra-length value; defaults to 0 because main reads it even when the option is absent
+};
+
+/// Parses one --file group starting at args[i] (the argument after "--file") into one distribution
+/// per grid cell; i is left so that the caller's ++i reaches the first argument outside the group.
+std::vector<distribution> subparse_file(const std::vector<std::string>& args, std::size_t& i) {
+    std::stringstream sstr;
+    unsigned int gx = 1, gy = 1, gz = 1; // Grid layout
+    std::vector<distribution> distributions; // Final distributions, one per grid part
+    distribution dist; // Distributions options
+    char c; // Used to discard the ':' from argument format
+
+    while(i < args.size()) {
+        sstr.clear();
+        if(args[i] == "--file") {
+            // Next group: step back so the caller's ++i lands on "--file" instead of skipping it
+            --i;
+            break;
+        } else if(args[i] == "-s") {
+            ++i;
+            sstr.str(args.at(i));
+            sstr >> dist.scale;
+        } else if (args[i] == "-c") {
+            ++i;
+            sstr.str(args.at(i));
+            sstr >> dist.offset[0] >> c >> dist.offset[1] >> c >> dist.offset[2];
+        } else if(args[i] == "-g") {
+            ++i;
+            sstr.str(args.at(i));
+            sstr >> gx >> c >> gy >> c >> gz;
+        } else if(args[i] == "-r") {
+            ++i;
+            sstr.str(args.at(i));
+            sstr >> dist.rot[0] >> c >> dist.rot[1] >> c >> dist.rot[2];
+        } else {
+            if(dist.filename != "") {
+                // Second positional argument: not part of this group, hand it back to the caller
+                --i;
+                break;
+            }
+            dist.filename = args[i];
+        }
+        ++i;
+    }
+
+    if(gx > 1 || gy > 1 || gz > 1) {
+        // Compute offset of lowest left grid offset
+        FFmaGenericLoader<FReal> loader(dist.filename);
+        FReal box_width = loader.getBoxWidth() * dist.scale;
+        dist.offset[0] -= (gx-1) * box_width / 2;
+        dist.offset[1] -= (gy-1) * box_width / 2;
+        dist.offset[2] -= (gz-1) * box_width / 2;
+
+        // Create one distribution for each part of the grid layout
+        for(unsigned int x = 0; x < gx; ++x) {
+            for(unsigned int y = 0; y < gy; ++y) {
+                for(unsigned int z = 0; z < gz; ++z) {
+                    distribution tmp_dist = dist;
+                    tmp_dist.offset[0] += x * box_width;
+                    tmp_dist.offset[1] += y * box_width;
+                    tmp_dist.offset[2] += z * box_width;
+                    distributions.push_back(tmp_dist);
+                }
+            }
+        }
+    } else {
+        distributions.push_back(dist);
+    }
+
+    return distributions;
+}
+
+
+/// Builds the program parameters from the command line (args[0] is the program name).
+/// Prints the help and exits on --help; unrecognised arguments are reported on stderr and ignored.
+parameters parse(const std::vector<std::string>& args) {
+    parameters params{}; // value-initialised: extra_length is 0 unless --extra-length is given
+    for(std::size_t i = 1; i < args.size(); ++i) {
+        if(args[i] == "--help") {
+            usage(args[0]);
+            exit(0);
+        } else if(args[i] == "--file") {
+            ++i;
+            auto ds = subparse_file(args, i);
+            params.distributions.insert(params.distributions.end(), ds.begin(), ds.end());
+        } else if(args[i] == "--extra-length") {
+            ++i;
+            // Fresh stream per value: a reused one keeps eofbit set and ignores a repeated option
+            std::istringstream(args.at(i)) >> params.extra_length;
+        } else if(args[i] == "-fout") {
+            ++i;
+            params.output_filename = args.at(i);
+        } else {
+            std::cerr << "Unknown or misplaced parameters: " << args[i] << '\n';
+        }
+    }
+    return params;
+}
+
+
+/// Rotates p.pos in place by dist.rot around the x, then y, then z axis through the origin.
+/// Angles within 1e-5 of zero are skipped.
+void rotate(Particle& p, const distribution& dist) {
+    if(dist.rot[0] > 1e-5 || dist.rot[0] < -1e-5) { // Rotate around x axis
+        // Keep the old y: overwriting it first fed the rotated y into the z formula
+        const FReal alpha = dist.rot[0], y = p.pos[1];
+        p.pos[1] = y * cos(alpha) - p.pos[2] * sin(alpha);
+        p.pos[2] = y * sin(alpha) + p.pos[2] * cos(alpha);
+    }
+    if(dist.rot[1] > 1e-5 || dist.rot[1] < -1e-5) { // Rotate around y axis
+        const FReal alpha = dist.rot[1], x = p.pos[0];
+        p.pos[0] =  x * cos(alpha) + p.pos[2] * sin(alpha);
+        p.pos[2] = -x * sin(alpha) + p.pos[2] * cos(alpha);
+    }
+    if(dist.rot[2] > 1e-5 || dist.rot[2] < -1e-5) { // Rotate around z axis
+        const FReal alpha = dist.rot[2], x = p.pos[0]; // rot[2], not rot[1], is the z angle
+        p.pos[0] = x * cos(alpha) - p.pos[1] * sin(alpha);
+        p.pos[1] = x * sin(alpha) + p.pos[1] * cos(alpha);
+    }
+}
+
+
+
+
+
+
+/// Fuses the distributions given on the command line and writes the result to the -fout file.
+int main(int argc, char** argv) {
+    auto params = parse({argv,argv+argc});
+    // Fail early if output file raises an error
+    FFmaGenericWriter<FReal> writer(params.output_filename);
+
+    // Fuse particle distributions
+    std::vector<Particle> particles;
+    FReal axis_max = 0;
+
+    for(distribution& dist : params.distributions) {
+        // Open the distribution file
+        FFmaGenericLoader<FReal> loader(dist.filename);
+        const std::size_t count = loader.getParticleCount();
+        // Grow the fused list once per file instead of once per particle
+        particles.reserve(particles.size() + count);
+
+        FPoint<FReal> center = loader.getBoxCenter();
+
+        // Temp particle
+        Particle p;
+        for(std::size_t i = 0; i < count; ++i) {
+            loader.fillParticle(&p.pos, &p.val);
+            // Move distribution center to origin
+            p.pos -= center;
+            // Scale distribution
+            p.pos *= dist.scale;
+            // Rotate distribution
+            rotate(p, dist);
+            // Move to new position
+            p.pos += dist.offset;
+            // Add particle to list
+            particles.push_back(p);
+
+            // Track the largest absolute coordinate over all axes to size the final box
+            axis_max = std::max(std::abs(p.pos[0]), axis_max);
+            axis_max = std::max(std::abs(p.pos[1]), axis_max);
+            axis_max = std::max(std::abs(p.pos[2]), axis_max);
+        }
+    }
+
+    // Write final distribution
+    FPoint<FReal> center(0,0,0);
+    // Box is centered on the origin: twice the largest coordinate plus the extra length
+    FReal box_width = 2 * (axis_max + params.extra_length);
+
+    // Write header
+    // NOTE(review): 8 and 4 are presumably bytes per value and values per particle - confirm against writeHeader
+    writer.writeHeader(center, box_width, particles.size(), 8, 4);
+
+    // Write all particles
+
+    // Buffer avoids duplicating particle vector
+    std::vector<FReal> buffer;
+    buffer.reserve(4*1024); // Avoid reallocations, size is a multiple of 4
+
+    auto cur = particles.begin();
+    auto sentinel = particles.end();
+
+    // Fill and write buffer until we're done
+    while(cur != sentinel) {
+        buffer.clear();
+        while(buffer.size() != buffer.capacity() && cur != sentinel) {
+            buffer.push_back(cur->pos[0]);
+            buffer.push_back(cur->pos[1]);
+            buffer.push_back(cur->pos[2]);
+            buffer.push_back(cur->val);
+            ++cur;
+        }
+        writer.writeArrayOfReal(buffer.data(), 4, buffer.size()/4); // 4 values per particle: x, y, z, val
+    }
+
+}
diff --git a/Examples/generateDistributions.cpp b/Examples/generateDistributions.cpp
index e06911f4a613dbb866756e3c70791025ccd92246..ff3deb9234808c2d2a12a841140c5d7da2f935bb 100644
--- a/Examples/generateDistributions.cpp
+++ b/Examples/generateDistributions.cpp
@@ -5,12 +5,12 @@
  *      Author: Olivier Coulaud
  */
 
-
+#include <algorithm>
 #include <iostream>
 #include <fstream>
 #include <sstream>
 #include <string>
-//
+
 #include "Utils/FGlobal.hpp"
 #include "Utils/FMath.hpp"
 #include "Utils/FPoint.hpp"
@@ -20,240 +20,244 @@
 
 #include "Utils/FParameterNames.hpp"
 
-//
-/// \file  generateDistributions.cpp
-//!
-//! \brief generateDistributions: Driver to generate N points (non)uniformly distributed on a given geometry
-//!
-//! The goal of this driver is to generate uniform or non uniform points on the following geometries
-//!
-//!   Uniform : cube, cuboid, sphere, prolate,
-//!
-//!   Non uniform : ellipsoid, prolate
-//!
-//!  You can set two kind of physical values depending of your problem. By default all values are between 0 and 1.
-//!   If you select the argument -charge (see bellow) the values are between -1 and 1.
-//!  The arguments available are
-//!
-//!  <b> General arguments:</b>
-//!     \param   -help (-h)      to see the parameters available in this driver
-//!     \param  -N     The number of points in the distribution (default 20000)
-//!     \param   -fout name: generic name for files (with extension) and save data
-//!                  with following format in name.fma or name.bfma in -bin is set"
-//!      \param  -fvisuout Filename for the visu file (vtk, vtp, cvs or cosmo). vtp is the default
-//!      \param -extraLength   value    extra length to add to the boxWidth (default 0.0)
-//!  <b> Geometry arguments:</b>
-//!      \param  -unitCube uniform distribution on unit cube
-//!      \param  -cube uniform distribution on a cube
-//!          \arg         -length  R - default value for R is 2.0
-//!      \param  -unitSphere uniform distribution on unit sphere
-//!      \param  -sphere  uniform distribution on  sphere of radius given by
-//!          \arg         -radius  R - default value for R is 2.0
-//!        \param   -ellipsoid non uniform distribution on  an ellipsoid of aspect ratio given by
-//!              \arg          -size a:b:c   with a, b and c > 0
-//!         \param  -prolate ellipsoid with aspect ratio a:a:c  given by
-//!                \arg             -size a:a:c   with  c > a > 0
-//!          \param   -plummer (Highly non uniform) plummer distribution (astrophysics)
-//!                   \arg         -radius  R - default value 10.0"
-//!
-//!
-//!  <b> Physical values argument:</b>
-//!         \param -charge generate physical values between -1 and 1 otherwise generate between 0 and 1
-//!         \param -zeromean  the average of the physical values is zero
-//!
-//!
-//! \b examples
-//!
-//!   generateDistributions -prolate -size 2:2:4   -N 20000 -fout prolate
-//!
-//! or
-//!
-//!  generateDistributions -cuboid 2:2:4 -N 100000 -fout cuboid.bfma  -fvisuout cuboid.vtp -charge  -zeromean
-//!
+/**
+ * \file
+ *
+ * \brief Generates points (non)uniformly distributed on a given geometry
+ *
+ * The goal of this driver is to generate uniform or non uniform points on the
+ * following geometries
+ *
+ *   - Uniform : cube, cuboid, sphere, prolate,
+ *   - Non uniform : ellipsoid, prolate
+ *
+ *  You can set two kinds of physical values depending on your problem. By
+ *   default all values are between 0 and 1.  If you select the argument -charge
+ *   (see below) the values are between -1 and 1.  The available arguments are
+ *
+ * <b> General arguments:</b>
+ * \param -help (-h)      to see the parameters available in this driver
+ * \param -N     The number of points in the distribution (default 20000)
+ * \param -fout name: generic name for the output file (with extension); data are
+ *                   saved in name.fma, or in name.bfma if -bin is set
+ * \param -fvisuout Filename for the visu file (vtk, vtp, csv or cosmo). vtp is
+ *                  the default
+ * \param -extraLength value extra length to add to the boxWidth (default 0.0)
+ * <b> Geometry arguments:</b>
+ * \param -unitCube uniform distribution in unit cube
+ * \param -cuboid uniform distribution in a rectangular cuboid of size given by
+ *     \arg -size LX:LY:LZ - default values are 1.0:1.0:2.0
+ * \param -unitSphere uniform distribution on unit sphere
+ * \param -sphere uniform distribution on sphere of radius given by
+ *     \arg -radius R - default value for R is 2.0
+ * \param -ball uniform distribution in ball of radius given by
+ *     \arg -radius R - default value for R is 2.0
+ * \param -ellipsoid non uniform distribution on an ellipsoid of aspect ratio
+ *                   given by
+ *     \arg -size a:b:c with a, b and c > 0
+ * \param -prolate ellipsoid with aspect ratio a:a:c given by
+ *     \arg -size a:a:c with c > a > 0
+ * \param -plummer (Highly non uniform) plummer distribution (astrophysics)
+ *     \arg -radius R - default value 10.0
+ *
+ *
+ * <b> Physical values argument:</b>
+ * \param -charge generate physical values between -1 and 1 otherwise generate between 0 and 1
+ * \param -zeromean  the average of the physical values is zero
+ *
+ *
+ * <b> examples</b>
+ *
+ *    generateDistributions -prolate -size 2:2:4   -N 20000 -fout prolate
+ *
+ *  or
+ *
+ *    generateDistributions -cuboid -size 2:2:4 -N 100000 -fout cuboid.bfma  -fvisuout cuboid.vtp -charge  -zeromean
+ *
+ */
 
-int main(int argc, char ** argv){
-    const FParameterNames LocalOptionEllipsoid = {{"-ellipsoid"} ,
-        " non uniform distribution on  an ellipsoid of aspect ratio given by -size a:b:c   with a, b and c > 0"},
-    LocalOptionUnitCube ={ {"-unitCube"} ,
-            " uniform distribution on unit cube"},
-    LocalOptionCube ={ {"-cuboid"} ,
-            " uniform distribution on rectangular cuboid of size -lengths a:b:c  - default values are 1.0:1.0:2.0 "},
-	LocalOptionSize ={{"-size"} ,
-		            " Size of the geometry a:b:c  - default values are 1.0:1.0:2.0"},
-	LocalOptionUnitSphere ={ {"-unitSphere"} ,
-				            " uniform distribution on unit sphere"},
-	LocalOptionSphere ={ {"-sphere"} ,
-						 " uniform distribution on  sphere of radius given by -radius  R - default value for R is 2.0"},
-	LocalOptionProlate ={ {"-prolate"} ," ellipsoid with aspect ratio a:a:cs given by  -size a:a:c   with  c > a > 0"},
-	LocalOptionPlummer ={ {"-plummer"} ," (Highly non uniform) plummer distribution (astrophysics)  -radius  R - default value 10.0"},
-	LocalOptionRadius ={ {"-radius"} ,
-						" used to specified the radius of the sphere an dthe plummer distribution or  R - default value for R is 2.0"},
-	LocalOptionCharge ={{"-charge"} ," generate physical values between -1 and 1 otherwise generate between 0 and 1"},
-	LocalOptionZM ={{"-zeromean"} , " the average of the physical values is zero"},
-	LocalOptionEL ={{"-extraLength"} ,
-		            " -extraLength   value    extra length to add to the boxWidth"};
-;
 
-    FHelpDescribeAndExit(argc, argv,
-                         ">> Driver to generate N points (non)uniformly distributed on a given geometry.\n"
-                         "Options  \n"
-                         "   -help       to see the parameters    ",
-                          FParameterDefinitions::OutputFile,
-						  FParameterDefinitions::NbParticles,FParameterDefinitions::OutputVisuFile,LocalOptionUnitCube,LocalOptionCube,
-						  LocalOptionUnitSphere,LocalOptionSphere,LocalOptionRadius,LocalOptionEllipsoid,LocalOptionProlate,LocalOptionSize,
-						  LocalOptionPlummer,LocalOptionCharge,LocalOptionZM,LocalOptionEL);
+namespace Param { // Driver-specific command line options: {option names}, help text shown by -help.
+    const FParameterNames Ellipsoid
+    = {{"-ellipsoid"}, "non uniform distribution on an ellipsoid of aspect ratio given by -size a:b:c with a, b and c > 0"};
+    const FParameterNames UnitCube
+    = {{"-unitCube"}, "uniform distribution on unit cube"};
+    const FParameterNames Cube
+    = {{"-cuboid"}, "uniform distribution on rectangular cuboid of size -size a:b:c - default values are 1.0:1.0:2.0 "};
+    const FParameterNames UnitSphere
+    = {{"-unitSphere"}, "uniform distribution on unit sphere"};
+    const FParameterNames Ball
+    = {{"-ball"}, "uniform distribution in a ball of radius given by -radius R - default value for R is 2.0"};
+    const FParameterNames Sphere
+    = {{"-sphere"}, "uniform distribution on sphere of radius given by -radius R - default value for R is 2.0"};
+    const FParameterNames Prolate
+    = {{"-prolate"}, "ellipsoid with aspect ratio a:a:c given by -size a:a:c with c > a > 0"};
+    const FParameterNames Plummer // main() reads -radius once with a 2.0 default for every geometry, Plummer included.
+    = {{"-plummer"}, "(Highly non uniform) plummer distribution (astrophysics) -radius R - default value 2.0"};
+    const FParameterNames Size
+    = {{"-size"}, "Size of the geometry a:b:c - default values are 1.0:1.0:2.0"};
+    const FParameterNames Radius // Shared by the -ball, -sphere and -plummer geometries.
+    = {{"-radius"}, "radius R of the ball, the sphere or the plummer distribution - default value for R is 2.0"};
+    const FParameterNames Charge
+    = {{"-charge"}, "generate physical values between -1 and 1 otherwise generate between 0 and 1"};
+    const FParameterNames ZM
+    = {{"-zeromean"}, "the average of the physical values is zero"};
+    const FParameterNames EL
+    = {{"-extraLength"}, "-extraLength value extra length to add to the boxWidth"};
+} // namespace Param
+
+#define getParamV(param, fallback) /* value of option `param`, else `fallback`; uses the caller's argc/argv */ \
+    FParameters::getValue(argc, argv, (param).options, (fallback))
+
+#define getParamS(param, fallback) /* string of option `param`, else `fallback`; uses the caller's argc/argv */ \
+    FParameters::getStr(argc, argv, (param).options, (fallback))
+
+int main(int argc, char ** argv){
 
+    FHelpDescribeAndExit(
+        argc, argv,
+        ">> Driver to generate N points (non)uniformly distributed on a given geometry.\n"
+        "Options  \n"
+        "   -help       to see the parameters    ",
+        FParameterDefinitions::OutputFile, FParameterDefinitions::NbParticles,
+        FParameterDefinitions::OutputVisuFile,
+        Param::UnitCube, Param::Cube,      Param::UnitSphere, Param::Sphere,
+        Param::Radius,   Param::Ellipsoid, Param::Prolate,    Param::Plummer,
+        Param::Ball,
+        Param::Charge,   Param::ZM,        Param::EL,    Param::Size
+        );
 
-    
-    typedef double FReal;
-    FReal       extraRadius = 0.000 ;
+    using FReal = double;
 
-    const FSize NbPoints  = FParameters::getValue(argc,argv,FParameterDefinitions::NbParticles.options,   FSize(20000));
-    const std::string genericFileName(FParameters::getStr(argc,argv,FParameterDefinitions::OutputFile.options,   "unifPointDist"));
+    const FSize NbPoints = getParamV(FParameterDefinitions::NbParticles, FSize(20000));
 
+    FReal extraRadius = 0.000;
     FReal BoxWith = 0.0;
     FPoint<FReal> Centre(0.0, 0.0,0.0);
-	//
-	// Allocation
-	//
-	FReal * particles ;
-	particles = new FReal[4*NbPoints] ;
-	memset(particles,0,4*NbPoints*sizeof(FReal));
-    FmaRWParticle<FReal, 4,4> *ppart = (FmaRWParticle<FReal, 4,4>*)(&particles[0]);
 
-	//
-	// Generate physical values
-	//
+    // Allocate particle array
+    FReal * particles;
+    particles = new FReal[4*NbPoints] ;
+    memset(particles, 0, 4*NbPoints*sizeof(FReal));
+    FmaRWParticle<FReal, 4, 4>* ppart = (FmaRWParticle<FReal, 4, 4>*)(&particles[0]);
+
+    // Generate physical values
+    FReal sum = 0;
+    FReal a = 1.0;
+    FReal b = 0.0;
+    if(FParameters::existParameter(argc, argv, "-charge")){
+        a = 2.0; b = -1.0;
+    }
 
-	FReal phyVal, sum,a,b ;
-	if(FParameters::existParameter(argc, argv, "-charge")){
-		a= 2.0 ; b = -1.0 ;
-	}
-	else {
-		a= 1.0 ; b = 0.0 ;
-	}
-	sum = 0.0 ;
-	int j = 3 ;
-	for(int i = 0 ; i< NbPoints; ++i, j+=4){
-        phyVal            = a*getRandom<FReal>() +b  ;
-		sum              += phyVal ;
-		particles[j]       = phyVal ;
-	}
-	if(FParameters::existParameter(argc, argv, "-zeromean")){
-        FReal  rm = FReal(sum)/FReal(NbPoints) ; sum = 0.0 ;
-		j = 3 ;
-		for(int i = 0 ; i< NbPoints; ++i, j+=4){
-			particles[j]    -= rm ;
-			sum              += particles[j]   ;
-		}
-	}
-    std::cout << "Sum physical value "<< sum << "   Mean Value " << sum/FReal(NbPoints)<<std::endl ;
-	//
-	// Point  generation
-	//
-	if(FParameters::existParameter(argc, argv, "-unitCube")){
-		unifRandonPointsOnUnitCube(NbPoints, particles) ;
-		Centre.setPosition(0.5,0.5,0.5);
-		BoxWith = 1.0 ;
-		std::cout << "Unit cube "<<std::endl;
-	}
-	else if(FParameters::existParameter(argc, argv, "-cuboid")){
-		std::string  dd(":"),aspectRatio  = FParameters::getStr(argc,argv,"-size",  "1:1:2");
-		FReal A,B,C ;
-		size_t pos = aspectRatio.find(":");		aspectRatio.replace(pos,1," ");
-		pos = aspectRatio.find(":");		         aspectRatio.replace(pos,1," ");
-		std::stringstream ss(aspectRatio); ss >>A >> B >> C ;
-		unifRandonPointsOnCube(NbPoints, A,B,C,particles) ;
-		BoxWith = FMath::Max(A,FMath::Max(B,C) );
-		FReal halfBW = BoxWith*0.5;
-		Centre.setPosition(halfBW,halfBW,halfBW);
-		std::cout << "Cuboid "<< A << ":"<< B<<":"<<C<<std::endl;
-	}
-	else if(FParameters::existParameter(argc, argv, "-unitSphere")){
-		unifRandonPointsOnUnitSphere(NbPoints, particles) ;
-		BoxWith = 2.0 ;
-	}
-	else if(FParameters::existParameter(argc, argv, "-sphere")){
-		const FReal Radius  = FParameters::getValue(argc,argv,"-radius",  2.0);
-		unifRandonPointsOnSphere(NbPoints, Radius,particles) ;
-		BoxWith = 2.0*Radius ;
-		std::cout << "Sphere radius: "<<Radius<<std::endl;
-	}
-	else if(FParameters::existParameter(argc, argv, "-prolate")){
-		std::string  dd(":"),aspectRatio  = FParameters::getStr(argc,argv,"-size",  "1:1:2");
-		FReal A,B,C ;
-		size_t pos = aspectRatio.find(":");		aspectRatio.replace(pos,1," ");
-		pos = aspectRatio.find(":");		aspectRatio.replace(pos,1," ");
-		std::stringstream ss(aspectRatio); ss >>A >> B >> C ;
-		if(A != B){
-			std::cerr << " A /= B in prolate ellipsoide A =B. Your aspect ratio: "<< aspectRatio<<std::endl;
-		}
-		std::cout << "Prolate A: "<<A<<" B: "<< B << " C: " << C<<std::endl;
-		unifRandonPointsOnProlate(NbPoints,A,C,particles);
-		BoxWith =  2.0*C;
-	}    //const FSize NbPoints  = FParameters::getValue(argc,argv,FParameterDefinitions::NbParticles.options,   FSize(20000));
-    else if(FParameters::existParameter(argc, argv, "-hyperpara")){
-        std::string  dd(":"),aspectRatio  = FParameters::getStr(argc,argv,"-size",  "1:1:2");
-        FReal A,B,C ;
-        size_t pos = aspectRatio.find(":");     aspectRatio.replace(pos,1," ");
-        pos = aspectRatio.find(":");        aspectRatio.replace(pos,1," ");
-        std::stringstream ss(aspectRatio); ss >>A >> B >> C ;
-        unifRandonPointsOnHyperPara(NbPoints,A,B,C,particles);
-        BoxWith =  2.0*FMath::Max( A,FMath::Max( B,C)) ;
-		std::cout << "Hyperpara "<< A << ":"<< B<<":"<<C<<std::endl;
-        std::cout << "BoxWith: " << BoxWith<<std::endl;
+    for(int i = 0, j = 3 ; i< NbPoints; ++i, j+=4){
+        particles[j] = a * getRandom<FReal>() + b;
+        sum += particles[j] ;
+    }
 
+    if(FParameters::existParameter(argc, argv, "-zeromean")){
+        FReal rm = FReal(sum) / FReal(NbPoints) ;
+        sum -= static_cast<FReal>(NbPoints) * rm;
+        for(int i = 0, j = 3 ; i< NbPoints; ++i, j+=4){
+            particles[j] -= rm ;
+        }
+    }
+
+    std::cout << "Physical value sum: " << sum
+              << " mean: " << sum / FReal(NbPoints)
+              << std::endl;
+
+    // Read arguments
+    // Radius
+    const FReal Radius = getParamV(Param::Radius,  2.0);
+    // Aspect ratio
+    std::string aspectRatio = getParamS(Param::Size, "1:1:2");
+    std::replace(aspectRatio.begin(), aspectRatio.end(), ':', ' ');
+    FReal A, B, C;
+    std::stringstream(aspectRatio) >> A >> B >> C;
+
+    // Point  generation
+    if(FParameters::existParameter(argc, argv, "-unitCube")) {
+        unifRandomPointsInCube<FReal>(NbPoints, 1, 1, 1, particles);
+        Centre.setPosition(0.5,0.5,0.5);
+        BoxWith = 1.0;
+        std::cout << "Unit cube "<< std::endl;
+    }
+    else if(FParameters::existParameter(argc, argv, "-ball")) {
+        unifRandomPointsInBall<FReal>(NbPoints, Radius, particles);
+        BoxWith = 2.0 * Radius;
+        std::cout << "Ball radius: " << Radius << std::endl;
+    }
+    else if(FParameters::existParameter(argc, argv, "-cuboid")) {
+        unifRandomPointsInCube(NbPoints, A, B, C, particles);
+        BoxWith = FMath::Max(A, FMath::Max(B,C));
+        FReal halfBW = BoxWith * 0.5;
+        Centre.setPosition(halfBW, halfBW, halfBW);
+        std::cout << "Cuboid: "<< A << ":" << B << ":" << C << std::endl;
+    }
+    else if(FParameters::existParameter(argc, argv, "-unitSphere")) {
+        unifRandomPointsOnSphere<FReal>(NbPoints, 1.0, particles);
+        BoxWith = 2.0;
+    }
+    else if(FParameters::existParameter(argc, argv, "-sphere")) {
+        unifRandomPointsOnSphere(NbPoints, Radius, particles);
+        BoxWith = 2.0 * Radius;
+        std::cout << "Sphere radius: " << Radius << std::endl;
+    }
+    else if(FParameters::existParameter(argc, argv, "-prolate")) {
+        if(A != B){
+            std::cerr << " A != B in prolate ellipsoid. Your aspect ratio: "
+                      << aspectRatio << std::endl;
+        }
+        std::cout << "Prolate A: " << A << " B: " << B << " C: " << C << std::endl;
+        unifRandomPointsOnProlate(NbPoints, A, C, particles);
+        BoxWith = 2.0 * C;
+    }
+    else if(FParameters::existParameter(argc, argv, "-hyperpara")) {
+        unifRandomPointsOnHyperPara(NbPoints, A, B, C, particles);
+        BoxWith = 2.0 * FMath::Max(A, FMath::Max(B, C));
+        std::cout << "Hyperpara "<< A << ":"<< B<<":"<<C<<std::endl;
+        std::cout << "BoxWith: " << BoxWith << std::endl;
+
+    }
+    else if(FParameters::existParameter(argc, argv, "-ellipsoid")){
+        nonunifRandomPointsOnElipsoid(NbPoints, A, B, C, particles);
+        BoxWith =  2.0 * FMath::Max(A, FMath::Max(B, C));
+        std::cout << "Ellipsoid " << A << ":" << B << ":" << C << std::endl;
+    }
+    else if(FParameters::existParameter(argc, argv, "-plummer")){
+        unifRandomPlummer(NbPoints, Radius, particles);
+        BoxWith = 2.0 * Radius;
+        std::cout << "Plummer radius: " << Radius << std::endl;
+    }
+    else {
+        std::cout << "Bad geometry option"<< std::endl;
+        exit(-1);
     }
-	else if(FParameters::existParameter(argc, argv, "-ellipsoid")){
-//		else if(FParameters::existParameter(argc, argv, "-ellipsoid")){
-		std::string  dd(":"),aspectRatio  = FParameters::getStr(argc,argv,"-size",  "1:1:2");
-//		std::string  dd(":"),aspectRatio  = FParameters::getStr(argc,argv,"-ar",  "1:1:2");
-		FReal A,B,C ;
-		size_t pos = aspectRatio.find(":");		aspectRatio.replace(pos,1," ");
-		pos = aspectRatio.find(":");		aspectRatio.replace(pos,1," ");
-		std::stringstream ss(aspectRatio); ss >>A >> B >> C ;
-		nonunifRandonPointsOnElipsoid(NbPoints,A,B,C,particles);
-		BoxWith =  2.0*FMath::Max( A,FMath::Max( B,C)) ;
-		std::cout << "Ellipsoid "<< A << ":"<< B<<":"<<C<<std::endl;
-	}
-	else if(FParameters::existParameter(argc, argv, "-plummer")){
-		const FReal Radius  = FParameters::getValue(argc,argv,"-radius",  10.0);
-		unifRandonPlummer(NbPoints, Radius, sum, particles) ;
-		BoxWith = 2.0*Radius ;
-		std::cout << "Plummer radius: "<<Radius<<std::endl;
-	}
 
-	else {
-		std::cout << "Bad geometry option"<< std::endl;
-		exit(-1) ;
-	}
     /////////////////////////////////////////////////////////////////////////
-	//                                           Save data
+    //                                           Save data
     /////////////////////////////////////////////////////////////////////////
-	//
     //  Generate FMA file for FFmaGenericLoader<FReal> Loader
-	//
-	if(FParameters::existParameter(argc, argv, "-extraLength")){
-		extraRadius  = FParameters::getValue(argc,argv,"-extraLength",  0.0);
-		BoxWith += 2*extraRadius ;
-	}
-	std::string name(genericFileName);
-	std::cout << "Write "<< NbPoints <<" Particles in file " << name << std::endl;
-    FFmaGenericWriter<FReal>  writer(name) ;
-	writer.writeHeader(Centre,BoxWith, NbPoints, *ppart) ;
-	writer.writeArrayOfParticles(ppart, NbPoints);
-	std::cout << "    End of writing "<<std::endl;
-
-	//
-	//  Generate  file for visualization
-	//
-    if(FParameters::existParameter(argc, argv, FParameterDefinitions::OutputVisuFile.options)){
-        std::string visufile(FParameters::getStr(argc,argv,FParameterDefinitions::OutputVisuFile.options,   "output.vtp"));
-         driverExportData(visufile, particles , NbPoints);
-	}
-	//
-	delete [] particles ;
+    if(FParameters::existParameter(argc, argv, "-extraLength")){
+        extraRadius = FParameters::getValue(argc, argv, "-extraLength",  0.0);
+        BoxWith += 2 * extraRadius;
+    }
+    const std::string name(getParamS(FParameterDefinitions::OutputFile, "unifPointDist"));
+    std::cout << "Write "<< NbPoints <<" particles to '" << name << "'" << std::endl;
+    FFmaGenericWriter<FReal> writer(name);
+    writer.writeHeader(Centre, BoxWith, NbPoints, *ppart);
+    writer.writeArrayOfParticles(ppart, NbPoints);
+    std::cout << "End of writing" <<std::endl;
 
-	//
-	return 1;
+    //  Generate  file for visualization
+//    if(FParameters::existParameter(argc, argv, FParameterDefinitions::OutputVisuFile.options)){
+//        std::string outfilename(FParameters::getStr(argc,argv,FParameterDefinitions::OutputFile.options,   "output.vtp"));
+//        driverExportData(outfilename, particles , NbPoints,loader.getNbRecordPerline() );
+//    }
+    if(FParameters::existParameter(argc, argv, FParameterDefinitions::OutputVisuFile.options)) {
+        std::string visufile(FParameters::getStr(argc, argv, FParameterDefinitions::OutputVisuFile.options, "output.vtp"));
+        driverExportData(visufile, particles , NbPoints);
+    }
+    //
+    delete [] particles;
 }
diff --git a/Src/Arranger/FOctreeArranger.hpp b/Src/Arranger/FOctreeArranger.hpp
index d5fdec3151824adcdd23a8306aee5dde35d199ea..30b453ecad89cac33e28f28ea29c8591a0880ead 100644
--- a/Src/Arranger/FOctreeArranger.hpp
+++ b/Src/Arranger/FOctreeArranger.hpp
@@ -85,7 +85,7 @@ public:
                     const MortonIndex particuleIndex = tree->getMortonFromPosition(currentPart);
                     if(particuleIndex != currentMortonIndex){
                         //Need to move this one
-                        interface->removeFromLeafAndKeep(particles,currentPart,idxPart,FParticleTypeSource);
+                        interface->removeFromLeafAndKeep(particles,currentPart,idxPart,FParticleType::FParticleTypeSource);
                     }
                     else{
                         //Need to increment idx;
@@ -102,7 +102,7 @@ public:
                         const MortonIndex particuleIndex = tree->getMortonFromPosition(currentPart);
                         if(particuleIndex != currentMortonIndex){
                             //Need to move this one
-                            interface->removeFromLeafAndKeep(particleTargets,currentPart,idxPart, FParticleTypeTarget);
+                            interface->removeFromLeafAndKeep(particleTargets,currentPart,idxPart, FParticleType::FParticleTypeTarget);
                         }
                         else{
                             //Need to increment idx;
diff --git a/Src/Arranger/FParticleTypedIndexedMover.hpp b/Src/Arranger/FParticleTypedIndexedMover.hpp
index e4e1b8a5591ab3006b35bc595e133f72f1fa2e01..a8286323d464b1cfb822cfdc5fe2651cddf3ba02 100644
--- a/Src/Arranger/FParticleTypedIndexedMover.hpp
+++ b/Src/Arranger/FParticleTypedIndexedMover.hpp
@@ -33,11 +33,11 @@ public:
         for(int idxAttr = 0 ; idxAttr < ContainerClass::NbAttributes ; ++idxAttr){
             particleValues[idxAttr] = lf->getAttribute(idxAttr)[idxPart];
         }
-        if(type == FParticleTypeTarget){
-            toStoreRemovedTargetParts.push(particlePos,FParticleTypeTarget,lf->getIndexes()[idxPart],particleValues);
+        if(type == FParticleType::FParticleTypeTarget){
+            toStoreRemovedTargetParts.push(particlePos,FParticleType::FParticleTypeTarget,lf->getIndexes()[idxPart],particleValues);
         }
         else{
-            toStoreRemovedSourceParts.push(particlePos,FParticleTypeSource,lf->getIndexes()[idxPart],particleValues);
+            toStoreRemovedSourceParts.push(particlePos,FParticleType::FParticleTypeSource,lf->getIndexes()[idxPart],particleValues);
         }
         lf->removeParticles(&idxPart,1);
     }
@@ -53,7 +53,7 @@ public:
             const FPoint<FReal> particlePos(toStoreRemovedSourceParts.getPositions()[0][idxToInsert],
                                      toStoreRemovedSourceParts.getPositions()[1][idxToInsert],
                                      toStoreRemovedSourceParts.getPositions()[2][idxToInsert]);
-            tree->insert(particlePos, FParticleTypeSource, toStoreRemovedSourceParts.getIndexes()[idxToInsert], particleValues);
+            tree->insert(particlePos, FParticleType::FParticleTypeSource, toStoreRemovedSourceParts.getIndexes()[idxToInsert], particleValues);
         }
 
         for(FSize idxToInsert = 0; idxToInsert<toStoreRemovedTargetParts.getNbParticles() ; ++idxToInsert){
@@ -64,7 +64,7 @@ public:
                                      toStoreRemovedTargetParts.getPositions()[1][idxToInsert],
                                      toStoreRemovedTargetParts.getPositions()[2][idxToInsert]);
 
-            tree->insert(particlePos, FParticleTypeTarget, toStoreRemovedTargetParts.getIndexes()[idxToInsert], particleValues);
+            tree->insert(particlePos, FParticleType::FParticleTypeTarget, toStoreRemovedTargetParts.getIndexes()[idxToInsert], particleValues);
         }
 
         toStoreRemovedSourceParts.clear();
diff --git a/Src/Components/FParticleType.hpp b/Src/Components/FParticleType.hpp
index 89f799ccce20d1221d23fd5d37e0a1910df3623b..98d5a7bc6858d0b428568295ed809a8f5f700ab1 100644
--- a/Src/Components/FParticleType.hpp
+++ b/Src/Components/FParticleType.hpp
@@ -19,7 +19,7 @@
 /**
  * @brief The FParticleType enum is to make a difference between Target and Source (Tsm)
  */
-enum FParticleType {
+enum class FParticleType {
     FParticleTypeSource = 0,
     FParticleTypeTarget = 1
 };
diff --git a/Src/Components/FTypedLeaf.hpp b/Src/Components/FTypedLeaf.hpp
index f29332e5f5092c7a993e5cd2f692e046d6a9f0a3..b4510a24bed0f47ddb33739418974c762ac0de3d 100644
--- a/Src/Components/FTypedLeaf.hpp
+++ b/Src/Components/FTypedLeaf.hpp
@@ -51,8 +51,8 @@ public:
         */
     template<typename... Args>
     void push(const FPoint<FReal>& inParticlePosition, const FParticleType type, Args ... args){
-        if(type == FParticleTypeTarget) targets.push(inParticlePosition, FParticleTypeTarget, args...);
-        else sources.push(inParticlePosition, FParticleTypeSource, args...);
+        if(type == FParticleType::FParticleTypeTarget) targets.push(inParticlePosition, FParticleType::FParticleTypeTarget, args...);
+        else sources.push(inParticlePosition, FParticleType::FParticleTypeSource, args...);
     }
 
     /**
diff --git a/Src/Core/FFmmAlgorithmOmp4.hpp b/Src/Core/FFmmAlgorithmOmp4.hpp
index 54d288a0118b3cb9dbd48d0e25a1eb609ebacb40..b8bbd2790c270ed744516640c64006e5f2a3b542 100644
--- a/Src/Core/FFmmAlgorithmOmp4.hpp
+++ b/Src/Core/FFmmAlgorithmOmp4.hpp
@@ -1,6 +1,8 @@
 #ifndef FFMMALGORITHMOMP4_HPP
 #define FFMMALGORITHMOMP4_HPP
 
+#include <omp.h>
+
 #include "../Utils/FGlobal.hpp"
 #include "../Utils/FAssert.hpp"
 #include "../Utils/FLog.hpp"
@@ -1007,4 +1009,3 @@ protected:
 
 
 #endif // FFMMALGORITHMOMP4_HPP
-
diff --git a/Src/Files/FFmaGenericLoader.hpp b/Src/Files/FFmaGenericLoader.hpp
index f08e3fc729a43623155c0a4d8eb5c9943799d36c..8f5a8d6fe2e79d646abaa355c6d7f8b191b75fb6 100644
--- a/Src/Files/FFmaGenericLoader.hpp
+++ b/Src/Files/FFmaGenericLoader.hpp
@@ -4,13 +4,13 @@
 // This software is a computer program whose purpose is to compute the FMM.
 //
 // This software is governed by the CeCILL-C and LGPL licenses and
-// abiding by the rules of distribution of free software.  
-// 
+// abiding by the rules of distribution of free software.
+//
 // This program is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public and CeCILL-C Licenses for more details.
-// "http://www.cecill.info". 
+// "http://www.cecill.info".
 // "http://www.gnu.org/licenses".
 // ===================================================================================
 // author Berenger Bramas and Olivier Coulaud
@@ -36,7 +36,7 @@
 /** \brief Particle class used in FMA loader and writer.
  *
  *
- * The pieces of data are : PosX, PosY, PosZ, physicalValue, 
+ * The pieces of data are : PosX, PosY, PosZ, physicalValue,
  * Potential, forceX, forceY, forceZ. The first 4 are mandatory.
  * Data is stored as FReal.
  *
@@ -165,10 +165,10 @@ public:
  * example below shows how to use the loader to read from a file.
  *
  *
- * \code 
- * // Instanciate the loader with the particle file. 
- * FFmaGenericLoader<FReal> loader("../Data/unitCubeXYZQ20k.fma"); // extension fma -> ascii format 
- * // Retrieve the number of particles 
+ * \code
+ * // Instantiate the loader with the particle file.
+ * FFmaGenericLoader<FReal> loader("../Data/unitCubeXYZQ20k.fma"); // extension fma -> ascii format
+ * // Retrieve the number of particles
  * FSize nbParticles = loader.getNumberOfParticles();
  *
  * // Create an array of particles, initialize to 0.
@@ -190,7 +190,7 @@ public:
  * \endcode
  *
  * `DatatypeSize` can have one of two values:
- *  - 4, float ; 
+ *  - 4, float ;
  *  - 8, double.
  *
  * `Number_of_records_per_line` gives the data count for each line of
@@ -241,7 +241,7 @@ public:
      * @param binary   true if the file to open is in binary mode
      */
     FFmaGenericLoader(const std::string & filename,const bool binary ):
-        file(nullptr), binaryFile(binary), centerOfBox(0.0,0.0,0.0), boxWidth(0.0), 
+        file(nullptr), binaryFile(binary), centerOfBox(0.0,0.0,0.0), boxWidth(0.0),
         nbParticles(0), tmpVal(nullptr), otherDataToRead(0)
         {
             this->open_file(filename, binary);
@@ -254,7 +254,7 @@ public:
      *
      * - The opening mode is guessed from the file extension : `.fma` will open
      * in ASCII mode, `.bfma` will open in binary mode.
-     * - All information accessible in the header can be retreived after this call.  
+     * - All information accessible in the header can be retreived after this call.
      * - To test if the file has successfully been opened, call hasNotFinished().
      *
      * @param filename the name of the file to open. Must end with `.fma` or `.bfma`.
@@ -268,7 +268,7 @@ public:
             binaryFile = false;
         } else  {
             std::cout << "FFmaGenericLoader: "
-                      << "Only .fma or .bfma input file are allowed. Got " 
+                      << "Only .fma or .bfma input file are allowed. Got "
                       << filename << "."
                       << std::endl;
             std::exit ( EXIT_FAILURE) ;
@@ -297,10 +297,9 @@ public:
 
     /**
      * To get the number of particles from this loader
-     * @param the number of particles the loader can fill
      */
     FSize getNumberOfParticles() const{
-        return this->nbParticles;
+        return this->getParticleCount();
     }
 
     /**
@@ -308,8 +307,25 @@ public:
      * @return box center
      */
     FPoint<FReal> getCenterOfBox() const{
+        return this->getBoxCenter();
+    }
+
+    /**
+     * \brief Get the distribution particle count
+     * \return The distribution particle count
+     */
+    FSize getParticleCount() const {
+        return this->nbParticles;
+    }
+
+    /**
+     * \brief Get distribution center
+     * \return A point representing the box center
+     */
+    FPoint<FReal> getBoxCenter() const{
         return this->centerOfBox;
     }
+
     /**
      * The box width from the simulation file opened by the loader
      * @return box width
@@ -502,7 +518,7 @@ private:
 };
 
 
-/**\class FFmaGenericWriter
+/**
  * \warning This class only works in shared memory (doesn't work with MPI).
  *
  * \brief Writes a set of particles to an FMA formated file.
@@ -529,14 +545,13 @@ private:
  * \endcode
  *
  * `DatatypeSize` can have one of two values:
- *  - 4, float ; 
+ *  - 4, float;
  *  - 8, double.
  *
  * `Number_of_records_per_line` gives the data count for each line of
  * the `Particle_values`. For example :
- *  - 4, the particle values are X Y Z Q;
- *  - 8, the particle values are X Y Z Q  P FX FY FZ<br>
-
+ *  - 4, the particle values are `X Y Z Q`;
+ *  - 8, the particle values are `X Y Z Q  P FX FY FZ`.
  */
 template <class FReal>
 class FFmaGenericWriter {
@@ -642,9 +657,9 @@ public:
 
     /**
      * Writes the header of FMA file.
-     * 
+     *
      * Should be used if we write the particles with writeArrayOfReal method
-     * 
+     *
      * @param centerOfBox      The center of the Box (FPoint<FReal> class)
      * @param boxWidth         The width of the box
      * @param nbParticles      Number of particles in the box (or to save)
@@ -672,7 +687,7 @@ public:
      * @tparam dataPart   The class of the particle array.
      * @param dataToWrite Array of particles of type dataPart
      * @param N           Number of element in the array
-     * 
+     *
      * Example 1
      * \code
      * FmaRParticle *  particles = new FmaRParticle[nbParticles];
@@ -682,7 +697,7 @@ public:
      * Fwriter.writeHeader(Centre,BoxWith, nbParticles,*particles) ;
      * Fwriter.writeArrayOfParticles(particles, nbParticles);
      * \endcode
-     * 
+     *
      * Example2
      * \code
      * FReal *  particles = new FReal[4*NbPoints] ; // store 4 data per particle
@@ -736,13 +751,13 @@ public:
 
     /**
      *  Write an array of data in a file Fill
-     * 
+     *
      * @param dataToWrite array of particles of type FReal
      * @param nbData number of data per particle
      * @param N number of particles
-     * 
+     *
      *   The size of the array is N*nbData
-     * 
+     *
      *   example
      * \code
      * FmaRParticle * const particles = new FmaRParticle[nbParticles];
@@ -853,5 +868,3 @@ private:
 
 
 #endif //FFmaGenericLoader_HPP
-
-
diff --git a/Src/Files/FFmaTsmLoader.hpp b/Src/Files/FFmaTsmLoader.hpp
index eadf1137fa1db31fcc6329a7a3b37e072f42168d..d45a64d0f71dcbeba6e3c43c69e27cf2b9505890 100644
--- a/Src/Files/FFmaTsmLoader.hpp
+++ b/Src/Files/FFmaTsmLoader.hpp
@@ -131,8 +131,8 @@ public:
 
         inParticlePositions->setPosition(x,y,z);
         *inPhysicalValue = data;
-        if(isTarget) (*particleType) = FParticleTypeTarget;
-        else (*particleType) = FParticleTypeSource;
+        if(isTarget) (*particleType) = FParticleType::FParticleTypeTarget;
+        else (*particleType) = FParticleType::FParticleTypeSource;
     }
 
 };
diff --git a/Src/Files/FGenerateDistribution.hpp b/Src/Files/FGenerateDistribution.hpp
index 64d27f0d13173a045ce428a9064e11600dd02a20..7d31f63c16bbc81ff044b08deb23246954e29ac5 100644
--- a/Src/Files/FGenerateDistribution.hpp
+++ b/Src/Files/FGenerateDistribution.hpp
@@ -16,7 +16,12 @@
 #ifndef FGENERATEDISTRIBUTION_HPP
 #define FGENERATEDISTRIBUTION_HPP
 
-// @author O. Coulaud
+/**
+ * \file
+ * \brief Distribution generation implementations
+ * \author O. Coulaud
+ */
+
 
 #include <cstdlib>
 #include <ctime>
@@ -27,250 +32,273 @@
 #include "Utils/FMath.hpp"
 #include "Utils/FParameters.hpp"
 
-/**  return a random number between 0 and 1 */
-
+/**
+ * \brief Seed the random number generator using current time
+ */
 void initRandom() {
-	srand48( static_cast<long int>(time(nullptr))) ;
-} ;
-template <class FReal>
-FReal getRandom() {
-	return static_cast<FReal>(drand48());
-	//return static_cast<FReal>(rand()/FReal(RAND_MAX));
-} ;
-//!  \fn   unifRandonPointsOnUnitCube(const int N , FReal * points)
-
-//! \brief Generate N points uniformly distributed on the unit cube
+    srand48(static_cast<long int>(time(nullptr)));
+}
 
-//!
-//! \param N the number of points uniformly randomly sample on the unit cube
-//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0....
-//! \example  generateDistributions.cpp
+/**
+ * \brief Generate a random number
+ * \tparam FReal Floating point type
+ * \return A random number in [0,1)
+ */
 template <class FReal>
-void unifRandonPointsOnUnitCube(const FSize N , FReal * points) {
-	//
-	initRandom() ;
-	int j = 0;
-    for (FSize i = 0 ; i< N ; ++i, j+=4)  {
-		//
-        points[j]	  =	getRandom<FReal>()  ;
-        points[j+1] =	getRandom<FReal>()  ;
-        points[j+2] =	getRandom<FReal>()  ;
-		//
-	}
-};
-//!  \fn   unifRandonPointsOnCube(const int N , FReal * points)
-
-//! \brief Generate N points uniformly distributed on the cube of length R
+FReal getRandom() {
+    return static_cast<FReal>(drand48());
+}
 
-//!
-//! \param N the number of points uniformly randomly sample on the unit cube
-//! \param Lx the the X-length of the  cube
-//! \param Ly the the Y-length of the  cube
-//! \param Lz the the Z-length of the  cube
-//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0....
-//! \example  generateDistributions.cpp
+/**
+ * \brief Generate points uniformly inside a cuboid
+ *
+ * \tparam FReal Floating point type
+ *
+ * \param N the number of points to sample uniformly at random inside the cuboid
+ * \param Lx the X-length of the cuboid
+ * \param Ly the Y-length of the cuboid
+ * \param Lz the Z-length of the cuboid
+ * \param points array of size 4*N that stores the data as follows: x,y,z,0,x,y,z,0...
+ */
 template <class FReal>
-void unifRandonPointsOnCube(const FSize N , const FReal& Lx,  const FReal &Ly,  const FReal& Lz, FReal * points) {
-	//
-	unifRandonPointsOnUnitCube(N , points) ;
-    FSize j =0 ;
-    for (FSize i = 0 ; i< N ; ++i, j+=4)  {
-		points[j]	   *= Lx ;
-		points[j+1]  *= Ly ;
-		points[j+2]  *= Lz ;
-	}
-};
-//!  \fn   unifRandonPointsOnUnitSphere(const int N , FReal * points)
+void unifRandomPointsInCube(const FSize N, const FReal& Lx, const FReal& Ly,
+                            const FReal& Lz, FReal* points)
+{
+    initRandom();
+    for(FSize i = 0, j = 0 ; i< N ; ++i, j+=4)  {
+        points[j]   = getRandom<FReal>() * Lx;
+        points[j+1] = getRandom<FReal>() * Ly;
+        points[j+2] = getRandom<FReal>() * Lz;
+    }
+}
 
-//! \brief Generate N points uniformly distributed on the unit sphere
+/**
+ * \brief Generate points uniformly inside a ball
+ *
+ * \tparam FReal Floating point type
+ * \param N the number of points to generate
+ * \param R the ball radius
+ * \param points array of size 4*N that stores the data as follows: x,y,z,0,x,y,z,0...
+ */
+template<class FReal>
+void unifRandomPointsInBall(const FSize N, const FReal R, FReal* points) {
+    initRandom();
 
-//!
-//! \param N the number of points uniformly randomly sample on the unit sphere
-//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0....
-//! \example  generateDistributions.cpp
-template <class FReal>
-void unifRandonPointsOnUnitSphere(const FSize N , FReal * points) {
-	FReal u, v, theta, phi, sinPhi ;
-	//
-	initRandom() ;
-    FSize j = 0 ;
-    for (FSize i = 0 ; i< N ; ++i, j+=4)  {
-		//
-        u = getRandom<FReal>() ;  v = getRandom<FReal>() ;
-        theta  = FMath::FTwoPi<FReal>()*u ;
-		phi     = FMath::ACos(2*v-1);
-		sinPhi = FMath::Sin(phi);
-		//
-		points[j]	  =	FMath::Cos(theta)*sinPhi ;
-		points[j+1] =	FMath::Sin(theta)*sinPhi ;
-		points[j+2] =	2*v-1 ;
-		//
-	}
-};
-//!  \fn  nonunifRandonPointsOnElipsoid(const int N , const FReal &a, const FReal &b, const FReal &c, FReal * points)
+    auto is_in_sphere = [&R](FReal* p) {
+        return p[0]*p[0] + p[1]*p[1] + p[2]*p[2] < R*R;
+    };
 
-//! \brief  Generate N points non uniformly distributed on the ellipsoid of  aspect ratio a:b:c
+    for(FSize i = 0, j = 0 ; i< N ; ++i, j+=4)  {
+        do {
+            points[j]   = (getRandom<FReal>() - 0.5) * 2 * R;
+            points[j+1] = (getRandom<FReal>() - 0.5) * 2 * R;
+            points[j+2] = (getRandom<FReal>() - 0.5) * 2 * R;
+        } while(! is_in_sphere(points + j));
+    }
+}
 
-//!
-//! \param N the number of points
-//! \param a  the x  semi-axe length
-//! \param b  the y  semi-axe length
-//! \param c  the z  semi-axe length
-//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0....
-//!
+/**
+ * \brief Generate N points non uniformly distributed on the ellipsoid of aspect ratio a:b:c
+ *
+ * \tparam FReal Floating point type
+ *
+ * \param N the number of points
+ * \param a the x semi-axis length
+ * \param b the y semi-axis length
+ * \param c the z semi-axis length
+ * \param points array of size 4*N that stores the data as follows: x,y,z,0,x,y,z,0...
+ */
 template <class FReal>
-void nonunifRandonPointsOnElipsoid(const FSize N , const FReal &a, const FReal &b, const FReal &c, FReal * points) {
-	//
-	FReal u, v , cosu ;
-    FSize j =0 ;
-    for (FSize i = 0 ; i< N ; ++i, j+=4)  {
-        u = getRandom<FReal>() ;  v = getRandom<FReal>() ;
-        u  = FMath::FPi<FReal>()*u - FMath::FPiDiv2<FReal>();   v   = FMath::FTwoPi<FReal>()*v - FMath::FPi<FReal>();
-		cosu = FMath::Cos(u) ;
-		points[j]	   = a*cosu*FMath::Cos(v)  ;
-		points[j+1]  = b*cosu*FMath::Sin(v)  ;
-		points[j+2]  = c*FMath::Sin(u)  ;
-	}
-};
-//!  \fn  nonunifRandonPointsOnElipsoid(const int N , const FReal &a, const FReal &c, FReal * points)
+void nonunifRandomPointsOnElipsoid(const FSize N, const FReal& a, const FReal& b,
+                                   const FReal& c, FReal* points)
+{
+    FReal u, v, cosu;
+    for (FSize i = 0, j = 0 ; i< N ; ++i, j+=4)  {
+        u = FMath::FPi<FReal>() * getRandom<FReal>() - FMath::FPiDiv2<FReal>();
+        v = FMath::FTwoPi<FReal>() * getRandom<FReal>() - FMath::FPi<FReal>();
+        cosu = FMath::Cos(u);
+        points[j]   = a * cosu * FMath::Cos(v);
+        points[j+1] = b * cosu * FMath::Sin(v);
+        points[j+2] = c * FMath::Sin(u);
+    }
+}
 
-//! \brief  Generate N points uniformly distributed on the ellipsoid of  aspect ratio a:a:c
 
-//!
-//! \param N the number of points
-//! \param a  the x  semi-axe length
-//! \param c  the z  semi-axe length
-//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0....
-//!
+/**
+ * \brief Generate N points uniformly distributed on the ellipsoid of aspect ratio a:a:c
+ *
+ * \tparam FReal Floating point type
+ *
+ * \param N the number of points
+ * \param a the x semi-axis length
+ * \param c the z semi-axis length
+ * \param points array of size 4*N that stores the data as follows: x,y,z,0,x,y,z,0...
+ */
 template <class FReal>
-void unifRandonPointsOnProlate(const FSize N , const FReal &a, const FReal &c, FReal * points){
-	//
-	FReal u, w,v ,ksi ;
-	FReal e = (a*a*a*a)/(c*c*c*c) ;
-	bool isgood = false;
-    FSize j =0 , cpt =0 ;
-	//
-    for (FSize i = 0 ; i< N ; ++i, j+=4)  {
-		// Select a random point on the prolate
-		do {
-			cpt++	;
-            u = getRandom<FReal>() ;  v = getRandom<FReal>() ;
-            u  = 2.0*u - 1.0;   v   = FMath::FTwoPi<FReal>()*v;
-			w =FMath::Sqrt(1-u*u) ;
-			points[j]	   = a*w*FMath::Cos(v)  ;
-			points[j+1]  = a*w*FMath::Sin(v)  ;
-			points[j+2]  = c*u ;
-			// Accept the position ?
-            ksi = a*getRandom<FReal>()  ;
-			//			std::cout << "Gradf  "<<  points[j]*points[j] + points[j+1] *points[j+1]  +e*points[j+2] *points[j+2]  << std::endl;
-			isgood = (points[j]*points[j] + points[j+1] *points[j+1]  +e*points[j+2] *points[j+2]  < ksi*ksi );
-		} while (isgood);
-	}
-	std::cout.precision(4);
-    std::cout << "Total tested points: "<< cpt << " % of rejected points: "<<100*static_cast<FReal>(cpt-N)/static_cast<FReal>(cpt) << " %" <<std::endl;
-
-} ;
+void unifRandomPointsOnProlate(const FSize N, const FReal& a, const FReal& c,
+                               FReal* points)
+{
+    FReal u, w, v, ksi;
+    FReal e = (a*a*a*a)/(c*c*c*c);
+    bool isgood = false;
+    FSize cpt = 0;
 
-//!  \fn  unifRandonPointsOnHyperPara(const int N , const FReal &a, const FReal &b, const FReal &c, FReal * points)
+    for (FSize i = 0, j = 0 ; i< N ; ++i, j+=4)  {
+        // Select a random point on the prolate
+        do {
+            ++cpt;
+            u = 2.0 * getRandom<FReal>() - 1.0;
+            v = FMath::FTwoPi<FReal>() * getRandom<FReal>();
+            w = FMath::Sqrt(1 - u*u);
+            points[j]	= a * w * FMath::Cos(v);
+            points[j+1] = a * w * FMath::Sin(v);
+            points[j+2] = c * u;
+            // Accept the position ?
+            ksi = a * getRandom<FReal>();
+            isgood = (points[j]*points[j]
+                      + points[j+1]*points[j+1]
+                      + e*points[j+2]*points[j+2]) < ksi*ksi;
+        } while(isgood);
+    }
+    std::cout.precision(4);
+    std::cout << "Total tested points: " << cpt
+              << " % of rejected points: "
+              << 100 * static_cast<FReal>(cpt-N) / static_cast<FReal>(cpt) << " %"
+              << std::endl;
+}
 
-//! \brief  Generate N points uniformly distributed on the hyperbolic paraboloid of  aspect ratio a:b:c
 
-//!
-//! \param N the number of points
-//! \param a  the x  semi-axe length
-//! \param b  the y  semi-axe length
-//! \param c  the z  semi-axe length
-//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0....
-//!
+/**
+ * \brief Generate N points uniformly distributed on the hyperbolic paraboloid of aspect ratio a:b:c
+ *
+ * \tparam FReal Floating point type
+ *
+ * \param N the number of points
+ * \param a the x semi-axis length
+ * \param b the y semi-axis length
+ * \param c the z semi-axis length
+ * \param points array of size 4*N that stores the data as follows: x,y,z,0,x,y,z,0...
+ */
 template <class FReal>
-void unifRandonPointsOnHyperPara(const FSize N , const FReal &a, const FReal &b, const FReal &c, FReal * points) {
-    //
-    FReal u, v ;
-    FSize j =0 ;
-    for (FSize i = 0 ; i< N ; ++i, j+=4)  {
-        u = 2.0*getRandom<FReal>() - 1.0 ;  v = 2.0*getRandom<FReal>() - 1.0 ;
-        points[j]    = a*u ;
-        points[j+1]  = b*v ;
-        points[j+2]  = c*(u*u - v*v)  ;
+void unifRandomPointsOnHyperPara(const FSize N, const FReal &a, const FReal &b,
+                                 const FReal &c, FReal * points)
+{
+    FReal u, v;
+    for (FSize i = 0, j = 0 ; i< N ; ++i, j+=4)  {
+        u = 2.0 * getRandom<FReal>() - 1.0;
+        v = 2.0 * getRandom<FReal>() - 1.0;
+        points[j]   = a * u;
+        points[j+1] = b * v;
+        points[j+2] = c * (u*u - v*v);
     }
 };
 
 
-//!  \fn  unifRandonPointsOnSphere(const int N , const FReal R, FReal * points)
-
-//! \brief Generate N points uniformly distributed on the sphere of radius R
-
-//!
-//! \param N the number of points uniformly randomly sample on the sphere
-//! \param R the radius of the sphere
-//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0....
-//!
+/**
+ * \brief Generate N points uniformly distributed on the sphere of radius R
+ *
+ * \tparam FReal Floating point type
+ *
+ * \param N the number of points to sample uniformly at random on the sphere
+ * \param R the radius of the sphere
+ * \param points array of size 4*N that stores the data as follows: x,y,z,0,x,y,z,0...
+ */
 template <class FReal>
-void unifRandonPointsOnSphere(const FSize N , const FReal R, FReal * points) {
-	//
-	unifRandonPointsOnUnitSphere(N , points) ;
-    FSize j =0 ;
-    for (FSize i = 0 ; i< N ; ++i, j+=4)  {
-		points[j]	   *= R ;
-		points[j+1]  *= R ;
-		points[j+2]  *= R ;
-	}
+void unifRandomPointsOnSphere(const FSize N, const FReal R, FReal* points) {
+    initRandom();
+    FReal u, v, theta, phi, sinPhi;
+    for (FSize i = 0, j = 0 ; i< N ; ++i, j+=4)  {
+        u = getRandom<FReal>();
+        v = getRandom<FReal>();
+        theta  = FMath::FTwoPi<FReal>() * u;
+        phi    = FMath::ACos(2*v - 1);
+        sinPhi = FMath::Sin(phi);
+
+        points[j]   = FMath::Cos(theta) * sinPhi * R;
+        points[j+1] = FMath::Sin(theta) * sinPhi * R;
+        points[j+2] = (2*v - 1) * R;
+    }
 };
-//!  \fn void plummerDist(int & cpt, const FReal &R)
 
-//! \brief   Radial Plummer distribution
 
-//!
-//! \param cpt : counter to know how many random selections we need to obtain a radius less than R
-//! \param R    : Radius of the sphere that contains the particles
-//! @return Return the radius according to the Plummer distribution either double type or float type
-//!
+/**
+ * \brief Radial Plummer distribution
+ *
+ * \tparam FReal Floating point type
+ *
+ * \param cpt counter incremented once for every radius drawn, including the rejected ones
+ * \param R   radius of the sphere that contains the particles
+ * \return The radius according to the Plummer distribution
+ */
 template <class FReal>
-FReal  plummerDist(FSize cpt, const FReal &R) {
-	//
-	FReal radius ,u ;
-	do  {
-		//
-        u        = FMath::pow (getRandom<FReal>() , 2.0/3.0) ;
-		radius = FMath::Sqrt (u/(1.0-u));
-		cpt++;
-		if(radius  <=R){
-			//			std::cout << radius << "    "  <<std::endl;
-			return static_cast<FReal>(radius);
-		}
-	} while (true);
+FReal plummerDist(FSize& cpt, const FReal &R) {
+    FReal radius, u;
+    while(true) {
+        u = FMath::pow(getRandom<FReal>(), 2.0/3.0);
+        radius = FMath::Sqrt(u/(1.0-u));
+        cpt++;
+        if(radius <= R) {
+            return static_cast<FReal>(radius);
+        }
+    }
 }
-//! \fn void unifRandonPlummer(const int N , const FReal R, const FReal M, FReal * points)
-
-//! \brief  Build N points following the Plummer distribution
 
-//! First we construct N points uniformly distributed on the unit sphere. Then the radius in construct according to the Plummr distribution.
-//!
-//! \param N the number of points following the Plummer distribution
-//! \param R the radius of the sphere that contains all the points
-//! \param M the total mass of all the particles inside the Sphere or radius R
-//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0....
+/**
+ * \brief Build N points following the Plummer distribution
+ *
+ * First we construct N points uniformly distributed on the unit sphere. Then
+ * each radius is drawn according to the Plummer distribution, and every
+ * point is assigned the constant mass 1/N.
+ *
+ * \tparam FReal Floating point type
+ *
+ * \param N the number of points following the Plummer distribution
+ * \param R not used by this implementation: the sampled radii are not truncated
+ * \param points array of size 4*N that stores the data as follows: x,y,z,m,x,y,z,m... with m = 1/N
+ */
 template <class FReal>
-void unifRandonPlummer(const FSize N , const FReal R, const FReal M, FReal * points) {
-	//
-	unifRandonPointsOnUnitSphere(N , points) ;
-	//
-	FReal r , rm= 0.0;
-    //	FReal Coeff =  3.0*M/(4.0*FMath::FPi<FReal>()*R*R*R) ;
-	//am1 = 0 ;//1/FMath::pow(1+R*R,2.5);
-    FSize cpt = 0 ;
-    for (FSize i = 0,j=0 ; i< N ; ++i, j+=4)  {
-		// u \in []
-		r = plummerDist(cpt,R) ;
-		rm = std::max(rm, r);
-		points[j]	   *= r ;
-		points[j+1]  *= r ;
-		points[j+2]  *= r ;
-	}
-
-	std::cout << "Total tested points: "<< cpt << " % of rejected points: "
-            <<100*static_cast<FReal>(cpt-N)/static_cast<FReal>(cpt) << " %" <<std::endl;
+void unifRandomPlummer(const FSize N, const FReal R, FReal * points) {
+    unifRandomPointsOnSphere<FReal>(N, 1, points);
+    FReal mc = 1.0/static_cast<FReal>(N);
+    for (FSize i = 0, j = 0 ; i< N ; ++i, j+=4)  {
+    	FReal m = getRandom<FReal>();
+    	FReal r = FMath::Sqrt( 1.0/(FMath::pow(m, -2.0/3.0) - 1.0)) ;
+        points[j]    *= r;
+        points[j+1]  *= r;
+        points[j+2]  *= r;
+        points[j+3]   = mc;  // the mass
+    }
+}
+/**
+ * \brief Build N points following a Plummer-like distribution
+ *
+ * First we construct N points uniformly distributed on the unit sphere. Then
+ * each radius is drawn with plummerDist(), which only returns radii <= R.
+ *
+ * \tparam FReal Floating point type
+ *
+ * \param N the number of points following the Plummer-like distribution
+ * \param R the radius of the sphere that contains all the points
+ * \param points array of size 4*N that stores the data as follows: x,y,z,0,x,y,z,0...
+ */
+template <class FReal>
+void unifRandomPlummerLike(const FSize N, const FReal R, FReal * points) {
+	FReal a = 1.0 ;
+    unifRandomPointsOnSphere<FReal>(N, 1, points);
+    FReal r, rm = 0.0;
+    FSize cpt = 0;
+    for (FSize i = 0, j = 0 ; i< N ; ++i, j+=4)  {
+        r = plummerDist(cpt,R);
+        rm = std::max(rm, r);
+        points[j]    *= r;
+        points[j+1]  *= r;
+        points[j+2]  *= r;
+    }
 
-} ;
+    std::cout << "Total tested points: " << cpt << " % of rejected points: "
+              << 100 * static_cast<FReal>(cpt-N) / static_cast<FReal>(cpt)
+              << " %"
+              << std::endl;
+}
 //
 #endif
diff --git a/Src/Files/FRandomLoader.hpp b/Src/Files/FRandomLoader.hpp
index d53417c08e87681afe28e718bc8bd77ae54c64b6..73af9be3ec8622af64b66e13bd51eaf503beb4c0 100644
--- a/Src/Files/FRandomLoader.hpp
+++ b/Src/Files/FRandomLoader.hpp
@@ -124,8 +124,8 @@ public:
 
     void fillParticle(FPoint<FReal>*const inParticlePositions, FParticleType*const isTarget){
         FRandomLoader<FReal>::fillParticle(inParticlePositions);
-        if(FRandomLoader<FReal>::getRandom() > 0.5 ) (*isTarget) = FParticleTypeTarget;
-        else (*isTarget) = FParticleTypeSource;
+        if(FRandomLoader<FReal>::getRandom() > 0.5 ) (*isTarget) = FParticleType::FParticleTypeTarget;
+        else (*isTarget) = FParticleType::FParticleTypeSource;
     }
 };
 
diff --git a/Src/ScalFmmConfig.h.cmake b/Src/ScalFmmConfig.h.cmake
index e454565ee48b4eed14b493ca90f8c4f13a7b801e..0c4c2960f17f539e9ae77a78ae489ed6e066c355 100644
--- a/Src/ScalFmmConfig.h.cmake
+++ b/Src/ScalFmmConfig.h.cmake
@@ -28,7 +28,10 @@
 
 #cmakedefine SCALFMM_USE_BLAS
 #cmakedefine SCALFMM_USE_MKL_AS_BLAS
-
+// Fortran Mangling
+#cmakedefine SCALFMM_BLAS_ADD_
+#cmakedefine SCALFMM_BLAS_UPCASE
+#cmakedefine SCALFMM_BLAS_NOCHANGE
 ////////////////////////////////////////////////////////
 // FFT
 ///////////////////////////////////////////////////////
diff --git a/Src/Utils/FAlgorithmTimers.hpp b/Src/Utils/FAlgorithmTimers.hpp
index ee41b38f438d0c850e55198a5d8c0074698aae47..f9eb1f2a56a7b70e5cf03258e160a75f8d408eed 100644
--- a/Src/Utils/FAlgorithmTimers.hpp
+++ b/Src/Utils/FAlgorithmTimers.hpp
@@ -17,6 +17,13 @@
 #ifndef FALGORITHMTIMERS_HPP
 #define FALGORITHMTIMERS_HPP
 
+#include <map>
+#include <string>
+
+#include "FTic.hpp"
+
+using FTimerMap = std::map<std::string, FTic>;
+
 /**
  * @brief Collection of timers for FMM operators.
  *
@@ -25,56 +32,38 @@
  */
 class FAlgorithmTimers{
 public:
-    /// The timer names
-    enum FTimers {
-        P2MTimer,
-        M2MTimer,
-        M2LTimer,
-        L2LTimer,
-        L2PTimer,
-        P2PTimer,
-        NearTimer,
-        nbTimers   ///< Timer count
-    };
+    static constexpr const char* P2MTimer = "P2M";
+    static constexpr const char* M2MTimer = "M2M";
+    static constexpr const char* M2LTimer = "M2L";
+    static constexpr const char* L2LTimer = "L2L";
+    static constexpr const char* L2PTimer = "L2P";
+    static constexpr const char* P2PTimer = "P2P";
+    static constexpr const char* M2PTimer = "M2P";
+    static constexpr const char* P2LTimer = "P2L";
+    static constexpr const char* NearTimer = "Near";
+    enum {nbTimers = 9};
 
-protected:
-    /// Timer array
-    FTic Timers[nbTimers];
+    /// Timers
+    FTimerMap Timers;
 
-public:
     /// Constructor: resets all timers
-    FAlgorithmTimers()
-    {
-        for(int i = 0; i < nbTimers ; ++i){
-            Timers[i].reset();
-        }
-    }
+    FAlgorithmTimers() = default;
 
     /// Default copy contructor
     FAlgorithmTimers(const FAlgorithmTimers&) = default;
     /// Default move contructor
     FAlgorithmTimers(FAlgorithmTimers&&) = default;
 
-    /// Returns the timer array
-    const FTic * getAllTimers() const {
-        return Timers;
-    }
-
-    /// Returns the timer count
-    int getNbOfTimerRecorded() const {
-        return nbTimers;
-    }
-
     /// Elapsed time between last FTic::tic() and FTic::tac() for given timer.
-    double getTime(FTimers OpeTimer) const{
+    double getTime(std::string TimerName) const{
         //assert to verify size
-        return Timers[OpeTimer].elapsed();
+        return Timers.at(TimerName).elapsed();
     }
 
     /// Cumulated time between all FTic::tic() and FTic::tac() for given timer.
-    double getCumulatedTime(FTimers OpeTimer) const{
+    double getCumulatedTime(std::string TimerName) const{
         //assert to verify size
-        return Timers[OpeTimer].cumulated();
+        return Timers.at(TimerName).cumulated();
     }
 
 };
diff --git a/Src/Utils/FBlas.hpp b/Src/Utils/FBlas.hpp
index ed3bb4576c319dc82344717b6423cc8e5f928a51..f722b0f5f372980fcc9197dc0b9dbd6396578fac 100644
--- a/Src/Utils/FBlas.hpp
+++ b/Src/Utils/FBlas.hpp
@@ -17,6 +17,7 @@
 #define FBLAS_HPP
 
 #include "FGlobal.hpp"
+#include "FFortranMangling.hpp"
 
 #ifndef SCALFMM_USE_BLAS
 #error The BLAS header is included while SCALFMM_USE_BLAS is turned OFF
@@ -30,133 +31,133 @@
 
 // for real
 namespace scalfmm {
-const double D_ZERO =  0.0;
-const double D_ONE  =  1.0;
-const double D_MONE = -1.0;
-const float  S_ZERO =  0.0;
-const float  S_ONE  =  1.0;
-const float  S_MONE = -1.0;
-// for complex
-const double Z_ZERO[2] =  {0.0,0.0};
-const double Z_ONE[2]  =  {1.0,0.0};
-const double Z_MONE[2] =  {-1.0,0.0};
-const float  C_ZERO[2] =  {0.0,0.0};
-const float  C_ONE[2]  =  {1.0,0.0};
-const float  C_MONE[2] =  {-1.0,0.0};
-
-//const double D_PREC = 1e-16;
-
-const unsigned N_ONE = 1;
-const int N_MONE = -1;
-const char JOB_STR[] = "NTOSVULCR";
+  const double D_ZERO =  0.0;
+  const double D_ONE  =  1.0;
+  const double D_MONE = -1.0;
+  const float  S_ZERO =  0.0;
+  const float  S_ONE  =  1.0;
+  const float  S_MONE = -1.0;
+  // for complex
+  const double Z_ZERO[2] =  {0.0,0.0};
+  const double Z_ONE[2]  =  {1.0,0.0};
+  const double Z_MONE[2] =  {-1.0,0.0};
+  const float  C_ZERO[2] =  {0.0,0.0};
+  const float  C_ONE[2]  =  {1.0,0.0};
+  const float  C_MONE[2] =  {-1.0,0.0};
+
+  //const double D_PREC = 1e-16;
+
+  const unsigned N_ONE = 1;
+  const int N_MONE = -1;
+  const char JOB_STR[] = "NTOSVULCR";
 }
 
 extern "C"
 {
-	// double //////////////////////////////////////////////////////////
-	// blas 1
-	double ddot_(const unsigned*, const double*, const unsigned*, const double*, const unsigned*);
-	void dscal_(const unsigned*, const double*, const double*, const unsigned*);
-	void dcopy_(const unsigned*, const double*, const unsigned*, double*, const unsigned*);
-	void daxpy_(const unsigned*, const double*, const double*, const unsigned*, double*, const unsigned*);
-	// blas 2
-	void dgemv_(const char*, const unsigned*, const unsigned*, const double*,
-							const double*, const unsigned*, const double*, const unsigned*,
-							const double*, double*, const unsigned*);
-	// blas 3
-	void dgemm_(const char*, const char*, const unsigned*, const unsigned*,
-							const unsigned*, const double*, double*, const unsigned*,
-							double*, const unsigned*, const double*, double*,	const unsigned*);
-	// lapack
-	void dgesvd_(const char*, const char*, const unsigned*, const unsigned*,
-							 double*, const unsigned*, double*, double*, const unsigned*,
-							 double*, const unsigned*, double*, const unsigned*, int*);
-	void dgeqrf_(const unsigned*, const unsigned*, double*, const unsigned*,
-							 double*, double*, const unsigned*, int*);
-    void dgeqp3_(const unsigned*, const unsigned*, double*, const unsigned*, /*TYPE OF JPIV*/ unsigned*,
-                             double*, double*, const unsigned*, int*);
-    void dorgqr_(const unsigned*, const unsigned*, const unsigned*,
-							 double*, const unsigned*, double*, double*, const unsigned*, int*);
-	void dormqr_(const char*, const char*, 
+  // double //////////////////////////////////////////////////////////
+  // blas 1
+  double Fddot(const unsigned*, const double*, const unsigned*, const double*, const unsigned*);
+  void   Fdscal(const unsigned*, const double*, const double*, const unsigned*);
+  void   Fdcopy(const unsigned*, const double*, const unsigned*, double*, const unsigned*);
+  void   Fdaxpy(const unsigned*, const double*, const double*, const unsigned*, double*, const unsigned*);
+  // blas 2
+  void   Fdgemv(const char*, const unsigned*, const unsigned*, const double*,
+		  const double*, const unsigned*, const double*, const unsigned*,
+	      const double*, double*, const unsigned*);
+  // blas 3
+  void Fdgemm(const char*, const char*, const unsigned*, const unsigned*,
+	      const unsigned*, const double*, double*, const unsigned*,
+	      double*, const unsigned*, const double*, double*,	const unsigned*);
+  // lapack
+  void Fdgesvd(const char*, const char*, const unsigned*, const unsigned*,
+	       double*, const unsigned*, double*, double*, const unsigned*,
+	       double*, const unsigned*, double*, const unsigned*, int*);
+  void Fdgeqrf(const unsigned*, const unsigned*, double*, const unsigned*,
+	       double*, double*, const unsigned*, int*);
+  void Fdgeqp3(const unsigned*, const unsigned*, double*, const unsigned*, /*TYPE OF JPIV*/ unsigned*,
+	       double*, double*, const unsigned*, int*);
+  void Fdorgqr(const unsigned*, const unsigned*, const unsigned*,
+	       double*, const unsigned*, double*, double*, const unsigned*, int*);
+  void Fdormqr(const char*, const char*,
                const unsigned*, const unsigned*, const unsigned*,
-							 const double*, const unsigned*, 
+	       const double*, const unsigned*, 
                double*, double*, const unsigned*, 
                double*, const unsigned*, int*);
-    void dpotrf_(const char*, const unsigned*, double*, const unsigned*, int*);
-
-	// single //////////////////////////////////////////////////////////
-	// blas 1
-	float sdot_(const unsigned*, const float*, const unsigned*,	const float*, const unsigned*);
-	void sscal_(const unsigned*, const float*, const float*, const unsigned*);
-	void scopy_(const unsigned*, const float*, const unsigned*,	float*, const unsigned*);
-	void saxpy_(const unsigned*, const float*, const float*, const unsigned*, float*, const unsigned*);
-	// blas 2
-	void sgemv_(const char*, const unsigned*, const unsigned*, const float*,
-							const float*, const unsigned*, const float*, const unsigned*,
-							const float*, float*, const unsigned*);
-	// blas 3
-	void sgemm_(const char*, const char*, const unsigned*, const unsigned*,
-							const unsigned*, const float*, float*, const unsigned*,
-							float*, const unsigned*, const float*, float*, const unsigned*);
-	// lapack
-	void sgesvd_(const char*, const char*, const unsigned*, const unsigned*,
-							 float*, const unsigned*, float*, float*, const unsigned*,
-							 float*, const unsigned*, float*, const unsigned*, int*);
-	void sgeqrf_(const unsigned*, const unsigned*, float*, const unsigned*,
-							 float*, float*, const unsigned*, int*);
-    void sgeqp3_(const unsigned*, const unsigned*, float*, const unsigned*, /*TYPE OF JPIV*/ unsigned*,
-                             float*, float*, const unsigned*, int*);
-	void sorgqr_(const unsigned*, const unsigned*, const unsigned*,
-							 float*, const unsigned*, float*, float*, const unsigned*, int*);
-	void sormqr_(const char*, const char*, 
+  void Fdpotrf(const char*, const unsigned*, double*, const unsigned*, int*);
+
+  // single //////////////////////////////////////////////////////////
+  // blas 1
+  float Fsdot(const unsigned*, const float*, const unsigned*,	const float*, const unsigned*);
+  void Fsscal(const unsigned*, const float*, const float*, const unsigned*);
+  void Fscopy(const unsigned*, const float*, const unsigned*,	float*, const unsigned*);
+  void Fsaxpy(const unsigned*, const float*, const float*, const unsigned*, float*, const unsigned*);
+  // blas 2
+  void Fsgemv(const char*, const unsigned*, const unsigned*, const float*,
+	      const float*, const unsigned*, const float*, const unsigned*,
+	      const float*, float*, const unsigned*);
+  // blas 3
+  void Fsgemm(const char*, const char*, const unsigned*, const unsigned*,
+	      const unsigned*, const float*, float*, const unsigned*,
+	      float*, const unsigned*, const float*, float*, const unsigned*);
+  // lapack
+  void Fsgesvd(const char*, const char*, const unsigned*, const unsigned*,
+	       float*, const unsigned*, float*, float*, const unsigned*,
+	       float*, const unsigned*, float*, const unsigned*, int*);
+  void Fsgeqrf(const unsigned*, const unsigned*, float*, const unsigned*,
+	       float*, float*, const unsigned*, int*);
+  void Fsgeqp3(const unsigned*, const unsigned*, float*, const unsigned*, /*TYPE OF JPIV*/ unsigned*,
+	       float*, float*, const unsigned*, int*);
+  void Fsorgqr(const unsigned*, const unsigned*, const unsigned*,
+	       float*, const unsigned*, float*, float*, const unsigned*, int*);
+  void Fsormqr(const char*, const char*,
                const unsigned*, const unsigned*, const unsigned*,
-							 const float*, const unsigned*, 
+	       const float*, const unsigned*, 
                float*, float*, const unsigned*, 
                float*, const unsigned*, int*);
-    void spotrf_(const char*, const unsigned*, float*, const unsigned*, int*);
-
-	// double complex //////////////////////////////////////////////////
-	// blas 1
-	void zscal_(const unsigned*, const double*, const double*, const unsigned*);
-	void zcopy_(const unsigned*, const double*, const unsigned*, double*, const unsigned*);
-	void zaxpy_(const unsigned*, const double*, const double*, const unsigned*, double*, const unsigned*);
-	// blas 2
-	void zgemv_(const char*, const unsigned*, const unsigned*, const double*,
-							const double*, const unsigned*, const double*, const unsigned*,
-							const double*, double*, const unsigned*);
-	// blas 3
-	void zgemm_(const char*, const char*, const unsigned*, const unsigned*,
-							const unsigned*, const double*, double*, const unsigned*,
-							double*, const unsigned*, const double*, double*, const unsigned*);
-	void zgesvd_(const char*, const char*, const unsigned*, const unsigned*,
-							 double*, const unsigned*, double*, double*, const unsigned*,
-							 double*, const unsigned*, double*,   int*,  double*,   int*);
-
-	void zgeqrf_(const unsigned*, const unsigned*, double*, const unsigned*,
-							 double*, double*, const unsigned*, int*);
-    void zgeqp3_(const unsigned*, const unsigned*, double*, const unsigned*,/*TYPE OF JPIV*/ unsigned*,
-                             double*, double*, const unsigned*, int*);
-
-    void zpotrf_(const char*, const unsigned*, double*, const unsigned*, int*);
-
-	// single complex //////////////////////////////////////////////////
-	// blas 1
-	void cscal_(const unsigned*, const float*, const float*, const unsigned*);
-	void ccopy_(const unsigned*, const float*, const unsigned*,	float*, const unsigned*);
-	void caxpy_(const unsigned*, const float*, const float*, const unsigned*, float*, const unsigned*);
-	// blas 2
-	void cgemv_(const char*, const unsigned*, const unsigned*, const float*,
-							const float*, const unsigned*, const float*, const unsigned*,
-							const float*, float*, const unsigned*);
-	// blas 3
-	void cgemm_(const char*, const char*, const unsigned*, const unsigned*,
-							const unsigned*, const float*, float*, const unsigned*,
-							float*, const unsigned*, const float*, float*, const unsigned*);
-	void cgeqrf_(const unsigned*, const unsigned*, float*, const unsigned*,
-							 float*, float*, const unsigned*, int*);
-    void cgeqp3_(const unsigned*, const unsigned*, float*, const unsigned*, /*TYPE OF JPIV*/ unsigned*,
-                             float*, float*, const unsigned*, int*);    
-    void cpotrf_(const char*, const unsigned*, float*, const unsigned*, int*);
+  void Fspotrf(const char*, const unsigned*, float*, const unsigned*, int*);
+
+  // double complex //////////////////////////////////////////////////
+  // blas 1
+  void Fzscal(const unsigned*, const double*, const double*, const unsigned*);
+  void Fzcopy(const unsigned*, const double*, const unsigned*, double*, const unsigned*);
+  void Fzaxpy(const unsigned*, const double*, const double*, const unsigned*, double*, const unsigned*);
+  // blas 2
+  void Fzgemv(const char*, const unsigned*, const unsigned*, const double*,
+	      const double*, const unsigned*, const double*, const unsigned*,
+	      const double*, double*, const unsigned*);
+  // blas 3
+  void Fzgemm(const char*, const char*, const unsigned*, const unsigned*,
+	      const unsigned*, const double*, double*, const unsigned*,
+	      double*, const unsigned*, const double*, double*, const unsigned*);
+  void Fzgesvd(const char*, const char*, const unsigned*, const unsigned*,
+	       double*, const unsigned*, double*, double*, const unsigned*,
+	       double*, const unsigned*, double*,   int*,  double*,   int*);
+
+  void Fzgeqrf(const unsigned*, const unsigned*, double*, const unsigned*,
+	       double*, double*, const unsigned*, int*);
+  void Fzgeqp3(const unsigned*, const unsigned*, double*, const unsigned*,/*TYPE OF JPIV*/ unsigned*,
+	       double*, double*, const unsigned*, int*);
+
+  void Fzpotrf(const char*, const unsigned*, double*, const unsigned*, int*);
+
+  // single complex //////////////////////////////////////////////////
+  // blas 1
+  void Fcscal(const unsigned*, const float*, const float*, const unsigned*);
+  void Fccopy(const unsigned*, const float*, const unsigned*,	float*, const unsigned*);
+  void Fcaxpy(const unsigned*, const float*, const float*, const unsigned*, float*, const unsigned*);
+  // blas 2
+  void Fcgemv(const char*, const unsigned*, const unsigned*, const float*,
+	      const float*, const unsigned*, const float*, const unsigned*,
+	      const float*, float*, const unsigned*);
+  // blas 3
+  void Fcgemm(const char*, const char*, const unsigned*, const unsigned*,
+	      const unsigned*, const float*, float*, const unsigned*,
+	      float*, const unsigned*, const float*, float*, const unsigned*);
+  void Fcgeqrf(const unsigned*, const unsigned*, float*, const unsigned*,
+	       float*, float*, const unsigned*, int*);
+  void Fcgeqp3(const unsigned*, const unsigned*, float*, const unsigned*, /*TYPE OF JPIV*/ unsigned*,
+	       float*, float*, const unsigned*, int*);    
+  void Fcpotrf(const char*, const unsigned*, float*, const unsigned*, int*);
 
 
 }
@@ -164,460 +165,460 @@ extern "C"
 
 namespace FBlas {
 
-	// copy
-	inline void copy(const unsigned n, double* orig, double* dest)
-	{	dcopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE);	}
-	inline void copy(const unsigned n, const double* orig, double* dest)
-	{	dcopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE);	}
-	inline void copy(const unsigned n, float* orig, float* dest)
-	{	scopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE);	}
-    inline void copy(const unsigned n, const float* orig, float* dest)
-    {   scopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); }
-	inline void c_copy(const unsigned n, double* orig, double* dest)
-	{	zcopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE);	}
-	inline void c_copy(const unsigned n, const double* orig, double* dest)
-	{	zcopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE);	}
-	inline void c_copy(const unsigned n, float* orig, float* dest)
-	{	ccopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE);	}
-    inline void c_copy(const unsigned n, const float* orig, float* dest)
-    {   ccopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); }
-
-	// copy (variable increment)
-	inline void copy(const unsigned n, double* orig, const unsigned inco, double* dest, const unsigned incd)
-	{	dcopy_(&n, orig, &inco, dest, &incd);	}
-	inline void copy(const unsigned n, float* orig, const unsigned inco, float* dest, const unsigned incd)
-	{	scopy_(&n, orig, &inco, dest, &incd);	}
-	inline void c_copy(const unsigned n, double* orig, const unsigned inco, double* dest, const unsigned incd)
-	{	zcopy_(&n, orig, &inco, dest, &incd);	}
-	inline void c_copy(const unsigned n, float* orig, const unsigned inco, float* dest, const unsigned incd)
-	{	ccopy_(&n, orig, &inco, dest, &incd);	}
-
-	// scale
-	inline void scal(const unsigned n, const double d, double* const x)
-	{	dscal_(&n, &d, x, &scalfmm::N_ONE); }
-	inline void scal(const unsigned n, const float d, float* const x)
-	{	sscal_(&n, &d, x, &scalfmm::N_ONE); }
-	inline void c_scal(const unsigned n, const double d, double* const x)
-	{	zscal_(&n, &d, x, &scalfmm::N_ONE); }
-	inline void c_scal(const unsigned n, const float d, float* const x)
-	{	cscal_(&n, &d, x, &scalfmm::N_ONE); }
-
-	// scale (variable increment)
-	inline void scal(const unsigned n, const double d, double* const x, const unsigned incd)
-	{	dscal_(&n, &d, x, &incd); }
-	inline void scal(const unsigned n, const float d, float* const x, const unsigned incd)
-	{	sscal_(&n, &d, x, &incd); }
-	inline void c_scal(const unsigned n, const double d, double* const x, const unsigned incd)
-	{	zscal_(&n, &d, x, &incd); }
-	inline void c_scal(const unsigned n, const float d, float* const x, const unsigned incd)
-	{	cscal_(&n, &d, x, &incd); }
-
-	// set zero
-	inline void setzero(const unsigned n, double* const x)
-	{	for (unsigned i=0; i<n; ++i) x[i] = 0.0; }
-	inline void setzero(const unsigned n, float* const x)
-	{	for (unsigned i=0; i<n; ++i) x[i] = 0.0f; }
-	inline void c_setzero(const unsigned n, double* const x)
-	{	for (unsigned i=0; i<n; ++i) x[i*2] = x[i*2+1] = 0.0; }
-	inline void c_setzero(const unsigned n, float* const x)
-	{	for (unsigned i=0; i<n; ++i) x[i*2] = x[i*2+1] = 0.0f; }
-
-	// y += x
-	inline void add(const unsigned n, double* const x, double* const y)
-	{	daxpy_(&n, &scalfmm::D_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
-	inline void add(const unsigned n, float* const x, float* const y)
-	{	saxpy_(&n, &scalfmm::S_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
-	inline void c_add(const unsigned n, float* const x, float* const y)
-	{	caxpy_(&n, scalfmm::C_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
-	inline void c_add(const unsigned n, double* const x,double* const y)
-	{	zaxpy_(&n, scalfmm::Z_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
-
-	// y += d x
-	inline void axpy(const unsigned n, const double d, const double* const x, double* const y)
-	{	daxpy_(&n, &d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
-	inline void axpy(const unsigned n, const float d, const float* const x, float* const y)
-	{	saxpy_(&n, &d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
-	inline void c_axpy(const unsigned n, const float* d, const float* const x, float* const y)
-	{	caxpy_(&n, d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
-	inline void c_axpy(const unsigned n, const double* d, const double* const x, double* const y)
-	{	zaxpy_(&n, d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
-
-
-
-	//	// y = d Ax
-	//	inline void gemv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
-	//	{	cblas_dgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ZERO, y, scalfmm::N_ONE); }
-	//	inline void gemv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
-	//	{	cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ZERO, y, scalfmm::N_ONE); }
-	// y = d Ax
-	inline void gemv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
-	{	dgemv_(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ZERO, y, &scalfmm::N_ONE); }
-	inline void gemv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
-	{	sgemv_(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ZERO, y, &scalfmm::N_ONE); }
-	inline void c_gemv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
-	{	cgemv_(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ZERO, y, &scalfmm::N_ONE); }
-	inline void c_gemv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
-	{	zgemv_(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ZERO, y, &scalfmm::N_ONE); }
-
-	//	// y += d Ax
-	//	inline void gemva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
-	//	{	cblas_dgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ONE, y, scalfmm::N_ONE); }
-	//	inline void gemva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
-	//	{	cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ONE, y, scalfmm::N_ONE); }
-	// y += d Ax
-	inline void gemva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
-	{	dgemv_(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ONE, y, &scalfmm::N_ONE);	}
-	inline void gemva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
-	{	sgemv_(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ONE, y, &scalfmm::N_ONE);	}
-	inline void c_gemva(const unsigned m, const unsigned n, const float* d, const float* A, const float *x, float *y)
-	{	cgemv_(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ONE, y, &scalfmm::N_ONE);	}
-	inline void c_gemva(const unsigned m, const unsigned n, const double* d, const double* A, const double *x, double *y)
-	{	zgemv_(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ONE, y, &scalfmm::N_ONE);	}
-
-	//	// y = d A^T x
-	//	inline void gemtv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
-	//	{ cblas_dgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ZERO, y, scalfmm::N_ONE); }
-	//	inline void gemtv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
-	//	{	cblas_sgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ZERO, y, scalfmm::N_ONE); }
-	// y = d A^T x
-	inline void gemtv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
-	{	dgemv_(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ZERO, y, &scalfmm::N_ONE); }
-	inline void gemtv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
-	{	sgemv_(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ZERO, y, &scalfmm::N_ONE); }
-	inline void c_gemtv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
-	{	cgemv_(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ZERO, y, &scalfmm::N_ONE); }
-	inline void c_gemtv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
-	{	zgemv_(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ZERO, y, &scalfmm::N_ONE); }
-	inline void c_gemhv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
-	{	cgemv_(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ZERO, y, &scalfmm::N_ONE); } // hermitian transposed
-	inline void c_gemhv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
-	{	zgemv_(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ZERO, y, &scalfmm::N_ONE); } // hermitian transposed
-
-	//	// y += d A^T x
-	//	inline void gemtva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
-	//	{	cblas_dgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ONE, y, scalfmm::N_ONE); }
-	//	inline void gemtva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
-	//	{	cblas_sgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ONE, y, scalfmm::N_ONE); }
-	// y += d A^T x
-	inline void gemtva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
-	{	dgemv_(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ONE, y, &scalfmm::N_ONE);	}
-	inline void gemtva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
-	{	sgemv_(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ONE, y, &scalfmm::N_ONE);	}
-	inline void c_gemtva(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
-	{	cgemv_(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ONE, y, &scalfmm::N_ONE);	}
-	inline void c_gemtva(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
-	{	zgemv_(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ONE, y, &scalfmm::N_ONE); }
-	inline void c_gemhva(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
-	{	cgemv_(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ONE, y, &scalfmm::N_ONE);	} // hermitian transposed
-	inline void c_gemhva(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
-	{	zgemv_(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ONE, y, &scalfmm::N_ONE);	} // hermitian transposed
-
-
-
-
-	// C = d A B, A is m x p, B is p x n
-	inline void gemm(unsigned m, unsigned p, unsigned n, double d,
-									 double* A, unsigned ldA, double* B, unsigned ldB, double* C, unsigned ldC)
-	{	dgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ZERO, C, &ldC);	}
-	inline void gemm(unsigned m, unsigned p, unsigned n, float d,
-									 float* A, unsigned ldA, float* B, unsigned ldB, float* C, unsigned ldC)
-	{	sgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ZERO, C, &ldC);	}
-	inline void c_gemm(const unsigned m, const unsigned p, const unsigned n, const float* d,
-										 float* A, const unsigned ldA, float* B, const unsigned ldB, float* C, const unsigned ldC)
-	{
-		cgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC);	}
-	inline void c_gemm(const unsigned m, const unsigned p, const unsigned n, const double* d,
-										 double* A, const unsigned ldA, double* B, const unsigned ldB, double* C, const unsigned ldC)
-	{
-		zgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC);	}
-
-	// C += d A B, A is m x p, B is p x n
-	inline void gemma(unsigned m, unsigned p, unsigned n, double d,
-										double* A, unsigned ldA, double* B, unsigned ldB,	double* C, unsigned ldC)
-	{	dgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ONE, C, &ldC); }
-	inline void gemma(unsigned m, unsigned p, unsigned n, float d,
-										float* A, unsigned ldA, float* B, unsigned ldB,	float* C, unsigned ldC)
-	{	sgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ONE, C, &ldC); }
-	inline void c_gemma(unsigned m, unsigned p, unsigned n, float* d,
-											float* A, unsigned ldA, float* B, unsigned ldB,	float* C, unsigned ldC)
-	{	cgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); }
-	inline void c_gemma(unsigned m, unsigned p, unsigned n, double* d,
-											double* A, unsigned ldA, double* B, unsigned ldB,	double* C, unsigned ldC)
-	{	zgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); }
-
-	// C = d A^T B, A is m x p, B is m x n
-	inline void gemtm(unsigned m, unsigned p, unsigned n, double d,
-										double* A, unsigned ldA, double *B, unsigned ldB,	double* C, unsigned ldC)
-	{	dgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::D_ZERO, C, &ldC);	}
-	inline void gemtm(unsigned m, unsigned p, unsigned n, float d,
-										float* A, unsigned ldA, float *B, unsigned ldB,	float* C, unsigned ldC)
-	{	sgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::S_ZERO, C, &ldC);	}
-	inline void c_gemtm(unsigned m, unsigned p, unsigned n, float* d,
-											float* A, unsigned ldA, float *B, unsigned ldB,	float* C, unsigned ldC)
-	{	cgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC);	}
-	inline void c_gemtm(unsigned m, unsigned p, unsigned n, double* d,
-											double* A, unsigned ldA, double *B, unsigned ldB,	double* C, unsigned ldC)
-	{	zgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC);	}
-	inline void c_gemhm(unsigned m, unsigned p, unsigned n, float* d, // hermitialn transposed
-											float* A, unsigned ldA, float *B, unsigned ldB,	float* C, unsigned ldC)
-	{	cgemm_(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC);	}
-	inline void c_gemhm(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed
-											double* A, unsigned ldA, double *B, unsigned ldB,	double* C, unsigned ldC)
-	{	zgemm_(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC);	}
-
-	// C += d A^T B, A is m x p, B is m x n
-	inline void gemtma(unsigned m, unsigned p, unsigned n, double d,
-										 double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
-	{	dgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::D_ONE, C, &ldC); }
-	inline void gemtma(unsigned m, unsigned p, unsigned n, float d,
-										 float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
-	{	sgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::S_ONE, C, &ldC); }
-	inline void c_gemtma(unsigned m, unsigned p, unsigned n, float* d,
-											 float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
-	{	cgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); }
-	inline void c_gemtma(unsigned m, unsigned p, unsigned n, double* d,
-											 double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
-	{	zgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); }
-	inline void c_gemhma(unsigned m, unsigned p, unsigned n, float* d, // hermitian transposed
-											 float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
-	{	cgemm_(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); }
-	inline void c_gemhma(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed
-											 double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
-	{	zgemm_(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); }
+  // copy: dest <- orig for n entries; both BLAS increments are fixed to scalfmm::N_ONE
+  inline void copy(const unsigned n, double* orig, double* dest)
+  {	Fdcopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE);	}
+  inline void copy(const unsigned n, const double* orig, double* dest)
+  {	Fdcopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE);	}
+  inline void copy(const unsigned n, float* orig, float* dest)
+  {	Fscopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE);	}
+  inline void copy(const unsigned n, const float* orig, float* dest)
+  {   Fscopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); }
+  inline void c_copy(const unsigned n, double* orig, double* dest) // c_ variants forward to the complex routines Fzcopy / Fccopy
+  {	Fzcopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE);	}
+  inline void c_copy(const unsigned n, const double* orig, double* dest)
+  {	Fzcopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE);	}
+  inline void c_copy(const unsigned n, float* orig, float* dest)
+  {	Fccopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE);	}
+  inline void c_copy(const unsigned n, const float* orig, float* dest)
+  {   Fccopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); }
+
+  // copy (variable increment): same routines as above, but inco (stride in orig) and incd (stride in dest) are forwarded to BLAS
+  inline void copy(const unsigned n, double* orig, const unsigned inco, double* dest, const unsigned incd)
+  {	Fdcopy(&n, orig, &inco, dest, &incd);	}
+  inline void copy(const unsigned n, float* orig, const unsigned inco, float* dest, const unsigned incd)
+  {	Fscopy(&n, orig, &inco, dest, &incd);	}
+  inline void c_copy(const unsigned n, double* orig, const unsigned inco, double* dest, const unsigned incd)
+  {	Fzcopy(&n, orig, &inco, dest, &incd);	}
+  inline void c_copy(const unsigned n, float* orig, const unsigned inco, float* dest, const unsigned incd)
+  {	Fccopy(&n, orig, &inco, dest, &incd);	}
+
+  // scale: x <- d * x over n entries; the factor is passed by address and the increment is scalfmm::N_ONE
+  inline void scal(const unsigned n, const double d, double* const x)
+  {	Fdscal(&n, &d, x, &scalfmm::N_ONE); }
+  inline void scal(const unsigned n, const float d, float* const x)
+  {	Fsscal(&n, &d, x, &scalfmm::N_ONE); }
+  inline void c_scal(const unsigned n, const double d, double* const x) // x <- d * x for n complex entries, d is a real factor
+  {	const double alpha[2] = {d, 0.0}; Fzscal(&n, alpha, x, &scalfmm::N_ONE); } // zscal reads alpha as a complex (re, im) pair: pass (d, 0); &d alone is one double short
+  inline void c_scal(const unsigned n, const float d, float* const x) // single-precision counterpart of the overload above
+  {	const float alpha[2] = {d, 0.0f}; Fcscal(&n, alpha, x, &scalfmm::N_ONE); } // cscal reads alpha as a complex (re, im) pair: pass (d, 0); &d alone is one float short
+
+  // scale (variable increment): x <- d * x, with incd forwarded to BLAS as the stride of x
+  inline void scal(const unsigned n, const double d, double* const x, const unsigned incd)
+  {	Fdscal(&n, &d, x, &incd); }
+  inline void scal(const unsigned n, const float d, float* const x, const unsigned incd)
+  {	Fsscal(&n, &d, x, &incd); }
+  inline void c_scal(const unsigned n, const double d, double* const x, const unsigned incd) // x <- d * x for n complex entries with stride incd, d is a real factor
+  {	const double alpha[2] = {d, 0.0}; Fzscal(&n, alpha, x, &incd); } // zscal reads alpha as a complex (re, im) pair: pass (d, 0); &d alone is one double short
+  inline void c_scal(const unsigned n, const float d, float* const x, const unsigned incd) // single-precision counterpart of the overload above
+  {	const float alpha[2] = {d, 0.0f}; Fcscal(&n, alpha, x, &incd); } // cscal reads alpha as a complex (re, im) pair: pass (d, 0); &d alone is one float short
+
+  // set zero: plain loops, no BLAS call. setzero clears x[0..n-1]; c_setzero clears x[2*i] and x[2*i+1] for i < n, i.e. 2*n scalars
+  inline void setzero(const unsigned n, double* const x)
+  {	for (unsigned i=0; i<n; ++i) x[i] = 0.0; }
+  inline void setzero(const unsigned n, float* const x)
+  {	for (unsigned i=0; i<n; ++i) x[i] = 0.0f; }
+  inline void c_setzero(const unsigned n, double* const x)
+  {	for (unsigned i=0; i<n; ++i) x[i*2] = x[i*2+1] = 0.0; }
+  inline void c_setzero(const unsigned n, float* const x)
+  {	for (unsigned i=0; i<n; ++i) x[i*2] = x[i*2+1] = 0.0f; }
+
+  // y += x: axpy with a constant factor (scalfmm::D_ONE / S_ONE by address; C_ONE / Z_ONE passed as-is, presumably (re, im) arrays defined outside this hunk)
+  inline void add(const unsigned n, double* const x, double* const y)
+  {	Fdaxpy(&n, &scalfmm::D_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
+  inline void add(const unsigned n, float* const x, float* const y)
+  {	Fsaxpy(&n, &scalfmm::S_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
+  inline void c_add(const unsigned n, float* const x, float* const y)
+  {	Fcaxpy(&n, scalfmm::C_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
+  inline void c_add(const unsigned n, double* const x,double* const y)
+  {	Fzaxpy(&n, scalfmm::Z_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
+
+  // y += d x: real variants take d by value; c_ variants take d as a pointer that is handed to BLAS unchanged (presumably a (re, im) pair -- confirm at call sites)
+  inline void axpy(const unsigned n, const double d, const double* const x, double* const y)
+  {	Fdaxpy(&n, &d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
+  inline void axpy(const unsigned n, const float d, const float* const x, float* const y)
+  {	Fsaxpy(&n, &d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
+  inline void c_axpy(const unsigned n, const float* d, const float* const x, float* const y)
+  {	Fcaxpy(&n, d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
+  inline void c_axpy(const unsigned n, const double* d, const double* const x, double* const y)
+  {	Fzaxpy(&n, d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE);	}
+
+
+
+  //	// y = d Ax
+  //	inline void gemv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
+  //	{	cblas_dgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ZERO, y, scalfmm::N_ONE); }
+  //	inline void gemv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
+  //	{	cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ZERO, y, scalfmm::N_ONE); }
+  // y = d Ax: A is m x n and m is also passed as its leading dimension; scalfmm::JOB_STR is the no-transpose flag (presumably "N" -- defined outside this hunk); beta is the constant zero
+  inline void gemv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
+  {	Fdgemv(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ZERO, y, &scalfmm::N_ONE); }
+  inline void gemv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
+  {	Fsgemv(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ZERO, y, &scalfmm::N_ONE); }
+  inline void c_gemv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
+  {	Fcgemv(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ZERO, y, &scalfmm::N_ONE); }
+  inline void c_gemv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
+  {	Fzgemv(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ZERO, y, &scalfmm::N_ONE); }
+
+  //	// y += d Ax
+  //	inline void gemva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
+  //	{	cblas_dgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ONE, y, scalfmm::N_ONE); }
+  //	inline void gemva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
+  //	{	cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ONE, y, scalfmm::N_ONE); }
+  // y += d Ax: same gemv call shape as for y = d Ax, but beta is the constant one (D_ONE / S_ONE / C_ONE / Z_ONE), so the product is accumulated into y
+  inline void gemva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
+  {	Fdgemv(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ONE, y, &scalfmm::N_ONE);	}
+  inline void gemva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
+  {	Fsgemv(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ONE, y, &scalfmm::N_ONE);	}
+  inline void c_gemva(const unsigned m, const unsigned n, const float* d, const float* A, const float *x, float *y)
+  {	Fcgemv(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ONE, y, &scalfmm::N_ONE);	}
+  inline void c_gemva(const unsigned m, const unsigned n, const double* d, const double* A, const double *x, double *y)
+  {	Fzgemv(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ONE, y, &scalfmm::N_ONE);	}
+
+  //	// y = d A^T x
+  //	inline void gemtv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
+  //	{ cblas_dgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ZERO, y, scalfmm::N_ONE); }
+  //	inline void gemtv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
+  //	{	cblas_sgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ZERO, y, scalfmm::N_ONE); }
+  // y = d A^T x: A is still described to BLAS as m x n (leading dimension m); the flag is JOB_STR+1 for transposed and JOB_STR+7 for the c_gemhv (hermitian) variants
+  inline void gemtv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
+  {	Fdgemv(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ZERO, y, &scalfmm::N_ONE); }
+  inline void gemtv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
+  {	Fsgemv(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ZERO, y, &scalfmm::N_ONE); }
+  inline void c_gemtv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
+  {	Fcgemv(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ZERO, y, &scalfmm::N_ONE); }
+  inline void c_gemtv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
+  {	Fzgemv(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ZERO, y, &scalfmm::N_ONE); }
+  inline void c_gemhv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
+  {	Fcgemv(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ZERO, y, &scalfmm::N_ONE); } // hermitian transposed
+  inline void c_gemhv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
+  {	Fzgemv(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ZERO, y, &scalfmm::N_ONE); } // hermitian transposed
+
+  //	// y += d A^T x
+  //	inline void gemtva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
+  //	{	cblas_dgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ONE, y, scalfmm::N_ONE); }
+  //	inline void gemtva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
+  //	{	cblas_sgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ONE, y, scalfmm::N_ONE); }
+  // y += d A^T x: accumulating form (beta is the constant one); the flag is JOB_STR+1 for transposed and JOB_STR+7 for the c_gemhva (hermitian) variants
+  inline void gemtva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y)
+  {	Fdgemv(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ONE, y, &scalfmm::N_ONE);	}
+  inline void gemtva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y)
+  {	Fsgemv(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ONE, y, &scalfmm::N_ONE);	}
+  inline void c_gemtva(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
+  {	Fcgemv(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ONE, y, &scalfmm::N_ONE);	}
+  inline void c_gemtva(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
+  {	Fzgemv(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ONE, y, &scalfmm::N_ONE); }
+  inline void c_gemhva(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y)
+  {	Fcgemv(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ONE, y, &scalfmm::N_ONE);	} // hermitian transposed
+  inline void c_gemhva(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y)
+  {	Fzgemv(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ONE, y, &scalfmm::N_ONE);	} // hermitian transposed
+
+
+
+
+  // C = d A B, A is m x p, B is p x n (wrapper order is (m, p, n); BLAS receives (m, n, p)); beta is the constant zero
+  inline void gemm(unsigned m, unsigned p, unsigned n, double d,
+		   double* A, unsigned ldA, double* B, unsigned ldB, double* C, unsigned ldC)
+  {	Fdgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ZERO, C, &ldC);	}
+  inline void gemm(unsigned m, unsigned p, unsigned n, float d,
+		   float* A, unsigned ldA, float* B, unsigned ldB, float* C, unsigned ldC)
+  {	Fsgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ZERO, C, &ldC);	}
+  inline void c_gemm(const unsigned m, const unsigned p, const unsigned n, const float* d,
+		     float* A, const unsigned ldA, float* B, const unsigned ldB, float* C, const unsigned ldC)
+  {
+    Fcgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC);	}
+  inline void c_gemm(const unsigned m, const unsigned p, const unsigned n, const double* d,
+		     double* A, const unsigned ldA, double* B, const unsigned ldB, double* C, const unsigned ldC)
+  {
+    Fzgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC);	}
+
+  // C += d A B, A is m x p, B is p x n (BLAS receives (m, n, p)); beta is the constant one, so the product is accumulated into C
+  inline void gemma(unsigned m, unsigned p, unsigned n, double d,
+		    double* A, unsigned ldA, double* B, unsigned ldB,	double* C, unsigned ldC)
+  {	Fdgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ONE, C, &ldC); }
+  inline void gemma(unsigned m, unsigned p, unsigned n, float d,
+		    float* A, unsigned ldA, float* B, unsigned ldB,	float* C, unsigned ldC)
+  {	Fsgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ONE, C, &ldC); }
+  inline void c_gemma(unsigned m, unsigned p, unsigned n, float* d,
+		      float* A, unsigned ldA, float* B, unsigned ldB,	float* C, unsigned ldC)
+  {	Fcgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); }
+  inline void c_gemma(unsigned m, unsigned p, unsigned n, double* d,
+		      double* A, unsigned ldA, double* B, unsigned ldB,	double* C, unsigned ldC)
+  {	Fzgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); }
+
+  // C = d A^T B, A is m x p, B is m x n (BLAS receives (p, n, m)); first flag is JOB_STR+1 (transposed) or JOB_STR+7 (c_gemhm, hermitian)
+  inline void gemtm(unsigned m, unsigned p, unsigned n, double d,
+		    double* A, unsigned ldA, double *B, unsigned ldB,	double* C, unsigned ldC)
+  {	Fdgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::D_ZERO, C, &ldC);	}
+  inline void gemtm(unsigned m, unsigned p, unsigned n, float d,
+		    float* A, unsigned ldA, float *B, unsigned ldB,	float* C, unsigned ldC)
+  {	Fsgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::S_ZERO, C, &ldC);	}
+  inline void c_gemtm(unsigned m, unsigned p, unsigned n, float* d,
+		      float* A, unsigned ldA, float *B, unsigned ldB,	float* C, unsigned ldC)
+  {	Fcgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC);	}
+  inline void c_gemtm(unsigned m, unsigned p, unsigned n, double* d,
+		      double* A, unsigned ldA, double *B, unsigned ldB,	double* C, unsigned ldC)
+  {	Fzgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC);	}
+  inline void c_gemhm(unsigned m, unsigned p, unsigned n, float* d, // hermitian transposed
+		      float* A, unsigned ldA, float *B, unsigned ldB,	float* C, unsigned ldC)
+  {	Fcgemm(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC);	}
+  inline void c_gemhm(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed
+		      double* A, unsigned ldA, double *B, unsigned ldB,	double* C, unsigned ldC)
+  {	Fzgemm(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC);	}
+
+  // C += d A^T B, A is m x p, B is m x n (BLAS receives (p, n, m)); accumulating form (beta is the constant one); JOB_STR+7 in c_gemhma selects the hermitian transpose
+  inline void gemtma(unsigned m, unsigned p, unsigned n, double d,
+		     double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
+  {	Fdgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::D_ONE, C, &ldC); }
+  inline void gemtma(unsigned m, unsigned p, unsigned n, float d,
+		     float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
+  {	Fsgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::S_ONE, C, &ldC); }
+  inline void c_gemtma(unsigned m, unsigned p, unsigned n, float* d,
+		       float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
+  {	Fcgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); }
+  inline void c_gemtma(unsigned m, unsigned p, unsigned n, double* d,
+		       double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
+  {	Fzgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); }
+  inline void c_gemhma(unsigned m, unsigned p, unsigned n, float* d, // hermitian transposed
+		       float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
+  {	Fcgemm(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); }
+  inline void c_gemhma(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed
+		       double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
+  {	Fzgemm(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); }
 	
 	
-	// C = d A B^T, A is m x p, B is n x p
-	inline void gemmt(unsigned m, unsigned p, unsigned n, double d,
-										double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
-	{	dgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ZERO, C, &ldC);	}
-	inline void gemmt(unsigned m, unsigned p, unsigned n, float d,
-										float* A, unsigned ldA, float *B, unsigned ldB,	float* C, unsigned ldC)
-	{	sgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ZERO, C, &ldC);	}
-	inline void c_gemmt(unsigned m, unsigned p, unsigned n, float d,
-											float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
-	{	cgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC); }
-	inline void c_gemmt(unsigned m, unsigned p, unsigned n, double d,
-											double* A, unsigned ldA, double *B, unsigned ldB,	double* C, unsigned ldC)
-	{	zgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC);	}
-	inline void c_gemmh(unsigned m, unsigned p, unsigned n, float d, // hermitian transposed
-											float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
-	{	cgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, &d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC); }
-	inline void c_gemmh(unsigned m, unsigned p, unsigned n, double d, // hermitian transposed
-											double* A, unsigned ldA, double *B, unsigned ldB,	double* C, unsigned ldC)
-	{	zgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, &d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC);	}
-
-	// C += d A B^T, A is m x p, B is n x p
-	inline void gemmta(unsigned m, unsigned p, unsigned n, double d,
-										 double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
-	{	dgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ONE, C, &ldC); }
-	inline void gemmta(unsigned m, unsigned p, unsigned n, float d,
-										 float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
-	{	sgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ONE, C, &ldC); }
-	inline void c_gemmta(unsigned m, unsigned p, unsigned n, float* d,
-											 float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
-	{	cgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); }
-	inline void c_gemmta(unsigned m, unsigned p, unsigned n, double* d,
-											 double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
-	{	zgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); }
-	inline void c_gemmha(unsigned m, unsigned p, unsigned n, float* d, // hermitian transposed
-											 float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
-	{	cgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); }
-	inline void c_gemmha(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed
-											 double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
-	{	zgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); }
-
-
-	// singular value decomposition
-    //
-	inline int gesvd(unsigned m, unsigned n, double* A, double* S, double* VT, unsigned ldVT,
-									 unsigned nwk, double* wk)
-	{
-		int INF;
-		dgesvd_(scalfmm::JOB_STR+2, scalfmm::JOB_STR+3, &m, &n, A, &m, S, A, &m, VT, &ldVT,	wk, &nwk, &INF);
-		return INF;
-	}
-	//
-	//    A = U * SIGMA * conjugate-transpose(V)
-	// scalfmm::JOB_STR+2 = 'O':  the first min(m,n) columns of U (the left singular vectors) are overwritten on the array A;
-	inline int c_gesvd(unsigned m, unsigned n, double* A, double* S, double* VT, unsigned ldVT,
-			int& nwk, double* wk,double* rwk)
-	{
-		int INF;
-		zgesvd_(scalfmm::JOB_STR+2, scalfmm::JOB_STR+3, &m, &n, A, &m, S, A, &m, VT, &ldVT,	wk, &nwk, rwk,&INF);
-		return INF;
-	}
-
-	inline int gesvd(unsigned m, unsigned n, float* A, float* S, float* VT, unsigned ldVT,
-									 unsigned nwk, float* wk)
-	{
-		int INF;
-		sgesvd_(scalfmm::JOB_STR+2, scalfmm::JOB_STR+3, &m, &n, A, &m, S, A, &m, VT, &ldVT,	wk, &nwk, &INF);
-		return INF;
-	}
-
-    // singular value decomposition (SO)
-    inline int gesvdSO(unsigned m, unsigned n, double* A, double* S, double* U, unsigned ldU,
-                       unsigned nwk, double* wk)
-    {
-        int INF;
-        dgesvd_(scalfmm::JOB_STR+3, scalfmm::JOB_STR+2, &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF);
-        return INF;
-    }
-    inline int gesvdSO(unsigned m, unsigned n, float* A, float* S, float* U, unsigned ldU,
-                       unsigned nwk, float* wk)
-    {
-        int INF;
-        sgesvd_(scalfmm::JOB_STR+3, scalfmm::JOB_STR+2, &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF);
-        return INF;
-    }
-
-    // singular value decomposition (AA)
-    inline int gesvdAA(unsigned m, unsigned n, double* A, double* S, double* U, unsigned ldU,
-                       unsigned nwk, double* wk)
-    {
-        int INF;
-        dgesvd_("A", "A", &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF);
-        return INF;
-    }
-    inline int gesvdAA(unsigned m, unsigned n, float* A, float* S, float* U, unsigned ldU,
-                       unsigned nwk, float* wk)
-    {
-        int INF;
-        sgesvd_("A", "A", &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF);
-        return INF;
-    }
-
-	// Scalar product v1'*v2
-	inline double scpr(const unsigned n, const double* const v1, const double* const v2)
-	{	return ddot_(&n, v1, &scalfmm::N_ONE, v2, &scalfmm::N_ONE); }
-	inline float scpr(const unsigned n, const float* const v1, const float* const v2)
-	{	return sdot_(&n, v1, &scalfmm::N_ONE, v2, &scalfmm::N_ONE);	}
-
-
-
-	// QR factorisation
-	inline int geqrf(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk)
-	{
-		int INF;
-		dgeqrf_(&m, &n, A, &m, tau, wk, &nwk, &INF);
-		return INF;
-	}
-	inline int geqrf(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk)
-	{
-		int INF;
-		sgeqrf_(&m, &n, A, &m, tau, wk, &nwk, &INF);
-		return INF;
-	}
-    // QR factorisation with column pivoting
-    inline int geqp3(const unsigned m, const unsigned n, double* A, unsigned* jpiv, double* tau, unsigned nwk, double* wk)
-    {
-        int INF;
-        dgeqp3_(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF);
-        return INF;
-    }
-    inline int geqp3(const unsigned m, const unsigned n, float* A, unsigned* jpiv, float* tau, unsigned nwk, float* wk)
-    {
-        int INF;
-        sgeqp3_(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF);
-        return INF;
-    }	
-	inline int c_geqrf(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk)
-	{
-		int INF;
-		cgeqrf_(&m, &n, A, &m, tau, wk, &nwk, &INF);
-		return INF;
-	}
+  // C = d A B^T, A is m x p, B is n x p
+  inline void gemmt(unsigned m, unsigned p, unsigned n, double d,
+		    double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
+  {	Fdgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ZERO, C, &ldC);	}
+  inline void gemmt(unsigned m, unsigned p, unsigned n, float d,
+		    float* A, unsigned ldA, float *B, unsigned ldB,	float* C, unsigned ldC)
+  {	Fsgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ZERO, C, &ldC);	}
+  inline void c_gemmt(unsigned m, unsigned p, unsigned n, float d, // C = d A B^T through cgemm; d is a real scale factor
+		      float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) // cgemm reads alpha as a complex (re, im) pair,
+  {	float alpha[2] = {d, 0.f}; /* so widen d to (d, 0) instead of passing &d */ Fcgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, alpha, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC); }
+  inline void c_gemmt(unsigned m, unsigned p, unsigned n, double d, // C = d A B^T through zgemm; d is a real scale factor
+		      double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) // zgemm reads alpha as a complex (re, im) pair,
+  {	double alpha[2] = {d, 0.}; /* so widen d to (d, 0) instead of passing &d */ Fzgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, alpha, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC); }
+  inline void c_gemmh(unsigned m, unsigned p, unsigned n, float d, // hermitian transposed: C = d A B^H through cgemm; d is a real scale factor
+		      float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) // cgemm reads alpha as a complex (re, im) pair,
+  {	float alpha[2] = {d, 0.f}; /* so widen d to (d, 0) instead of passing &d */ Fcgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, alpha, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC); }
+  inline void c_gemmh(unsigned m, unsigned p, unsigned n, double d, // hermitian transposed: C = d A B^H through zgemm; d is a real scale factor
+		      double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) // zgemm reads alpha as a complex (re, im) pair,
+  {	double alpha[2] = {d, 0.}; /* so widen d to (d, 0) instead of passing &d */ Fzgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, alpha, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC); }
+
+  // C += d A B^T, A is m x p, B is n x p
+  inline void gemmta(unsigned m, unsigned p, unsigned n, double d,
+		     double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
+  {	Fdgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ONE, C, &ldC); }
+  inline void gemmta(unsigned m, unsigned p, unsigned n, float d,
+		     float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
+  {	Fsgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ONE, C, &ldC); }
+  inline void c_gemmta(unsigned m, unsigned p, unsigned n, float* d,
+		       float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
+  {	Fcgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); }
+  inline void c_gemmta(unsigned m, unsigned p, unsigned n, double* d,
+		       double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
+  {	Fzgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); }
+  inline void c_gemmha(unsigned m, unsigned p, unsigned n, float* d, // hermitian transposed
+		       float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC)
+  {	Fcgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); }
+  inline void c_gemmha(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed
+		       double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC)
+  {	Fzgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); }
+
+
+  // singular value decomposition
+  //
+  inline int gesvd(unsigned m, unsigned n, double* A, double* S, double* VT, unsigned ldVT,
+		   unsigned nwk, double* wk)
+  {
+    int INF;
+    Fdgesvd(scalfmm::JOB_STR+2, scalfmm::JOB_STR+3, &m, &n, A, &m, S, A, &m, VT, &ldVT,	wk, &nwk, &INF);
+    return INF;
+  }
+  //
+  //    A = U * SIGMA * conjugate-transpose(V)
+  // scalfmm::JOB_STR+2 = 'O':  the first min(m,n) columns of U (the left singular vectors) are overwritten on the array A;
+  inline int c_gesvd(unsigned m, unsigned n, double* A, double* S, double* VT, unsigned ldVT,
+		     int& nwk, double* wk,double* rwk)
+  {
+    int INF;
+    Fzgesvd(scalfmm::JOB_STR+2, scalfmm::JOB_STR+3, &m, &n, A, &m, S, A, &m, VT, &ldVT,	wk, &nwk, rwk,&INF);
+    return INF;
+  }
+
+  inline int gesvd(unsigned m, unsigned n, float* A, float* S, float* VT, unsigned ldVT,
+		   unsigned nwk, float* wk)
+  {
+    int INF;
+    Fsgesvd(scalfmm::JOB_STR+2, scalfmm::JOB_STR+3, &m, &n, A, &m, S, A, &m, VT, &ldVT,	wk, &nwk, &INF);
+    return INF;
+  }
+
+  // singular value decomposition (SO)
+  inline int gesvdSO(unsigned m, unsigned n, double* A, double* S, double* U, unsigned ldU,
+		     unsigned nwk, double* wk)
+  {
+    int INF;
+    Fdgesvd(scalfmm::JOB_STR+3, scalfmm::JOB_STR+2, &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF);
+    return INF;
+  }
+  inline int gesvdSO(unsigned m, unsigned n, float* A, float* S, float* U, unsigned ldU,
+		     unsigned nwk, float* wk)
+  {
+    int INF;
+    Fsgesvd(scalfmm::JOB_STR+3, scalfmm::JOB_STR+2, &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF);
+    return INF;
+  }
+
+  // singular value decomposition (AA)
+  inline int gesvdAA(unsigned m, unsigned n, double* A, double* S, double* U, unsigned ldU,
+		     unsigned nwk, double* wk)
+  {
+    int INF;
+    Fdgesvd("A", "A", &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF);
+    return INF;
+  }
+  inline int gesvdAA(unsigned m, unsigned n, float* A, float* S, float* U, unsigned ldU,
+		     unsigned nwk, float* wk)
+  {
+    int INF;
+    Fsgesvd("A", "A", &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF);
+    return INF;
+  }
+
+  // Scalar product v1'*v2
+  inline double scpr(const unsigned n, const double* const v1, const double* const v2)
+  {	return Fddot(&n, v1, &scalfmm::N_ONE, v2, &scalfmm::N_ONE); }
+  inline float scpr(const unsigned n, const float* const v1, const float* const v2)
+  {	return Fsdot(&n, v1, &scalfmm::N_ONE, v2, &scalfmm::N_ONE);	}
+
+
+
+  // QR factorisation
+  inline int geqrf(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk)
+  {
+    int INF;
+    Fdgeqrf(&m, &n, A, &m, tau, wk, &nwk, &INF);
+    return INF;
+  }
+  inline int geqrf(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk)
+  {
+    int INF;
+    Fsgeqrf(&m, &n, A, &m, tau, wk, &nwk, &INF);
+    return INF;
+  }
+  // QR factorisation with column pivoting
+  inline int geqp3(const unsigned m, const unsigned n, double* A, unsigned* jpiv, double* tau, unsigned nwk, double* wk)
+  {
+    int INF;
+    Fdgeqp3(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF);
+    return INF;
+  }
+  inline int geqp3(const unsigned m, const unsigned n, float* A, unsigned* jpiv, float* tau, unsigned nwk, float* wk)
+  {
+    int INF;
+    Fsgeqp3(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF);
+    return INF;
+  }	
+  inline int c_geqrf(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk)
+  {
+    int INF;
+    Fcgeqrf(&m, &n, A, &m, tau, wk, &nwk, &INF);
+    return INF;
+  }
 	
-	inline int c_geqrf(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk)
-	{
-		int INF;
-		zgeqrf_(&m, &n, A, &m, tau, wk, &nwk, &INF);
-		return INF;
-	}
-    inline int c_geqp3(const unsigned m, const unsigned n, float* A, unsigned* jpiv, float* tau, unsigned nwk, float* wk)
-    {
-        int INF;
-        cgeqp3_(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF);
-        return INF;
-    }
+  inline int c_geqrf(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk)
+  {
+    int INF;
+    Fzgeqrf(&m, &n, A, &m, tau, wk, &nwk, &INF);
+    return INF;
+  }
+  inline int c_geqp3(const unsigned m, const unsigned n, float* A, unsigned* jpiv, float* tau, unsigned nwk, float* wk)
+  {
+    int INF;
+    Fcgeqp3(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF);
+    return INF;
+  }
     
-    inline int c_geqp3(const unsigned m, const unsigned n, double* A, unsigned* jpiv, double* tau, unsigned nwk, double* wk)
-    {
-        int INF;
-        zgeqp3_(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF);
-        return INF;
-    }
-
-	// return full of Q-Matrix (QR factorization) in A
-	inline int orgqr_full(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk)
-	{
-		int INF;
-		dorgqr_(&m, &m, &n, A, &m, tau, wk, &nwk, &INF);
-		return INF;
-	}
-	inline int orgqr_full(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk)
-	{
-		int INF;
-		sorgqr_(&m, &m, &n, A, &m, tau, wk, &nwk, &INF);
-		return INF;
-	}
-	// return the leading n columns of Q-Matrix (QR factorization) in A
-	inline int orgqr(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk)
-	{
-		int INF;
-		dorgqr_(&m, &n, &n, A, &m, tau, wk, &nwk, &INF);
-		return INF;
-	}
-	inline int orgqr(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk)
-	{
-		int INF;
-		sorgqr_(&m, &n, &n, A, &m, tau, wk, &nwk, &INF);
-		return INF;
-	}
-
-
-
-    // apply Q-Matrix (from QR factorization) to C
-    // LEFT: Q(^T)C
-	inline int left_ormqr(const char* TRANS, const unsigned m, const unsigned n, const double* A, double* tau, double* C, unsigned nwk, double* wk)
-	{
-		int INF;
-		dormqr_("L", TRANS, &m, &n, &m, A, &m, tau, C, &m, wk, &nwk, &INF);
-		return INF;
-	}
-	inline int left_ormqr(const char* TRANS, const unsigned m, const unsigned n, const float* A, float* tau, float* C, unsigned nwk, float* wk)
-	{
-		int INF;
-		sormqr_("L", TRANS, &m, &n, &m, A, &m, tau, C, &m, wk, &nwk, &INF);
-		return INF;
-	}
-    // RIGHT: CQ(^T)
-	inline int right_ormqr(const char* TRANS, const unsigned m, const unsigned n, const double* A, double* tau, double* C, unsigned nwk, double* wk)
-	{
-		int INF;
-		dormqr_("R", TRANS, &m, &n, &n, A, &n, tau, C, &m, wk, &nwk, &INF);
-		return INF;
-	}
-	inline int right_ormqr(const char* TRANS, const unsigned m, const unsigned n, const float* A, float* tau, float* C, unsigned nwk, float* wk)
-	{
-		int INF;
-		sormqr_("R", TRANS, &m, &n, &n, A, &n, tau, C, &m, wk, &nwk, &INF);
-		return INF;
-	}
-
-    // Cholesky decomposition: A=LL^T (if A is symmetric definite positive)
-    inline int potrf(const unsigned m, double* A, const unsigned n)
-    { 
-		int INF;  
-        dpotrf_("L", &m, A, &n, &INF); 
-        return INF;
-    }
-    inline int potrf(const unsigned m, float* A, const unsigned n)
-    { 
-		int INF;  
-        spotrf_("L", &m, A, &n, &INF); 
-        return INF;
-    }
+  inline int c_geqp3(const unsigned m, const unsigned n, double* A, unsigned* jpiv, double* tau, unsigned nwk, double* wk)
+  {
+    int INF;
+    Fzgeqp3(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF);
+    return INF;
+  }
+
+  // return full of Q-Matrix (QR factorization) in A
+  inline int orgqr_full(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk)
+  {
+    int INF;
+    Fdorgqr(&m, &m, &n, A, &m, tau, wk, &nwk, &INF);
+    return INF;
+  }
+  inline int orgqr_full(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk)
+  {
+    int INF;
+    Fsorgqr(&m, &m, &n, A, &m, tau, wk, &nwk, &INF);
+    return INF;
+  }
+  // return the leading n columns of Q-Matrix (QR factorization) in A
+  inline int orgqr(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk)
+  {
+    int INF;
+    Fdorgqr(&m, &n, &n, A, &m, tau, wk, &nwk, &INF);
+    return INF;
+  }
+  inline int orgqr(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk)
+  {
+    int INF;
+    Fsorgqr(&m, &n, &n, A, &m, tau, wk, &nwk, &INF);
+    return INF;
+  }
+
+
+
+  // apply Q-Matrix (from QR factorization) to C
+  // LEFT: Q(^T)C
+  inline int left_ormqr(const char* TRANS, const unsigned m, const unsigned n, const double* A, double* tau, double* C, unsigned nwk, double* wk)
+  {
+    int INF;
+    Fdormqr("L", TRANS, &m, &n, &m, A, &m, tau, C, &m, wk, &nwk, &INF);
+    return INF;
+  }
+  inline int left_ormqr(const char* TRANS, const unsigned m, const unsigned n, const float* A, float* tau, float* C, unsigned nwk, float* wk)
+  {
+    int INF;
+    Fsormqr("L", TRANS, &m, &n, &m, A, &m, tau, C, &m, wk, &nwk, &INF);
+    return INF;
+  }
+  // RIGHT: CQ(^T)
+  inline int right_ormqr(const char* TRANS, const unsigned m, const unsigned n, const double* A, double* tau, double* C, unsigned nwk, double* wk)
+  {
+    int INF;
+    Fdormqr("R", TRANS, &m, &n, &n, A, &n, tau, C, &m, wk, &nwk, &INF);
+    return INF;
+  }
+  inline int right_ormqr(const char* TRANS, const unsigned m, const unsigned n, const float* A, float* tau, float* C, unsigned nwk, float* wk)
+  {
+    int INF;
+    Fsormqr("R", TRANS, &m, &n, &n, A, &n, tau, C, &m, wk, &nwk, &INF);
+    return INF;
+  }
+
+  // Cholesky decomposition: A=LL^T (if A is symmetric definite positive)
+  inline int potrf(const unsigned m, double* A, const unsigned n)
+  { 
+    int INF;  
+    Fdpotrf("L", &m, A, &n, &INF);
+    return INF;
+  }
+  inline int potrf(const unsigned m, float* A, const unsigned n)
+  { 
+    int INF;  
+    Fspotrf("L", &m, A, &n, &INF);
+    return INF;
+  }
 
 } // end namespace FCBlas
 
diff --git a/Src/Utils/FFortranMangling.hpp b/Src/Utils/FFortranMangling.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..92fc3f42e37674709aabf8c8b27c2e4022b9ea5c
--- /dev/null
+++ b/Src/Utils/FFortranMangling.hpp
@@ -0,0 +1,91 @@
+/*
+ * FFortranMangling.hpp
+ *
+ *  Created on: 6 June 2016
+ *      Author: coulaud
+ */
+
+#ifndef SRC_UTILS_FFORTRANMANGLING_HPP_
+#define SRC_UTILS_FFORTRANMANGLING_HPP_
+
+
+#include "ScalFmmConfig.h"
+
+#ifdef SCALFMM_BLAS_ADD_
+/* Mangling for Fortran subroutine symbols with underscores. */
+
+#define FortranName(name,NAME) name##_
+
+#elif defined(SCALFMM_BLAS_UPCASE)
+
+/* Mangling for Fortran subroutine  symbols in uppercase and without underscores */
+
+#define FortranName(name,NAME) NAME
+
+#elif defined(SCALFMM_BLAS_NOCHANGE)
+/* Mangling for Fortran subroutine symbols with no change. */
+
+#define FortranName(name,NAME) name
+
+#else
+
+#error("Fortran MANGLING NOT DEFINED")
+
+#endif
+
+  // blas 1
+#define Fddot  FortranName(ddot,DDOT)
+#define Fdscal FortranName(dscal,DSCAL)
+#define Fdcopy FortranName(dcopy,DCOPY)
+#define Fdaxpy FortranName(daxpy,DAXPY)
+#define Fsdot  FortranName(sdot,SDOT)
+#define Fsscal FortranName(sscal,SSCAL)
+#define Fscopy FortranName(scopy,SCOPY)
+#define Fsaxpy FortranName(saxpy,SAXPY)
+#define Fcscal FortranName(cscal,CSCAL)
+#define Fccopy FortranName(ccopy,CCOPY)
+#define Fcaxpy FortranName(caxpy,CAXPY)
+#define Fzscal FortranName(zscal,ZSCAL)
+#define Fzcopy FortranName(zcopy,ZCOPY)
+#define Fzaxpy FortranName(zaxpy,ZAXPY)
+// blas 2
+#define Fdgemv FortranName(dgemv,DGEMV)
+#define Fsgemv FortranName(sgemv,SGEMV)
+#define Fcgemv FortranName(cgemv,CGEMV)
+#define Fzgemv FortranName(zgemv,ZGEMV)
+  // blas 3
+#define Fdgemm FortranName(dgemm,DGEMM)
+#define Fsgemm FortranName(sgemm,SGEMM)
+#define Fcgemm FortranName(cgemm,CGEMM)
+#define Fzgemm FortranName(zgemm,ZGEMM)
+  // lapack
+#define Fdgesvd FortranName(dgesvd,DGESVD)
+#define Fdgeqrf FortranName(dgeqrf,DGEQRF)
+#define Fdgeqp3  FortranName(dgeqp3,DGEQP3)
+#define Fdorgqr  FortranName(dorgqr,DORGQR)
+#define Fdormqr FortranName(dormqr,DORMQR)
+#define Fdpotrf  FortranName(dpotrf,DPOTRF)
+#define Fsgesvd FortranName(sgesvd,SGESVD)
+#define Fsgeqrf FortranName(sgeqrf,SGEQRF)
+#define Fsgeqp3  FortranName(sgeqp3,SGEQP3)
+#define Fsorgqr  FortranName(sorgqr,SORGQR)
+#define Fsormqr FortranName(sormqr,SORMQR)
+#define Fspotrf  FortranName(spotrf,SPOTRF)
+#define Fcgesvd FortranName(cgesvd,CGESVD)
+#define Fcgeqrf FortranName(cgeqrf,CGEQRF)
+#define Fcgeqp3  FortranName(cgeqp3,CGEQP3)
+#define Fcorgqr  FortranName(cungqr,CUNGQR) /* LAPACK has no CORGQR: the complex single routine is CUNGQR */
+#define Fcormqr FortranName(cunmqr,CUNMQR) /* LAPACK has no CORMQR: the complex single routine is CUNMQR */
+#define Fcpotrf  FortranName(cpotrf,CPOTRF)
+#define Fzgesvd FortranName(zgesvd,ZGESVD)
+#define Fzgeqrf FortranName(zgeqrf,ZGEQRF)
+#define Fzgeqp3  FortranName(zgeqp3,ZGEQP3)
+#define Fzorgqr  FortranName(zungqr,ZUNGQR) /* LAPACK has no ZORGQR: the complex double routine is ZUNGQR */
+#define Fzormqr FortranName(zunmqr,ZUNMQR) /* LAPACK has no ZORMQR: the complex double routine is ZUNMQR */
+#define Fzpotrf  FortranName(zpotrf,ZPOTRF)
+
+
+#endif /* SRC_UTILS_FFORTRANMANGLING_HPP_ */
+
+
+
diff --git a/Src/Utils/FTic.hpp b/Src/Utils/FTic.hpp
index 911c88043091d1985fb8324bd377bd52d013c5e4..72527b6e454f50ab0a2db5d67ab54ce8c00dae2e 100644
--- a/Src/Utils/FTic.hpp
+++ b/Src/Utils/FTic.hpp
@@ -4,13 +4,13 @@
 // This software is a computer program whose purpose is to compute the FMM.
 //
 // This software is governed by the CeCILL-C and LGPL licenses and
-// abiding by the rules of distribution of free software.  
-// 
+// abiding by the rules of distribution of free software.
+//
 // This program is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public and CeCILL-C Licenses for more details.
-// "http://www.cecill.info". 
+// "http://www.cecill.info".
 // "http://www.gnu.org/licenses".
 // ===================================================================================
 #ifndef FTIC_HPP
@@ -18,7 +18,11 @@
 
 #include "FGlobal.hpp"
 
-#ifdef _OPENMP
+#define USE_STD_CHRONO
+
+#if defined(USE_STD_CHRONO)
+    #include <chrono>
+#elif defined(_OPENMP)
     #include <omp.h>
 #elif defined(WINDOWS) // We need an os specific function
     #include <time.h>
@@ -42,7 +46,7 @@
  *
  *  - use elapsed() to get the last time interval;
  *  - use cumulated() to get the total running time;
- *  - use reset() to stop and reset the counter. 
+ *  - use reset() to stop and reset the counter.
  *
  * \code
  * FTic timer;
@@ -66,12 +70,12 @@ private:
 
     double start    = 0;    ///< start time (tic)
     double end      = 0;    ///< stop time (tac)
-    double cumulate = 0;    ///< the cumulate time
+    double cumulate = 0;    ///< cumulated duration
 
 public:
     /// Constructor
     FTic() {
-        tic();
+        this->reset();
     }
 
     /// Copy constructor
@@ -100,12 +104,12 @@ public:
         res.cumulate += other.cumulate;
         return res;
     }
-    
+
     /// Resets the timer
     /**\warning Use tic() to restart the timer. */
     void reset() {
-        start    = 0;
-        end      = 0;
+        start    = FTic::GetTime();
+        end      = start;
         cumulate = 0;
     }
 
@@ -114,26 +118,35 @@ public:
         this->start = FTic::GetTime();
     }
 
+    /// Time in seconds since the timer was last started (tic() or reset()); the timer keeps running and its state is not modified.
+    double peek() const {
+        return FTic::GetTime() - this->start;
+    }
+
     /// Stop measuring time and add to cumulated time.
-    void tac(){
+    double tac(){
         this->end = FTic::GetTime();
-        cumulate += elapsed();
+        auto lapse = this->elapsed();
+        cumulate += lapse;
+        return lapse;
     }
 
     /// Elapsed time between the last tic() and tac() (in seconds).
     /** \return the time elapsed between tic() & tac() in second. */
-    double elapsed() const{
+    double elapsed() const {
         return this->end - this->start;
     }
 
     /// Cumulated tic() - tac() time spans
     /** \return the time elapsed between ALL tic() & tac() in second. */
-    double cumulated() const{
+    double cumulated() const {
         return cumulate;
     }
 
     /// Combination of tic() and elapsed().
-    /** \return the time elapsed between tic() & tac() in second. */
+    /**
+     * \todo deprecate
+     * \return the time elapsed between tic() & tac() in second. */
     double tacAndElapsed() {
         tac();
         return elapsed();
@@ -145,7 +158,11 @@ public:
      *  \return A system dependent time point.
      */
     static double GetTime(){
-#ifdef _OPENMP
+#if defined(USE_STD_CHRONO)
+        using clock = std::chrono::high_resolution_clock;
+        using duration = std::chrono::duration<double>;
+        return duration(clock::now().time_since_epoch()).count();
+#elif defined(_OPENMP)
         return omp_get_wtime();
 #elif defined(WINDOWS)
         return static_cast<double>(GetTickCount())/1000.0;
@@ -159,4 +176,3 @@ public:
 
 
 #endif
-
diff --git a/Tests/GroupTree/testBlockedUniformBench.cpp b/Tests/GroupTree/testBlockedUniformBench.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..56f3be19bacf564a455337c01a614f9ef6bab9df
--- /dev/null
+++ b/Tests/GroupTree/testBlockedUniformBench.cpp
@@ -0,0 +1,194 @@
+
+// ==== CMAKE =====
+// @FUSE_BLAS
+// @FUSE_FFT
+// @FUSE_STARPU
+// ================
+// Keep in private GIT
+
+
+#include "../../Src/Utils/FGlobal.hpp"
+
+#include "../../Src/GroupTree/Core/FGroupTree.hpp"
+
+#include "../../Src/Components/FSimpleLeaf.hpp"
+#include "../../Src/Containers/FVector.hpp"
+
+#include "../../Src/Kernels/P2P/FP2PParticleContainer.hpp"
+
+#include "Kernels/Interpolation/FInterpMatrixKernel.hpp"
+#include "../../Src/Kernels/Uniform/FUnifKernel.hpp"
+
+#include "../../Src/GroupTree/Uniform/FUnifCellPOD.hpp"
+
+#include "../../Src/Utils/FMath.hpp"
+#include "../../Src/Utils/FMemUtils.hpp"
+#include "../../Src/Utils/FParameters.hpp"
+
+#include "../../Src/Files/FRandomLoader.hpp"
+#include "../../Src/Files/FFmaGenericLoader.hpp"
+
+#include "../../Src/GroupTree/Core/FGroupSeqAlgorithm.hpp"
+#include "../../Src/GroupTree/Core/FGroupTaskAlgorithm.hpp"
+#ifdef SCALFMM_USE_OMP4
+#include "../../Src/GroupTree/Core/FGroupTaskDepAlgorithm.hpp"
+#endif
+#ifdef SCALFMM_USE_STARPU
+#include "../../Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp"
+#include "../../Src/GroupTree/StarPUUtils/FStarPUKernelCapacities.hpp"
+#endif
+#include "../../Src/GroupTree/Core/FP2PGroupParticleContainer.hpp"
+
+#include "../../Src/Utils/FParameterNames.hpp"
+
+#include <memory>
+
+
+#define RANDOM_PARTICLES
+
+int main(int argc, char* argv[]){
+    const FParameterNames LocalOptionBlocSize { {"-bs"}, "The size of the block of the blocked tree"};
+    const FParameterNames LocalOptionValidate { {"-validation"}, "To compare with direct computation"};
+    FHelpDescribeAndExit(argc, argv, "Perform Lagrange Kernel based simulation with StarPU",
+                         FParameterDefinitions::OctreeHeight,
+#ifdef RANDOM_PARTICLES
+                         FParameterDefinitions::NbParticles,
+#else
+                         FParameterDefinitions::InputFile,
+#endif
+                         FParameterDefinitions::NbThreads,
+                         LocalOptionBlocSize, LocalOptionValidate);
+
+    // Initialize the types: double-precision reals, interpolation order 5, CPU-only StarPU wrapper
+    typedef double FReal;
+    static const int ORDER = 5;
+    typedef FInterpMatrixKernelR<FReal> MatrixKernelClass;
+
+    typedef FUnifCellPODCore         GroupCellSymbClass;
+    typedef FUnifCellPODPole<FReal,ORDER>  GroupCellUpClass;
+    typedef FUnifCellPODLocal<FReal,ORDER> GroupCellDownClass;
+    typedef FUnifCellPOD<FReal,ORDER>      GroupCellClass;
+
+    typedef FP2PGroupParticleContainer<FReal>          GroupContainerClass;
+    typedef FGroupTree< FReal, GroupCellClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass, GroupContainerClass, 1, 4, FReal>  GroupOctreeClass;
+
+    typedef FStarPUAllCpuCapacities<FUnifKernel<FReal,GroupCellClass,GroupContainerClass,MatrixKernelClass,ORDER>> GroupKernelClass;
+    typedef FStarPUCpuWrapper<typename GroupOctreeClass::CellGroupClass, GroupCellClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass> GroupCpuWrapper;
+    typedef FGroupTaskStarPUAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupCpuWrapper > GroupAlgorithm;
+
+    // Get params: tree height (default 5) and block size (default 250)
+    const int NbLevels      = FParameters::getValue(argc,argv,FParameterDefinitions::OctreeHeight.options, 5);
+    const int groupSize     = FParameters::getValue(argc,argv,LocalOptionBlocSize.options, 250);
+
+    // Load the particles (FRandomLoader when RANDOM_PARTICLES is defined, an FMA file otherwise)
+#ifdef RANDOM_PARTICLES
+    FRandomLoader<FReal> loader(FParameters::getValue(argc,argv,FParameterDefinitions::NbParticles.options, 2000), 1.0, FPoint<FReal>(0,0,0), 0);
+#else
+    const char* const filename = FParameters::getStr(argc,argv,FParameterDefinitions::InputFile.options, "../Data/test20k.fma");
+    FFmaGenericLoader<FReal> loader(filename);
+#endif
+    FAssertLF(loader.isOpen());
+    FTic timer;
+
+    FP2PParticleContainer<FReal> allParticles;
+    for(FSize idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){
+        FPoint<FReal> particlePosition;
+        FReal physicalValue;
+#ifdef RANDOM_PARTICLES
+        physicalValue = 0.10;
+        loader.fillParticle(&particlePosition);
+#else
+        loader.fillParticle(&particlePosition, &physicalValue);
+#endif
+        allParticles.push(particlePosition, physicalValue);
+    }
+    std::cout << "Particles loaded in " << timer.tacAndElapsed() << "s\n";
+
+    // Put the data into the tree
+    timer.tic();
+    GroupOctreeClass groupedTree(NbLevels, loader.getBoxWidth(), loader.getCenterOfBox(), groupSize, &allParticles);
+    groupedTree.printInfoBlocks();
+    std::cout << "Tree created in " << timer.tacAndElapsed() << "s\n";
+
+    // Run the algorithm; only execute() is timed and reported as @EXEC TIME
+    const MatrixKernelClass MatrixKernel;
+    GroupKernelClass groupkernel(NbLevels, loader.getBoxWidth(), loader.getCenterOfBox(), &MatrixKernel);
+    GroupAlgorithm groupalgo(&groupedTree,&groupkernel);
+
+    timer.tic();
+    groupalgo.execute();
+    timer.tac();
+    std::cout << "@EXEC TIME = " << timer.elapsed() << "s\n";
+
+    // Validate the result against a direct all-pairs computation (only when -validation is given)
+    if(FParameters::existParameter(argc, argv, LocalOptionValidate.options) == true){
+        FSize offsetParticles = 0;
+        FReal*const allPhysicalValues = allParticles.getPhysicalValues();
+        FReal*const allPosX = const_cast<FReal*>( allParticles.getPositions()[0]);
+        FReal*const allPosY = const_cast<FReal*>( allParticles.getPositions()[1]);
+        FReal*const allPosZ = const_cast<FReal*>( allParticles.getPositions()[2]);
+        // Copy the particles back in leaf order so direct results share the tree's particle indexing
+        groupedTree.forEachCellLeaf<FP2PGroupParticleContainer<FReal> >([&](GroupCellClass cellTarget, FP2PGroupParticleContainer<FReal> * leafTarget){
+            const FReal*const physicalValues = leafTarget->getPhysicalValues();
+            const FReal*const posX = leafTarget->getPositions()[0];
+            const FReal*const posY = leafTarget->getPositions()[1];
+            const FReal*const posZ = leafTarget->getPositions()[2];
+            const FSize nbPartsInLeafTarget = leafTarget->getNbParticles();
+
+            for(FSize idxPart = 0 ; idxPart < nbPartsInLeafTarget ; ++idxPart){
+                allPhysicalValues[offsetParticles + idxPart] = physicalValues[idxPart];
+                allPosX[offsetParticles + idxPart] = posX[idxPart];
+                allPosY[offsetParticles + idxPart] = posY[idxPart];
+                allPosZ[offsetParticles + idxPart] = posZ[idxPart];
+            }
+
+            offsetParticles += nbPartsInLeafTarget;
+        });
+
+        FAssertLF(offsetParticles == loader.getNumberOfParticles());
+
+        FReal*const allDirectPotentials = allParticles.getPotentials();
+        FReal*const allDirectforcesX = allParticles.getForcesX();
+        FReal*const allDirectforcesY = allParticles.getForcesY();
+        FReal*const allDirectforcesZ = allParticles.getForcesZ();
+        // Visit each unordered pair once; indices are FSize to match the FSize particle count
+        for(FSize idxTgt = 0 ; idxTgt < offsetParticles ; ++idxTgt){
+            for(FSize idxMutual = idxTgt + 1 ; idxMutual < offsetParticles ; ++idxMutual){
+                FP2PR::MutualParticles(
+                    allPosX[idxTgt],allPosY[idxTgt],allPosZ[idxTgt], allPhysicalValues[idxTgt],
+                    &allDirectforcesX[idxTgt], &allDirectforcesY[idxTgt], &allDirectforcesZ[idxTgt], &allDirectPotentials[idxTgt],
+                    allPosX[idxMutual],allPosY[idxMutual],allPosZ[idxMutual], allPhysicalValues[idxMutual],
+                    &allDirectforcesX[idxMutual], &allDirectforcesY[idxMutual], &allDirectforcesZ[idxMutual], &allDirectPotentials[idxMutual]
+                );
+            }
+        }
+
+        FMath::FAccurater<FReal> potentialDiff;
+        FMath::FAccurater<FReal> fx, fy, fz;
+        offsetParticles = 0;
+        groupedTree.forEachCellLeaf<FP2PGroupParticleContainer<FReal> >([&](GroupCellClass cellTarget, FP2PGroupParticleContainer<FReal> * leafTarget){
+            const FReal*const potentials = leafTarget->getPotentials();
+            const FReal*const forcesX = leafTarget->getForcesX();
+            const FReal*const forcesY = leafTarget->getForcesY();
+            const FReal*const forcesZ = leafTarget->getForcesZ();
+            const FSize nbPartsInLeafTarget = leafTarget->getNbParticles();
+
+            for(FSize idxTgt = 0 ; idxTgt < nbPartsInLeafTarget ; ++idxTgt){
+                potentialDiff.add(allDirectPotentials[idxTgt + offsetParticles], potentials[idxTgt]);
+                fx.add(allDirectforcesX[idxTgt + offsetParticles], forcesX[idxTgt]);
+                fy.add(allDirectforcesY[idxTgt + offsetParticles], forcesY[idxTgt]);
+                fz.add(allDirectforcesZ[idxTgt + offsetParticles], forcesZ[idxTgt]);
+            }
+
+            offsetParticles += nbPartsInLeafTarget;
+        });
+
+        std::cout << "Error : Potential " << potentialDiff << "\n";
+        std::cout << "Error : fx " << fx << "\n";
+        std::cout << "Error : fy " << fy << "\n";
+        std::cout << "Error : fz " << fz << "\n";
+    }
+
+    return 0;
+}
+
diff --git a/Tests/GroupTree/testBlockedUniformCompare.cpp b/Tests/GroupTree/testBlockedUniformCompare.cpp
index 7571699259f5321c1bdfec73655a36584161ac50..a1de9692eb99e1430d4a09f84872946f06c97078 100644
--- a/Tests/GroupTree/testBlockedUniformCompare.cpp
+++ b/Tests/GroupTree/testBlockedUniformCompare.cpp
@@ -496,27 +496,51 @@ struct RunContainer{
                         typedef FFmmAlgorithmThreadBalance<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
                         std::cout << "Using FFmmAlgorithmThreadBalance " << std::endl;
                         FmmClass algorithm(&tree, &kernels);
+#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME)
+                        // The taskname() clause is only supported by KSTAR. Make sure
+                        // to set it from CMake to enable tracing.
+                        starpu_fxt_start_profiling();
+#endif
                         time.tic();
                         algorithm.execute();
                         time.tac();
+#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME)
+                        starpu_fxt_stop_profiling();
+#endif
                         std::cout << "Done  " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl;
                     }
                     else if(FParameters::existParameter(argc, argv, LocalOptionOmpTask.options)){
                         typedef FFmmAlgorithmTask<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
                         std::cout << "Using FFmmAlgorithmTask " << std::endl;
                         FmmClass algorithm(&tree, &kernels);
+#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME)
+                        // The taskname() clause is only supported by KSTAR. Make sure
+                        // to set it from CMake to enable tracing.
+                        starpu_fxt_start_profiling();
+#endif
                         time.tic();
                         algorithm.execute();
                         time.tac();
+#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME)
+                        starpu_fxt_stop_profiling();
+#endif
                         std::cout << "Done  " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl;
                     }
                     else if(FParameters::existParameter(argc, argv, LocalOptionOmpSection.options)){
                         typedef FFmmAlgorithmSectionTask<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
                         std::cout << "Using FFmmAlgorithmSectionTask " << std::endl;
                         FmmClass algorithm(&tree, &kernels);
+#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME)
+                        // The taskname() clause is only supported by KSTAR. Make sure
+                        // to set it from CMake to enable tracing.
+                        starpu_fxt_start_profiling();
+#endif
                         time.tic();
                         algorithm.execute();
                         time.tac();
+#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME)
+                        starpu_fxt_stop_profiling();
+#endif
                         std::cout << "Done  " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl;
                     }
     #ifdef SCALFMM_USE_OMP4
@@ -524,9 +548,17 @@ struct RunContainer{
                         typedef FFmmAlgorithmOmp4<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
                         std::cout << "Using FFmmAlgorithmOmp4 " << std::endl;
                         FmmClass algorithm(&tree, &kernels);
+#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME)
+                        // The taskname() clause is only supported by KSTAR. Make sure
+                        // to set it from CMake to enable tracing.
+                        starpu_fxt_start_profiling();
+#endif
                         time.tic();
                         algorithm.execute();
                         time.tac();
+#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME)
+                        starpu_fxt_stop_profiling();
+#endif
                         std::cout << "Done  " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl;
                     }
     #endif
@@ -534,9 +566,17 @@ struct RunContainer{
                         typedef FFmmAlgorithmThread<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
                         std::cout << "Using FFmmAlgorithmThread " << std::endl;
                         FmmClass algorithm(&tree, &kernels);
+#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME)
+                        // The taskname() clause is only supported by KSTAR. Make sure
+                        // to set it from CMake to enable tracing.
+                        starpu_fxt_start_profiling();
+#endif
                         time.tic();
                         algorithm.execute();
                         time.tac();
+#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME)
+                        starpu_fxt_stop_profiling();
+#endif
                         std::cout << "Done  " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl;
                     }
                 } // -----------------------------------------------------
@@ -848,9 +888,17 @@ struct RunContainer{
                     typedef FFmmAlgorithmThread<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass;
                     std::cout << "Using FFmmAlgorithmThread " << std::endl;
                     FmmClass algorithm(&tree, &kernels);
+#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME)
+                    // The taskname() clause is only supported by KSTAR. Make sure
+                    // to set it from CMake to enable tracing.
+                    starpu_fxt_start_profiling();
+#endif
                     time.tic();
                     algorithm.execute();
                     time.tac();
+#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME)
+                    starpu_fxt_stop_profiling();
+#endif
                     std::cout << "Done  " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl;
                 }
 
diff --git a/Tests/Utils/testOctreeRearrangeTsm.cpp b/Tests/Utils/testOctreeRearrangeTsm.cpp
index b6b6f4db93b2d9a55dac6ebfad3f7870991fc1f0..e859f5f6f1d7e6351023ea5ac8202c0484d76b9a 100644
--- a/Tests/Utils/testOctreeRearrangeTsm.cpp
+++ b/Tests/Utils/testOctreeRearrangeTsm.cpp
@@ -98,7 +98,7 @@ int main(int argc, char ** argv){
                                        (BoxWidth*FReal(drand48())) + (BoxCenter-(BoxWidth/2)),
                                        (BoxWidth*FReal(drand48())) + (BoxCenter-(BoxWidth/2)),
                                        (BoxWidth*FReal(drand48())) + (BoxCenter-(BoxWidth/2)));
-            tree.insert(particleToFill,FParticleTypeSource,idxPart);
+            tree.insert(particleToFill,FParticleType::FParticleTypeSource,idxPart);
         }
 
         for(FSize idxPart = 0 ; idxPart < NbPart_Target; ++idxPart){
@@ -106,7 +106,7 @@ int main(int argc, char ** argv){
                                        (BoxWidth*FReal(drand48())) + (BoxCenter-(BoxWidth/2)),
                                        (BoxWidth*FReal(drand48())) + (BoxCenter-(BoxWidth/2)),
                                        (BoxWidth*FReal(drand48())) + (BoxCenter-(BoxWidth/2)));
-            tree.insert(particleToFill,FParticleTypeTarget,idxPart);
+            tree.insert(particleToFill,FParticleType::FParticleTypeTarget,idxPart);
         }
     }
 
diff --git a/Tests/noDist/ChebyshevPeriodic.cpp b/Tests/noDist/ChebyshevPeriodic.cpp
index 58425bb91554795e28c847fd014f2832deff99c9..d1babc715d7b083940d0e7c9c5b4b3ed714cbcf6 100644
--- a/Tests/noDist/ChebyshevPeriodic.cpp
+++ b/Tests/noDist/ChebyshevPeriodic.cpp
@@ -83,12 +83,6 @@ int main(int argc, char* argv[])
     const unsigned int NbThreads        = FParameters::getValue(argc, argv, FParameterDefinitions::NbThreads.options, 1);
     const int PeriodicDeep                     = FParameters::getValue(argc,argv,FParameterDefinitions::PeriodicityNbLevels.options, 3);
 
-#ifdef _OPENMP
-    omp_set_num_threads(NbThreads);
-    std::cout << "\n>> Using " << omp_get_max_threads() << " threads.\n" << std::endl;
-#else
-    std::cout << "\n>> Sequential version.\n" << std::endl;
-#endif
     //
     std::cout <<	 "Parameters  "<< std::endl
                   <<     "\t      Octree Depth      \t"<< TreeHeight <<std::endl
diff --git a/Tests/noDist/FMMnonUnitCube.cpp b/Tests/noDist/FMMnonUnitCube.cpp
index 08bcb11857fb046fc9984d8f9adaa36526a11d04..dc9d79bc6d6a06264a41ba1a2272b797cfd84b52 100644
--- a/Tests/noDist/FMMnonUnitCube.cpp
+++ b/Tests/noDist/FMMnonUnitCube.cpp
@@ -98,20 +98,11 @@ int main(int argc, char* argv[])
     const std::string  filename(FParameters::getStr(argc,argv,FParameterDefinitions::InputFile.options, "../Data/UTest/unitCubeRef20kDouble.bfma"));
     const unsigned int TreeHeight    = FParameters::getValue(argc, argv, FParameterDefinitions::OctreeHeight.options, 5);
     const unsigned int SubTreeHeight = FParameters::getValue(argc, argv, FParameterDefinitions::OctreeSubHeight.options, 2);
-    const unsigned int NbThreads      = FParameters::getValue(argc, argv, FParameterDefinitions::NbThreads.options, omp_get_max_threads());
-
-	//
-#ifdef _OPENMP
-	omp_set_num_threads(NbThreads);
-#else
-	std::cout << "\n>> Sequential version.\n" << std::
-#endif
 
 			std::cout <<	 "Parameters  "<< std::endl
 			<<     "      Octree Depth      \t"<< TreeHeight <<std::endl
 			<<	  "      SubOctree depth \t"<< SubTreeHeight <<std::endl
 			<<     "      Input file  name: \t" <<filename <<std::endl
-			<<     "      Thread number:  \t" << NbThreads <<std::endl
 			<<std::endl;
 
 	// init timer
diff --git a/Tests/noDist/PerfTest.cpp b/Tests/noDist/PerfTest.cpp
deleted file mode 100644
index e036f686329e6e9f2ecd7beb0776a81106c83f4d..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest.cpp
+++ /dev/null
@@ -1,204 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-
-/**
- * \file
- * \author Quentin Khan
- *
- * This program is used to run different performance tests for the various
- * algorithms that have been implemented for ScalFMM.
- *
- * See the PerfUtils.hpp file classes for some more in depth information. Run
- * with argument --help for usage information.
- */
-
-
-#include <iostream>
-#include <string>
-
-#include "Utils/FParameters.hpp"
-#include "Utils/FParameterNames.hpp"
-
-#include "PerfTest/PerfTestUtils.hpp"
-
-#include "PerfTest/TreeLoaderBasic.hpp"
-#include "PerfTest/TreeLoaderFCheb.hpp"
-
-#ifdef SCALFMM_USE_MPI
-#include "PerfTest/TreeLoaderMpiSplitFCheb.hpp"
-#include "PerfTest/TreeLoaderMpiGenericFCheb.hpp"
-#endif
-
-#include "PerfTest/KernelLoaderFChebSym.hpp"
-
-#include "PerfTest/AlgoLoaderThread.hpp"
-#include "PerfTest/AlgoLoaderTask.hpp"
-#include "PerfTest/AlgoLoaderSectionTask.hpp"
-#include "PerfTest/AlgoLoaderCostZones.hpp"
-#include "PerfTest/AlgoLoaderThreadBalance.hpp"
-
-#ifdef SCALFMM_USE_MPI
-#include "PerfTest/AlgoLoaderThreadProc.hpp"
-#endif
-
-#define HOST_NAME_MAX 64
-
-/**
- * \brief Runs a generic sequence of actions to use an algorithm.
- *
- * This function runs the basic steps that are needed to run an FMM algorithm
- * over a set of particles. It does the following steps :
- *
- *    - Load a tree using the class defined as a TreeLoader
- *    - Prepare the needed kernels using the KernelLoader
- *    - Prepare and run the algorithm using the AlgorithmLoader
- *
- * See documentation of FTreeLoader, FKernelLoader, FAlgoLoader.
- */
-template <class TreeLoader,
-          template <typename TL_1> class KernelLoader,
-          template <typename TL_2, template <typename TL_3> class KL> class AlgoLoader>
-void runperf(FPerfTestParams& params)
-{
-    TreeLoader treeLoader(params);
-    KernelLoader<TreeLoader> kernelLoader(params, treeLoader);
-    AlgoLoader<TreeLoader, KernelLoader> algoLoader(params, treeLoader, kernelLoader);
-    algoLoader.run();
-
-    char hostname[HOST_NAME_MAX];
-    memset(hostname,'\0',HOST_NAME_MAX);
-    if ( -1 == gethostname(hostname, HOST_NAME_MAX-1) ) {
-        perror("Could not get hostname");
-        strncpy(hostname, "unknown", HOST_NAME_MAX);
-    }
-
-    std::cout << "@@ "
-              << "host:" << hostname << " "
-              << "algo:" << params.algo << " "
-              << "file:" << params.filename.substr(
-                  params.filename.find_last_of('/')+1 ) << " "
-              << "particles:" << treeLoader._loader.getNumberOfParticles() << " "
-              << "procs:"     << params.nbProcs                            << " "
-              << "threads:"   << params.nbThreads                          << " "
-              << "height:"    << params.treeHeight                         << " "
-              << "subheight:" << params.subTreeHeight                      << " "
-              << algoLoader.getRunInfoString()
-              << "P2M:" << algoLoader.getCumulatedTime(FAlgorithmTimers::P2MTimer)     << " "
-              << "M2M:" << algoLoader.getCumulatedTime(FAlgorithmTimers::M2MTimer)     << " "
-              << "M2L:" << algoLoader.getCumulatedTime(FAlgorithmTimers::M2LTimer)     << " "
-              << "L2L:" << algoLoader.getCumulatedTime(FAlgorithmTimers::L2LTimer)     << " "
-              << "P2PL2P:" << algoLoader.getCumulatedTime(FAlgorithmTimers::NearTimer) << " "
-              << std::endl;
-}
-
-namespace ParName {
-    const FParameterNames Algo = {{"--algo"},"Algorithm to run (basic, task, costzones, sectiontask, autobalance"
-#ifdef SCALFMM_USE_MPI
-                                  ", mpi-split, mpi-generic"
-#endif
-                                  ")."};
-    const FParameterNames Schedule = {{"--schedule"},"OpenMP scheduling policy (static, dynamic)."};
-    const FParameterNames ChunkSize = {{"--chunk-size"},"OpenMP chunk size for basic dynamic algorithm."};
-}
-
-int main (int argc, char** argv)
-{
-    // Parameter handling //////////////
-    FHelpDescribeAndExit(argc, argv,
-                         "Performance test program for FMM balancing techniques. "
-#ifdef SCALFMM_USE_MPI
-                         "This program has been compiled with MPI superpowers !"
-#endif
-                         ,
-                         FParameterDefinitions::InputFile,
-                         FParameterDefinitions::OctreeHeight,
-                         FParameterDefinitions::OctreeSubHeight,
-                         FParameterDefinitions::NbThreads,
-                         ParName::Algo,
-                         ParName::Schedule,
-                         ParName::ChunkSize);
-    FPerfTestParams params;
-    {
-        using namespace FParameterDefinitions;
-        using namespace FParameters;
-        params.filename      = getStr(argc,argv,InputFile.options,
-                                 "../Data/unitCubeXYZQ100.bfma");
-        params.treeHeight    = getValue(argc, argv, OctreeHeight.options, 5);
-        params.subTreeHeight = getValue(argc, argv, OctreeSubHeight.options, 2);
-        params.nbThreads     = getValue(argc, argv, NbThreads.options, 1);
-        params.algo = getStr(argc,argv,ParName::Algo.options,"task");
-        params.omp_chunk_size = getValue(argc, argv, ParName::ChunkSize.options, 0);
-
-#ifdef SCALFMM_USE_MPI
-        std::string prefix("mpi-");
-        if( params.algo.substr(0, prefix.size()) == prefix ) {
-            params.mpiContext = new FMpi(argc,argv);
-            params.nbProcs = params.mpiContext->global().processCount();
-        }
-#endif
-    }
-    // End of Parameter handling ///////
-
-    char hostname[HOST_NAME_MAX];
-    memset(hostname,'\0',HOST_NAME_MAX);
-    if ( -1 == gethostname(hostname, HOST_NAME_MAX-1) ) {
-        perror("Could not get hostname");
-        strncpy(hostname, "unknown", HOST_NAME_MAX);
-    }
-    std::cout << "Hostname: " << hostname << std::endl;
-
-    omp_set_num_threads(params.nbThreads);
-
-    using FReal = double;
-    constexpr const int ORDER = 7;
-
-    if( "basic" == params.algo ) {
-        runperf<TreeLoaderFCheb<FReal,ORDER>,
-                KernelLoaderFChebSym,
-                AlgoLoaderThread>
-            (params);
-    } else if( "task" == params.algo ) {
-        runperf<TreeLoaderFCheb<FReal,ORDER>,
-                KernelLoaderFChebSym,
-                AlgoLoaderTask>
-            (params);
-    } else if ( "costzones" == params.algo ) {
-        runperf<TreeLoaderFCheb<FReal,ORDER>,
-                KernelLoaderFChebSym,
-                AlgoLoaderCostZones>
-            (params);
-    } else if ( "sectiontask" == params.algo ) {
-        runperf<TreeLoaderFCheb<FReal,ORDER>,
-                KernelLoaderFChebSym,
-                AlgoLoaderSectionTask>
-            (params);
-    } else if ( "autobalance" == params.algo ) {
-        runperf<TreeLoaderFCheb<FReal,ORDER>,
-                KernelLoaderFChebSym,
-                AlgoLoaderThreadBalance>
-            (params);
-#ifdef SCALFMM_USE_MPI
-    } else if ( "mpi-split" == params.algo ) {
-        runperf<TreeLoaderMpiSplitFCheb<FReal,ORDER>,
-                KernelLoaderFChebSym,
-                AlgoLoaderThreadProc>
-            (params);        
-    } else if ( "mpi-generic" == params.algo ) {
-        runperf<TreeLoaderMpiGenericFCheb<FReal,ORDER>,
-                KernelLoaderFChebSym,
-                AlgoLoaderThreadProc>
-            (params);        
-#endif
-    } else {
-        std::cout << "Unknown algorithm: " << params.algo << std::endl;
-    }
-    
-#ifdef SCALFMM_USE_MPI
-    if( nullptr != params.mpiContext ) {
-        delete params.mpiContext;
-    }
-#endif
-
-}
diff --git a/Tests/noDist/PerfTest/AlgoLoaderCostZones.hpp b/Tests/noDist/PerfTest/AlgoLoaderCostZones.hpp
deleted file mode 100644
index 017c553815d7001b208fbcf2c2b8177a01a5b258..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest/AlgoLoaderCostZones.hpp
+++ /dev/null
@@ -1,113 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-#ifndef _ALGOLOADERCOSTZONES_HPP_
-#define _ALGOLOADERCOSTZONES_HPP_
-
-#include <memory>
-#include <sstream>
-
-#include "PerfTestUtils.hpp"
-
-#include "Core/FFmmAlgorithmThread.hpp"
-
-#include "BalanceTree/FFmmAlgorithmThreadBalanced.hpp"
-#include "BalanceTree/FCostCell.hpp"
-#include "BalanceTree/FCostZones.hpp"
-
-/**
- * \brief Algorithm loader for FFmmAlgorithmThreadBalanced.
- *
- * See FAlgoLoader documentation.
- *
- * \warning : This loader requires that the KernelLoader supply a type definition
- * for a `CostKernelClass`
- */
-template <class _TreeLoader, template<typename> class _KernelLoader>
-class AlgoLoaderCostZones : public FAlgoLoader<_TreeLoader, _KernelLoader> {
-public:
-    // Types definitions
-
-    /// The TreeLoader type that is used.
-    using TreeLoader     = _TreeLoader;
-    using KernelLoader   = _KernelLoader<TreeLoader>;
-
-    using FReal          = typename TreeLoader::FReal;
-    using CellClass      = typename TreeLoader::CellClass;
-    using ContainerClass = typename TreeLoader::ContainerClass;
-    using LeafClass      = typename TreeLoader::LeafClass;
-    using OctreeClass    = typename TreeLoader::OctreeClass;
-    using KernelClass    = typename KernelLoader::KernelClass;
-    using CostKernelClass= typename KernelLoader::CostKernelClass;
-
-    static_assert(std::is_base_of<FCostCellTypeTrait, CellClass>::value,
-        "The tree cells must derive from FCostCell.");
-
-    using FMMClass = FFmmAlgorithmThreadBalanced
-        <OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass>;    
-    using CostFmmClass = FFmmAlgorithmThread
-        <OctreeClass, CellClass, ContainerClass, CostKernelClass, LeafClass>;
-
-    std::stringstream _infostring;
-    TreeLoader& _treeLoader;
-    KernelLoader& _kernelLoader;
-
-    std::unique_ptr<FMMClass> _algo;
-    
-
-    /// Builds the loader
-    AlgoLoaderCostZones(FPerfTestParams& /*params*/,
-                        TreeLoader& treeLoader,
-                        KernelLoader& kernelLoader) :
-        _treeLoader(treeLoader),
-        _kernelLoader(kernelLoader),
-        _algo(nullptr) {
-        
-    }
-
-    /// Computes the tree cells costs then runs the costzones and FMM algorithms.
-    void run() {
-        // The tree loader holds the tree structure
-        OctreeClass* p_tree = &(_treeLoader._tree);
-
-        // Compute tree cells costs
-        CostFmmClass costAlgo(p_tree, &(_kernelLoader._costKernel));
-
-        this->time.tic();
-        costAlgo.execute();
-        this->time.tac();
-        std::cout << "Generating tree cost: " << this->time.elapsed() << "s.\n";
-        _infostring << "costgen:" << this->time.elapsed() << " ";
-
-        // Compute cost zones
-        FCostZones<OctreeClass, CellClass> costzones(p_tree, omp_get_max_threads());
-
-        this->time.tic();
-        costzones.run();
-        this->time.tac();
-        std::cout << "Generating cost zones: " << this->time.elapsed() << "s.\n";
-        _infostring << "zonegen:" << this->time.elapsed() << " ";
-
-        // Execute FFM algorithm
-        this->time.tic();
-        _algo = std::unique_ptr<FMMClass>(
-            new FMMClass(p_tree, &(_kernelLoader._kernel),
-                         costzones.getZoneBounds(), costzones.getLeafZoneBounds()));
-        _algo->execute();
-        this->time.tac();
-    }
-
-    std::string getRunInfoString() const {
-        return _infostring.str();
-    }
-
-    double getCumulatedTime(FAlgorithmTimers::FTimers timerName) const {
-        return _algo->getCumulatedTime(timerName);
-    }
-
-};
-
-
-
-#endif
diff --git a/Tests/noDist/PerfTest/AlgoLoaderSectionTask.hpp b/Tests/noDist/PerfTest/AlgoLoaderSectionTask.hpp
deleted file mode 100644
index 245e3c477f3a8f6f848cf08980bf7aaaed6093ef..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest/AlgoLoaderSectionTask.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-#ifndef _ALGOLOADERSECTIONTASK_HPP_
-#define _ALGOLOADERSECTIONTASK_HPP_
-
-#include <memory>
-
-#include "PerfTestUtils.hpp"
-
-#include "Core/FFmmAlgorithmSectionTask.hpp"
-
-
-template <class _TreeLoader, template<typename> class _KernelLoader>
-class AlgoLoaderSectionTask : public FAlgoLoader<_TreeLoader, _KernelLoader> {
-public:
-    using TreeLoader     = _TreeLoader;
-    using KernelLoader   = _KernelLoader<TreeLoader>;
-
-
-    using FReal          = typename TreeLoader::FReal;
-    using CellClass      = typename TreeLoader::CellClass;
-    using ContainerClass = typename TreeLoader::ContainerClass;
-    using LeafClass      = typename TreeLoader::LeafClass;
-    using OctreeClass    = typename TreeLoader::OctreeClass;
-    using KernelClass    = typename KernelLoader::KernelClass;
-
-    using FMMClass = FFmmAlgorithmSectionTask<OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass>;
-    
-    TreeLoader& _treeLoader;
-    KernelLoader& _kernelLoader;
-
-    std::unique_ptr<FMMClass> _algo;
-
-    AlgoLoaderSectionTask(FPerfTestParams& /*params*/,
-                   TreeLoader& treeLoader,
-                   KernelLoader& kernelLoader) :
-        _treeLoader(treeLoader),
-        _kernelLoader(kernelLoader),
-        _algo(nullptr) {
-        
-    }
-
-
-    void run() {
-        _algo = std::unique_ptr<FMMClass>(
-            new FMMClass(&(_treeLoader._tree), &(_kernelLoader._kernel)));
-        _algo->execute();
-    }
-
-    double getCumulatedTime(FAlgorithmTimers::FTimers timerName) const {
-        return _algo->getCumulatedTime(timerName);
-    }
-};
-
-
-
-#endif
diff --git a/Tests/noDist/PerfTest/AlgoLoaderTask.hpp b/Tests/noDist/PerfTest/AlgoLoaderTask.hpp
deleted file mode 100644
index 2422f7e7c6efb7c7a27fa6b916424276c785a992..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest/AlgoLoaderTask.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-#ifndef _ALGOLOADERTASK_HPP_
-#define _ALGOLOADERTASK_HPP_
-
-#include <memory>
-
-#include "PerfTestUtils.hpp"
-
-#include "Core/FFmmAlgorithmTask.hpp"
-
-
-template <class _TreeLoader, template<typename> class _KernelLoader>
-class AlgoLoaderTask : public FAlgoLoader<_TreeLoader, _KernelLoader> {
-public:
-    using TreeLoader     = _TreeLoader;
-    using KernelLoader   = _KernelLoader<TreeLoader>;
-
-
-    using FReal          = typename TreeLoader::FReal;
-    using CellClass      = typename TreeLoader::CellClass;
-    using ContainerClass = typename TreeLoader::ContainerClass;
-    using LeafClass      = typename TreeLoader::LeafClass;
-    using OctreeClass    = typename TreeLoader::OctreeClass;
-    using KernelClass    = typename KernelLoader::KernelClass;
-
-    using FMMClass = FFmmAlgorithmTask<OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass>;
-    
-    TreeLoader& _treeLoader;
-    KernelLoader& _kernelLoader;
-
-    std::unique_ptr<FMMClass> _algo;
-
-    AlgoLoaderTask(FPerfTestParams& /*params*/,
-                   TreeLoader& treeLoader,
-                   KernelLoader& kernelLoader) :
-        _treeLoader(treeLoader),
-        _kernelLoader(kernelLoader),
-        _algo(nullptr) {
-        
-    }
-
-
-    void run() {
-        _algo = std::unique_ptr<FMMClass>(
-            new FMMClass(&(_treeLoader._tree), &(_kernelLoader._kernel)));
-        _algo->execute();
-    }
-
-    double getCumulatedTime(FAlgorithmTimers::FTimers timerName) const {
-        return _algo->getCumulatedTime(timerName);
-    }
-};
-
-
-
-#endif
diff --git a/Tests/noDist/PerfTest/AlgoLoaderThread.hpp b/Tests/noDist/PerfTest/AlgoLoaderThread.hpp
deleted file mode 100644
index 322ec6fd205e875a07d67ae26a6bde4dc1160ec3..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest/AlgoLoaderThread.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-#ifndef _ALGOLOADERTHREAD_HPP_
-#define _ALGOLOADERTHREAD_HPP_
-
-#include <memory>
-#include <sstream>
-
-#include "PerfTestUtils.hpp"
-
-#include "Core/FFmmAlgorithmThread.hpp"
-
-/**
- * \brief Algorithm loader for FFmmAlgorithmThread
- *
- * See FAlgoLoader.
- */
-template <class _TreeLoader, template<typename> class _KernelLoader>
-class AlgoLoaderThread : public FAlgoLoader<_TreeLoader, _KernelLoader> {
-public:
-
-    // Type definitions, allows them to be reused by other classes
-    using TreeLoader     = _TreeLoader;
-    using KernelLoader   = _KernelLoader<TreeLoader>;
-
-    using FReal          = typename TreeLoader::FReal;
-    using CellClass      = typename TreeLoader::CellClass;
-    using ContainerClass = typename TreeLoader::ContainerClass;
-    using LeafClass      = typename TreeLoader::LeafClass;
-    using OctreeClass    = typename TreeLoader::OctreeClass;
-    using KernelClass    = typename KernelLoader::KernelClass;
-
-    /// FMM algorithm class
-    using FMMClass = FFmmAlgorithmThread<OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass>;
-    
-    /// The tree loader (FTreeLoader) that was used
-    TreeLoader& _treeLoader;
-
-    /// The kernel loader (FKernelLoader) that was used
-    KernelLoader& _kernelLoader;
-
-    unsigned int _omp_chunk_size; ///< Chunk size for OpenMP
-
-    /// The #FMMClass algorithm instance
-    std::unique_ptr<FMMClass> _algo;
-
-    AlgoLoaderThread(FPerfTestParams& params,
-                     TreeLoader& treeLoader,
-                     KernelLoader& kernelLoader) :
-        _treeLoader(treeLoader),
-        _kernelLoader(kernelLoader),
-        _omp_chunk_size(params.omp_chunk_size),
-        _algo(nullptr) {
-        
-    }
-
-    void run() {
-        _algo = std::unique_ptr<FMMClass>(
-            new FMMClass(&(_treeLoader._tree), &(_kernelLoader._kernel)));
-        _algo->setChunkSize(_omp_chunk_size);
-
-        _algo->execute();
-    }
-
-
-    virtual std::string getRunInfoString() const {
-        std::stringstream sstr;
-        sstr << "chunksize:" << _omp_chunk_size << " ";
-        return sstr.str();
-    }
-
-    double getCumulatedTime(FAlgorithmTimers::FTimers timerName) const {
-        return _algo->getCumulatedTime(timerName);
-    }
-
-
-};
-
-#endif
diff --git a/Tests/noDist/PerfTest/AlgoLoaderThreadBalance.hpp b/Tests/noDist/PerfTest/AlgoLoaderThreadBalance.hpp
deleted file mode 100644
index 05cb80de0bccb8d9ae9e26634e5f2a6beecc2eaf..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest/AlgoLoaderThreadBalance.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-#ifndef _ALGOLOADERTHREADBALANCE_HPP_
-#define _ALGOLOADERTHREADBALANCE_HPP_
-
-#include <memory>
-#include <sstream>
-
-#include "PerfTestUtils.hpp"
-
-#include "Core/FFmmAlgorithmThreadBalance.hpp"
-
-/**
- * \brief An algorithm loader for FFmmAlgorithmBalance
- *
- * See FAlgoLoader documentation.
- */
-template <class _TreeLoader, template<typename> class _KernelLoader>
-class AlgoLoaderThreadBalance : public FAlgoLoader<_TreeLoader, _KernelLoader> {
-public:
-    using TreeLoader     = _TreeLoader;
-    using KernelLoader   = _KernelLoader<TreeLoader>;
-
-    using FReal          = typename TreeLoader::FReal;
-    using CellClass      = typename TreeLoader::CellClass;
-    using ContainerClass = typename TreeLoader::ContainerClass;
-    using LeafClass      = typename TreeLoader::LeafClass;
-    using OctreeClass    = typename TreeLoader::OctreeClass;
-    using KernelClass    = typename KernelLoader::KernelClass;
-
-    using FMMClass = FFmmAlgorithmThreadBalance<OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass>;
-    
-    TreeLoader& _treeLoader;
-    KernelLoader& _kernelLoader;
-
-    std::unique_ptr<FMMClass> _algo;
-
-    AlgoLoaderThreadBalance(FPerfTestParams& params,
-                     TreeLoader& treeLoader,
-                     KernelLoader& kernelLoader) :
-        _treeLoader(treeLoader),
-        _kernelLoader(kernelLoader),
-        _algo(nullptr) {
-        
-    }
-
-    void run() {
-        _algo = std::unique_ptr<FMMClass>(
-            new FMMClass(&(_treeLoader._tree), &(_kernelLoader._kernel)));
-
-        _algo->execute();
-    }
-
-
-    virtual std::string getRunInfoString() const {
-        return "";
-    }
-
-    double getCumulatedTime(FAlgorithmTimers::FTimers timerName) const {
-        return _algo->getCumulatedTime(timerName);
-    }
-
-
-};
-
-#endif
diff --git a/Tests/noDist/PerfTest/AlgoLoaderThreadProc.hpp b/Tests/noDist/PerfTest/AlgoLoaderThreadProc.hpp
deleted file mode 100644
index 9798e4733d0e758782cccb57098fc844837ac314..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest/AlgoLoaderThreadProc.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-#ifndef _ALGOLOADERTHREADPROC_HPP_
-#define _ALGOLOADERTHREADPROC_HPP_
-
-#include <memory>
-#include <sstream>
-
-#include "PerfTestUtils.hpp"
-
-#include "Core/FFmmAlgorithmThreadProc.hpp"
-#include "Utils/FMpi.hpp"
-
-/**
- * \brief Algorithm loader for FFmmAlgorithmThread
- *
- * See FAlgoLoader.
- */
-template <class _TreeLoader, template<typename> class _KernelLoader>
-class AlgoLoaderThreadProc : public FAlgoLoader<_TreeLoader, _KernelLoader> {
-public:
-
-    // Type definitions, allows them to be reused by other classes
-    using TreeLoader     = _TreeLoader;
-    using KernelLoader   = _KernelLoader<TreeLoader>;
-
-    using FReal          = typename TreeLoader::FReal;
-    using CellClass      = typename TreeLoader::CellClass;
-    using ContainerClass = typename TreeLoader::ContainerClass;
-    using LeafClass      = typename TreeLoader::LeafClass;
-    using OctreeClass    = typename TreeLoader::OctreeClass;
-    using KernelClass    = typename KernelLoader::KernelClass;
-
-    /// FMM algorithm class
-    using FMMClass = FFmmAlgorithmThreadProc<OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass>;
-    
-    FMpi* _mpiContext;
-
-    /// The tree loader (FTreeLoader) that was used
-    TreeLoader& _treeLoader;
-
-    /// The kernel loader (FKernelLoader) that was used
-    KernelLoader& _kernelLoader;
-
-    /// The #FMMClass algorithm instance
-    std::unique_ptr<FMMClass> _algo;
-    
-    /// Array of MPI gathered cumulated times
-    double timers[FAlgorithmTimers::nbTimers] {0};
-
-
-    AlgoLoaderThreadProc(FPerfTestParams& params,
-                     TreeLoader& treeLoader,
-                     KernelLoader& kernelLoader) :
-        _mpiContext(params.mpiContext),
-        _treeLoader(treeLoader),
-        _kernelLoader(kernelLoader),
-        _algo(nullptr) {
-        
-    }
-    
-
-    void run() {
-        _algo = std::unique_ptr<FMMClass>(
-            new FMMClass(_mpiContext->global(), &(_treeLoader._tree), &(_kernelLoader._kernel)));
-        _algo->execute();
-        
-        for( int idxTimer = 0; idxTimer < FAlgorithmTimers::nbTimers; ++idxTimer ) {
-            timers[idxTimer] = _algo->getCumulatedTime(FAlgorithmTimers::FTimers(idxTimer));
-        }
-
-        if( _mpiContext->global().processId() == 0) {
-            MPI_Reduce(MPI_IN_PLACE, timers, FAlgorithmTimers::nbTimers, MPI_DOUBLE, MPI_MAX, 0, _mpiContext->global().getComm());
-        } else {
-            MPI_Reduce(timers, NULL, FAlgorithmTimers::nbTimers, MPI_DOUBLE, MPI_MAX, 0, _mpiContext->global().getComm());
-        }
-    }
-
-    double getCumulatedTime(FAlgorithmTimers::FTimers timerName) const {
-        return timers[timerName];
-    }
-
-};
-
-#endif
diff --git a/Tests/noDist/PerfTest/KernelLoaderFChebSym.hpp b/Tests/noDist/PerfTest/KernelLoaderFChebSym.hpp
deleted file mode 100644
index 2945c2714e72dc4a91fcad5c19b5a092bcbf5aaf..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest/KernelLoaderFChebSym.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-
-
-#ifndef _KERNELLOADERFCHEBSYM_HPP_
-#define _KERNELLOADERFCHEBSYM_HPP_
-
-#include "PerfTestUtils.hpp"
-
-#include "Kernels/Interpolation/FInterpMatrixKernel.hpp"
-#include "Kernels/Chebyshev/FChebSymKernel.hpp"
-
-#include "BalanceTree/FChebSymCostKernel.hpp"
-
-/**
- * \brief Kernel loader for the symetric Chebyshev kernel.
- *
- * \warning This loader requires that TreeLoader::CellClass inherits from
- * FChebCell.
- *
- * \note This loader also provides the typedef CostKernelClass and a member
- * _costKernel that cam be used by the AlgoLoaderCostZones.
- */
-template <typename _TreeLoader>
-class KernelLoaderFChebSym : public FKernelLoader<_TreeLoader> {
-    // Meaningfull (?) error message.
-    static_assert(
-        std::is_base_of<FChebCell<typename _TreeLoader::FReal,_TreeLoader::ORDER>,
-                        typename _TreeLoader::CellClass>::value,
-        "TreeLoader::CellClass must derive from FChebCell");    
-
-
-public:
-    // Required type definitions
-    using TreeLoader     = _TreeLoader;
-    using FReal          = typename TreeLoader::FReal;
-    /// Must derive from FChebCell
-    using CellClass      = typename TreeLoader::CellClass;
-    using ContainerClass = typename TreeLoader::ContainerClass;
-    using OctreeClass    = typename TreeLoader::OctreeClass;
-
-    using MatrixKernelClass = FInterpMatrixKernelR<FReal>;
-    using KernelClass       = FChebSymKernel <FReal, CellClass, ContainerClass,
-                                              MatrixKernelClass, TreeLoader::ORDER>;
-    /// Kernel class used to compute the tree cell costs.
-    using CostKernelClass = FChebSymCostKernel<FReal, CellClass, ContainerClass, 
-                                               MatrixKernelClass, TreeLoader::ORDER,
-                                               OctreeClass>;
-
-    const FReal epsilon = 1e-4;
-    
-    /// Matrix used to compute the tree cells interactions.
-    const MatrixKernelClass _matrixKernel;
-    /// Kernel used to compute the tree cells interactions.
-    KernelClass _kernel;
-    /// Kernel used to compute the tree cells costs.
-    CostKernelClass _costKernel;
-
-    /// Builds and loads the kernel.
-    /** \param params Parameters from the main invocation, UNSUSED
-     *  \param treeLoader Tree loader that was used.
-     */
-    KernelLoaderFChebSym(FPerfTestParams& /*params*/, TreeLoader& treeLoader) :
-        _matrixKernel(),
-        _kernel(treeLoader._tree.getHeight(),
-                treeLoader._tree.getBoxWidth(),
-                treeLoader._tree.getBoxCenter(),
-                &_matrixKernel),
-        _costKernel(&(treeLoader._tree), epsilon){
-
-    }
-
-
-};
-
-
-#endif
diff --git a/Tests/noDist/PerfTest/PerfTestUtils.hpp b/Tests/noDist/PerfTest/PerfTestUtils.hpp
deleted file mode 100644
index cc4020467774f17d916461405304e8550da88db6..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest/PerfTestUtils.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-#ifndef _PERFTESTUTILS_HPP_
-#define _PERFTESTUTILS_HPP_
-
-#include <string>
-
-#ifdef SCALFMM_USE_MPI
-#include "Utils/FMpi.hpp"
-#endif
-
-#include "Utils/FTic.hpp"
-#include "Files/FFmaGenericLoader.hpp"
-
-#include "Containers/FOctree.hpp"
-
-/**
- * \brief Store the PerfTest program parameters.
- */
-struct FPerfTestParams {
-    int subTreeHeight = 2; ///< Subtree height.
-    int treeHeight = 5;    ///< Tree height.
-    int nbThreads = 1;     ///< Maximum number of threads (when used). 
-    std::string filename = ""; ///< Particles file.
-    std::string algo = "task"; ///< Algorithm to run.
-    int  omp_chunk_size = 0;   ///< OpenMP chunk size for basic algorithm (FFmmAlgorithmThread)
-    int nbProcs = 1;
-#ifdef SCALFMM_USE_MPI
-    FMpi* mpiContext = nullptr;
-#endif
-};
-
-
-/**
- * \brief Base class for tree loaders.
- *
- * This class itself does not provide anything but a base on which to build tree
- * loaders. A tree loader should satisfy the following rules.
- *
- *    - Define the public typedefs : CellClass, ContainerClass, LeafClass,
- *      OctreeClass.
- *    - Provide public acces to a member of type OctreeClass _tree as the tree
- *      that is loaded.
- *    - Tree loading must happen at construction.
- *    - It may provide any other members or typdefs required by a special
- *      FKernelLoader or FAlgoLoader.
- *
- * For convenience, this class provides a timer and a basic loadTree method that
- * should be enough to load a tree from and FMA file.
- *
- * \note It is not mandatory that a loader inherit from this class. It must
- * however follow the aforementioned rules.
- */
-class FTreeLoader {
-public:
-    /// A timer used to time the loadTree method.
-    FTic time;
-protected:
-
-    /**
-     * \brief Load a tree from a file.
-     *
-     * \param loader The file loader to read from the file.
-     * \param tree The tree to be filled.
-     */
-    virtual void loadTree() = 0;
-};
-
-/**
- * \brief Base class for kernel loaders.
- *
- * This class itself does not provide anything but a base on which to build
- * kernel loaders. A kernel loader should satisfy the following rules.
- *
- *    - Define the public typedefs : TreeLoader, KernelClass.
- *    - Provide public acces to a member of type Kernelclass _kernel as the
- *      kernel that is loaded.
- *    - Kernel loading must happen at construction.
- *    - It may provide any other members or typdefs required by a special
- *      FAlgoLoader.
- *
- * For convenience, this class provides a timer.
- *
- * \tparam _TreeLoader The tree loader that was used.
- *
- * \note It is not mandatory that a loader inherit from this class. It must
- * however follow the aforementioned rules.
- */
-template<class _TreeLoader>
-class FKernelLoader {
-    /// The tree loader that was used (see FTreeLoader).
-    using TreeLoader = _TreeLoader;
-public:
-    /// A timer
-    FTic time;
-};
-
-/**
- * \brief Base class for algorithm loaders.
- *
- * This class itself does not provide anything but a base on which to build
- * algorithm loaders. A kernel loader should satisfy the following rules.
- *
- *    - Define the public typedefs : TreeLoader, KernelLoader.
- *    - Provide public acces to a member of type
- *      \link TreeLoader Treeloader::OctreeClass* \endlink` _algo`
- *      as the algorithm that is loaded. This pointer should be valid from the
- *      end of the ::run method to the destruction of the loader.
- *    - It may provide any other members or typdefs.
- *
- * For convenience, this class provides a timer.
- *
- * \tparam _TreeLoader The tree loader that was used.
- * \tparam _KernelLoader The kernel loader *template* that was used, the
- *         KernelLoader type will then be _KernelLoader<_TreeLoader>.
- *
- * \note It is not mandatory that a loader inherit from this class. It must
- * however follow the aforementioned rules.
- */
-template <class _TreeLoader, template<typename> class _KernelLoader>
-class FAlgoLoader {
-    /// The tree loader that was used (see FTreeLoader).
-    using TreeLoader = _TreeLoader;
-    /// The kernel loader that was used (see FKernelLoader).
-    using KernelLoader = _KernelLoader<TreeLoader>;
-public:
-    /// A timer.
-    FTic time;
-
-    /// Method that runs the algorithm.
-    virtual void run() = 0;
-
-    /// Additionnal information for specific algorithm loader.
-    /**  
-     * The string should be formated as a key:value list separated by spaces.
-     * For instance : "key1:value1 key2:value2 ". It may be a good idea to add a
-     * space at the end of the string.
-     */
-    virtual std::string getRunInfoString() const {
-        return "";
-    }
-};
-
-
-
-
-
-#endif
diff --git a/Tests/noDist/PerfTest/TreeLoaderBasic.hpp b/Tests/noDist/PerfTest/TreeLoaderBasic.hpp
deleted file mode 100644
index c130c5d1968f95ec0b0c95e52996b87369af9a10..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest/TreeLoaderBasic.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-#ifndef _TREELOADERBASIC_HPP_
-#define _TREELOADERBASIC_HPP_
-
-#include "PerfTestUtils.hpp"
-
-#include "Containers/FOctree.hpp"
-#include "Components/FSimpleLeaf.hpp"
-#include "Kernels/P2P/FP2PParticleContainerIndexed.hpp"
-
-#include "BalanceTree/FCostCell.hpp"
-
-/**
- * \brief Basic tree loader.
- *
- * See FTreeLoader documentation.
- */
-template <typename _FReal, typename _BaseClass>
-class TreeLoaderBasic : public FTreeLoader {
-public:
-    using FReal     = _FReal;
-    using BaseClass = _BaseClass;
-
-    // Required type definitions.
-    using CellClass          = FCostCell<BaseClass>;
-    using ContainerClass     = FP2PParticleContainerIndexed<FReal>;
-    using LeafClass          = FSimpleLeaf<FReal, ContainerClass >;
-    using OctreeClass        = FOctree<FReal, CellClass, ContainerClass, LeafClass>;
-
-    /// File loader.
-    FFmaGenericLoader<FReal> _loader;
-    /// Required tree member.
-    OctreeClass _tree;
-
-    /// Constructs the loader and loads the tree.
-    TreeLoaderBasic(FPerfTestParams& params):
-        _loader(params.filename),
-        _tree(params.treeHeight,
-              params.subTreeHeight,
-              _loader.getBoxWidth(),
-              _loader.getCenterOfBox()) {
-        this->loadTree();
-    }
-
-    virtual void loadTree() {
-        std::cout << "Creating & inserting particles" << std::flush;
-        
-        time.tic();
-        
-        FPoint<FReal> position;
-        FReal physicalValue = 0.0;
-        for(FSize idxPart = 0 ; idxPart < _loader.getNumberOfParticles() ; ++idxPart) {
-            // Read particle per particle from file
-            _loader.fillParticle(&position,&physicalValue);
-            // put particle in octree
-            _tree.insert(position, idxPart, physicalValue);
-        }
-
-        time.tac();
-        std::cout << " Done  (" << time.elapsed() << " s)." << std::endl;
-    }
-
-};
-
-#endif
diff --git a/Tests/noDist/PerfTest/TreeLoaderFCheb.hpp b/Tests/noDist/PerfTest/TreeLoaderFCheb.hpp
deleted file mode 100644
index 8821d98767f9a271addc6e2776e6e952a2f68bd0..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest/TreeLoaderFCheb.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-#ifndef _TREELOADERFCHEB_HPP_
-#define _TREELOADERFCHEB_HPP_
-
-#include "Kernels/Chebyshev/FChebCell.hpp"
-
-#include "TreeLoaderBasic.hpp"
-
-
-/**
- * \brief Tree loader for a Chebyshev cell type tree.
- *
- * See FTreeLoader and TreeLoaderBasic documentation.
- */
-template <typename _FReal, int _ORDER>
-class TreeLoaderFCheb : public TreeLoaderBasic<_FReal, FChebCell<_FReal, _ORDER> > {
-public:
-
-    enum {ORDER=_ORDER};
-
-    /// Constructs the loader and loads the tree.
-    TreeLoaderFCheb(FPerfTestParams& params):
-        TreeLoaderBasic<_FReal, FChebCell<_FReal, _ORDER>>(params)
-    {}
-
-};
-
-#endif
diff --git a/Tests/noDist/PerfTest/TreeLoaderMpiGeneric.hpp b/Tests/noDist/PerfTest/TreeLoaderMpiGeneric.hpp
deleted file mode 100644
index 4775a9f6d6d0f1be09c6ba2aae71ba8ba4bdcac2..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest/TreeLoaderMpiGeneric.hpp
+++ /dev/null
@@ -1,132 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-#ifndef _TREELOADERMPIGENERIC_HPP_
-#define _TREELOADERMPIGENERIC_HPP_
-
-#include "PerfTestUtils.hpp"
-#include "Utils/FMpi.hpp"
-
-#include "Kernels/P2P/FP2PParticleContainerIndexed.hpp"
-#include "BalanceTree/FCostCell.hpp"
-#include "Components/FSimpleLeaf.hpp"
-#include "Containers/FOctree.hpp"
-
-#include "Files/FFmaGenericLoader.hpp"
-#include "Files/FMpiFmaGenericLoader.hpp"
-#include "Files/FMpiTreeBuilder.hpp"
-
-
-/**
- * \brief Genericted FMA file tree loader.
- *
- * See FTreeLoader documentation.
- */
-template <typename _FReal, class _BaseClass>
-class TreeLoaderMpiGeneric : public FTreeLoader {
-public:
-    using FReal = _FReal;
-
-    // Required type definitions.
-    using BaseClass          = _BaseClass;
-    using CellClass          = FCostCell<BaseClass>;
-    using ContainerClass     = FP2PParticleContainerIndexed<FReal>;
-    using LeafClass          = FSimpleLeaf<FReal, ContainerClass >;
-    using OctreeClass        = FOctree<FReal, CellClass, ContainerClass, LeafClass>;
-
-    /// MPI applcation context.
-    FMpi* _mpiContext;
-    /// File loader.
-    FMpiFmaGenericLoader<FReal> _loader;
-    /// Required tree member.
-    OctreeClass _tree;
-
-    /* Mock particle structure to balance the tree over the processes. */
-    struct TestParticle{
-        FSize index;             // Index of the particle in the original file.
-        FPoint<FReal> position;  // Spatial position of the particle.
-        FReal physicalValue;     // Physical value of the particle.
-        /* Returns the particle position. */
-        const FPoint<FReal>& getPosition(){
-            return position;
-        }
-    };
-
-
-    /// Constructs the loader and loads the tree.
-    TreeLoaderMpiGeneric(FPerfTestParams& params):
-        _mpiContext(params.mpiContext),
-        _loader(params.filename, _mpiContext->global()),
-        _tree(params.treeHeight,
-              params.subTreeHeight,
-              _loader.getBoxWidth(),
-              _loader.getCenterOfBox()) {
-        this->loadTree();
-    }
-
-    void loadTree() {
-        if( 0 == _mpiContext->global().processId())
-            std::cout << "Creating & inserting particles" << std::flush;
-        
-        time.tic();
-
-        // Temporary array of particles read by this process.
-        TestParticle* particles = new TestParticle[_loader.getMyNumberOfParticles()];
-        memset(particles, 0, (sizeof(TestParticle) * _loader.getMyNumberOfParticles()));
-
-        // Index (in file) of the first particle that will be read by this process.
-        FSize idxStart = _loader.getStart();
-
-        // Read particles from parts.
-        for(FSize idxPart = 0 ; idxPart < _loader.getMyNumberOfParticles() ; ++idxPart){
-            // Store the index (in the original file) the particle.
-            particles[idxPart].index = idxPart + idxStart;
-            // Read particle from file
-            _loader.fillParticle(&particles[idxPart].position,
-                                &particles[idxPart].physicalValue);
-        }
-
-        // Final vector of particles
-        FVector<TestParticle> finalParticles;
-        FLeafBalance balancer;
-
-        // Redistribute particules between processes
-        FMpiTreeBuilder< FReal, TestParticle >::
-            DistributeArrayToContainer(_mpiContext->global(),
-                                       particles,
-                                       _loader.getMyNumberOfParticles(),
-                                       _tree.getBoxCenter(),
-                                       _tree.getBoxWidth(),
-                                       _tree.getHeight(),
-                                       &finalParticles,
-                                       &balancer);
-        
-        // Free temporary array memory.
-        delete[] particles;
-
-        // Insert final particles into tree.
-        for(FSize idx = 0 ; idx < finalParticles.getSize(); ++idx){
-            _tree.insert(finalParticles[idx].position,
-                        finalParticles[idx].index,
-                        finalParticles[idx].physicalValue);
-        }
-
-        time.tac();
-        double elapsedTime = time.elapsed(), minTime, maxTime;
-
-        MPI_Reduce(&elapsedTime,&minTime,1,MPI_DOUBLE,MPI_MIN,0,_mpiContext->global().getComm());
-        MPI_Reduce(&elapsedTime,&maxTime,1,MPI_DOUBLE,MPI_MAX,0,_mpiContext->global().getComm());
-
-        if( 0 == _mpiContext->global().processId()) {
-            std::cout << " Done  ( min-time:" << minTime
-                      << " max-time:" << maxTime
-                      << " )"
-                      << std::endl;
-
-        }
-    }
-
-};
-
-#endif
diff --git a/Tests/noDist/PerfTest/TreeLoaderMpiGenericFCheb.hpp b/Tests/noDist/PerfTest/TreeLoaderMpiGenericFCheb.hpp
deleted file mode 100644
index 745126125905d87821f34333998fd59d53fa27ed..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest/TreeLoaderMpiGenericFCheb.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-#ifndef _TREELOADERMPIGENERICFCHEB_HPP_
-#define _TREELOADERMPIGENERICFCHEB_HPP_
-
-#include "Kernels/Chebyshev/FChebCell.hpp"
-
-#include "TreeLoaderMpiGeneric.hpp"
-
-
-/**
- * \brief Tree loader for a Chebyshev cell type tree.
- *
- * See FTreeLoader and TreeLoaderBasic documentation.
- */
-template <typename _FReal, int _ORDER>
-class TreeLoaderMpiGenericFCheb : public TreeLoaderMpiGeneric<_FReal, FChebCell<_FReal, _ORDER> > {
-public:
-
-    enum {ORDER=_ORDER};
-
-    /// Constructs the loader and loads the tree.
-    TreeLoaderMpiGenericFCheb(FPerfTestParams& params):
-        TreeLoaderMpiGeneric<_FReal, FChebCell<_FReal, _ORDER>>(params)
-    {}
-
-};
-
-#endif
diff --git a/Tests/noDist/PerfTest/TreeLoaderMpiSplit.hpp b/Tests/noDist/PerfTest/TreeLoaderMpiSplit.hpp
deleted file mode 100644
index d38697fe3289086bb3ef4c8e371142c9c148aa43..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest/TreeLoaderMpiSplit.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-#ifndef _TREELOADERMPISPLIT_HPP_
-#define _TREELOADERMPISPLIT_HPP_
-
-#include "PerfTestUtils.hpp"
-#include "Utils/FMpi.hpp"
-
-#include "Kernels/P2P/FP2PParticleContainerIndexed.hpp"
-#include "BalanceTree/FCostCell.hpp"
-#include "Components/FSimpleLeaf.hpp"
-#include "Containers/FOctree.hpp"
-
-#include "Files/FMpiSplitFmaLoader.hpp"
-
-
-/**
- * \brief Splitted FMA file tree loader.
- *
- * See FTreeLoader documentation.
- */
-template <typename _FReal, class _BaseClass>
-class TreeLoaderMpiSplit : public FTreeLoader {
-public:
-    using FReal = _FReal;
-
-    // Required type definitions.
-    using BaseClass          = _BaseClass;
-    using CellClass          = FCostCell<BaseClass>;
-    using ContainerClass     = FP2PParticleContainerIndexed<FReal>;
-    using LeafClass          = FSimpleLeaf<FReal, ContainerClass >;
-    using OctreeClass        = FOctree<FReal, CellClass, ContainerClass, LeafClass>;
-
-    /// Mpi application context
-    FMpi* _mpiContext;
-    /// File loader.
-    FMpiSplitFmaLoader<FReal> _loader;
-    /// Required tree member.
-    OctreeClass _tree;
-
-    /// Constructs the loader and loads the tree.
-    TreeLoaderMpiSplit(FPerfTestParams& params):
-        _mpiContext(params.mpiContext),
-        _loader(params.filename,_mpiContext->global().processId()),
-        _tree(params.treeHeight,
-              params.subTreeHeight,
-              _loader.getBoxWidth(),
-              _loader.getCenterOfBox()) {
-        if( nullptr == _mpiContext ) {
-            std::cerr << "No MPI context available" << std::endl;
-            exit(-1);
-        }
-
-        this->loadTree();
-    }
-
-    void loadTree() {
-        std::cout << "Creating & inserting particles" << std::flush;
-        
-        time.tic();
-        
-        FPoint<FReal> position;
-        FReal physicalValue = 0.0;
-        for(FSize idxPart = 0 ; idxPart < _loader.getMyNumberOfParticles() ; ++idxPart) {
-            // Read particle per particle from file
-            _loader.fillParticle(&position,&physicalValue);
-            // put particle in octree
-            _tree.insert(position, idxPart, physicalValue);
-        }
-
-        time.tac();
-        std::cout << " Done  (" << time.elapsed() << " s)." << std::endl;
-    }
-
-};
-
-#endif
diff --git a/Tests/noDist/PerfTest/TreeLoaderMpiSplitFCheb.hpp b/Tests/noDist/PerfTest/TreeLoaderMpiSplitFCheb.hpp
deleted file mode 100644
index 2ede231989c0c12714cacfbbc3fa3705787234f0..0000000000000000000000000000000000000000
--- a/Tests/noDist/PerfTest/TreeLoaderMpiSplitFCheb.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// ==== CMAKE ====
-// Keep in private GIT
-// @SCALFMM_PRIVATE
-
-#ifndef _TREELOADERMPISPLITFCHEB_HPP_
-#define _TREELOADERMPISPLITFCHEB_HPP_
-
-#include "Kernels/Chebyshev/FChebCell.hpp"
-
-#include "TreeLoaderMpiSplit.hpp"
-
-
-/**
- * \brief Tree loader for a Chebyshev cell type tree.
- *
- * See FTreeLoader and TreeLoaderBasic documentation.
- */
-template <typename _FReal, int _ORDER>
-class TreeLoaderMpiSplitFCheb : public TreeLoaderMpiSplit<_FReal, FChebCell<_FReal, _ORDER> > {
-public:
-
-    enum {ORDER=_ORDER};
-
-    /// Constructs the loader and loads the tree.
-    TreeLoaderMpiSplitFCheb(FPerfTestParams& params):
-        TreeLoaderMpiSplit<_FReal, FChebCell<_FReal, _ORDER>>(params)
-    {}
-
-};
-
-#endif
diff --git a/Tests/noDist/testFmmAlgorithmBalanced.cpp b/Tests/noDist/testFmmAlgorithmBalanced.cpp
index 8259ba6ab91fb867b4f1462f52211da2717ccef6..10d84e5cfb67f9729a014343b059333c415d1d77 100644
--- a/Tests/noDist/testFmmAlgorithmBalanced.cpp
+++ b/Tests/noDist/testFmmAlgorithmBalanced.cpp
@@ -17,6 +17,7 @@
 // ==== CMAKE ====
 // Keep in private GIT
 // @SCALFMM_PRIVATE
+// @FUSE_BLAS
 
 
 #include <string>
diff --git a/Tests/noDist/testSphericalDebug.cpp b/Tests/noDist/testSphericalDebug.cpp
index f72ca2a398c3b97901acc8d39e688e5b701a489a..0956b8b69018243e233b3e082f67ed95b0468ff3 100644
--- a/Tests/noDist/testSphericalDebug.cpp
+++ b/Tests/noDist/testSphericalDebug.cpp
@@ -16,7 +16,7 @@
 
 // Keep in private GIT
 // @SCALFMM_PRIVATE
-
+// @FUSE_BLAS
 #define DEBUG_SPHERICAL_M2L
 #define  BLAS_SPHERICAL_COMPRESS
 #define  BLAS_M2L_P
diff --git a/UTests/utestChebyshevDirectTsm.cpp b/UTests/utestChebyshevDirectTsm.cpp
index c0c32fa5b883214dc8e09397bc070f6ae4bbb56f..215a437b756a1f549921c4be7b6cbc57a8297795 100644
--- a/UTests/utestChebyshevDirectTsm.cpp
+++ b/UTests/utestChebyshevDirectTsm.cpp
@@ -69,7 +69,7 @@ class TestChebyshevDirectTsm : public FUTester<TestChebyshevDirectTsm> {
             FPoint<FReal> position;
             loader.fillParticle(&position);
             // put in tree
-            tree.insert(position, FParticleTypeTarget, idxPart, physicalValue);
+            tree.insert(position, FParticleType::FParticleTypeTarget, idxPart, physicalValue);
             // get copy
             particlesTargets[idxPart].setPosition(position);
             *(particlesTargets[idxPart].setPhysicalValue()) = physicalValue;
@@ -84,7 +84,7 @@ class TestChebyshevDirectTsm : public FUTester<TestChebyshevDirectTsm> {
             FPoint<FReal> position;
             loader.fillParticle(&position);
             // put in tree
-            tree.insert(position, FParticleTypeSource, idxPart, physicalValue);
+            tree.insert(position, FParticleType::FParticleTypeSource, idxPart, physicalValue);
             // get copy
             particlesSources[idxPart].setPosition(position);
             *(particlesSources[idxPart].setPhysicalValue()) = physicalValue;
diff --git a/UTests/utestRotationDirectTsm.cpp b/UTests/utestRotationDirectTsm.cpp
index 3ea1f79bedb27a20ce7dd54de70ad497a519ff66..7e1122fdc1cc3bb08d32b13cd6230b2a6fbac143 100644
--- a/UTests/utestRotationDirectTsm.cpp
+++ b/UTests/utestRotationDirectTsm.cpp
@@ -68,7 +68,7 @@ class TestRotationDirectTsm : public FUTester<TestRotationDirectTsm> {
             FPoint<FReal> position;
 			loader.fillParticle(&position);
 			// put in tree
-			tree.insert(position, FParticleTypeTarget, idxPart, physicalValue);
+            tree.insert(position, FParticleType::FParticleTypeTarget, idxPart, physicalValue);
 			// get copy
 			particlesTargets[idxPart].setPosition(position);
 			*(particlesTargets[idxPart].setPhysicalValue()) = physicalValue;
@@ -83,7 +83,7 @@ class TestRotationDirectTsm : public FUTester<TestRotationDirectTsm> {
             FPoint<FReal> position;
 			loader.fillParticle(&position);
 			// put in tree
-			tree.insert(position, FParticleTypeSource, idxPart, physicalValue);
+            tree.insert(position, FParticleType::FParticleTypeSource, idxPart, physicalValue);
 			// get copy
 			particlesSources[idxPart].setPosition(position);
 			*(particlesSources[idxPart].setPhysicalValue()) = physicalValue;
diff --git a/Utils/noDist/FmmAlgorithmTsm.cpp b/Utils/noDist/FmmAlgorithmTsm.cpp
index 3731ce86aaf0c7c2765a065e9218b27c2c5ee524..3579231f9bf1bfacabd62b105f72cbc304282d4e 100644
--- a/Utils/noDist/FmmAlgorithmTsm.cpp
+++ b/Utils/noDist/FmmAlgorithmTsm.cpp
@@ -152,7 +152,7 @@ struct TempMainStruct{
 
 		{
 			// Insert sources
-			FParticleType particleType, source = FParticleTypeSource;
+			FParticleType particleType, source = FParticleType::FParticleTypeSource;
 			for(FSize idxPart = 0 ; idxPart < nbSRC ; ++idxPart){
 				loader.fillParticle(&particlePosition, &particleType);
 //				std::cout << idxPart << "  " << particlePosition << "  type " << particleType
@@ -175,9 +175,9 @@ struct TempMainStruct{
 			//		int nbTargets = 256;
 			for(FSize idxPart = 0 ; idxPart < nbTargets; ++idxPart){
                 particlePosition2.incX(dx);
-                std::cout << idxPart << "  " <<particlePosition2.getX()/dimLeaf<< "   " <<  particlePosition2 << "  type " << FParticleTypeTarget
+                std::cout << idxPart << "  " <<particlePosition2.getX()/dimLeaf<< "   " <<  particlePosition2 << "  type " << static_cast<int>(FParticleType::FParticleTypeTarget)
 						<< "  " <<physicalValue<<std::endl;
-                tree.insert(particlePosition2, FParticleTypeTarget,idxPart,physicalValue );
+                tree.insert(particlePosition2, FParticleType::FParticleTypeTarget,idxPart,physicalValue );
 
 			}
 		}
diff --git a/Utils/python/readGeod.py b/Utils/python/readGeod.py
new file mode 100644
index 0000000000000000000000000000000000000000..496f951b6a2bc5a9fde5ec880105c5a42e001bda
--- /dev/null
+++ b/Utils/python/readGeod.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+"""
+Convert the points of a .geod mesh file into an .fma particle file.
+
+Every mesh point becomes one particle carrying a random value drawn
+uniformly from [-1, 1).
+
+Created on Fri Jun  3 10:28:29 2016
+
+@author: coulaud
+"""
+
+import random
+import numpy
+import math
+
+
+meshFile="Vega_Z09_RR1_sans_fils.geod"
+fmmFile="Vega_Z09_RR1_sans_fils.fma"
+
+# Read the whole mesh before creating the output, so that a missing or
+# malformed input does not leave a truncated .fma file behind.
+with open(meshFile,'r') as Fichier:
+    # The first two lines of the mesh file are skipped.
+    line = Fichier.readline()
+    line = Fichier.readline()
+    # Size line: the first field is read as NT and the second as Npt
+    # (presumably the numbers of triangles and of points).
+    line = Fichier.readline()
+    size = line.rstrip('\n\r').split()
+    print(line)
+    Npt = int(size[1])
+    NT = int(size[0])
+    x = numpy.zeros([Npt,3])
+    for i in range(Npt):
+        line = Fichier.readline()
+        size = line.rstrip('\n\r').split()
+        # Field 0 is ignored (presumably the point index); 1..3 are x, y, z.
+        x[i,0] = float(size[1])
+        x[i,1] = float(size[2])
+        x[i,2] = float(size[3])
+
+# Axis-aligned bounding box of the points.
+a = numpy.amin(x,axis=0)
+b = numpy.amax(x,axis=0)
+print(a)
+print(b)
+# Largest extent of the bounding box, rounded up to an integer.
+length = math.ceil(max(b-a))
+centre= (a+b)/2
+print('Centre: ',centre)
+print('length: ',length,max(b-a))
+
+with open(fmmFile,'w') as ptfile:
+    # NOTE(review): "8  4" is presumably the .fma header (bytes per real,
+    # values per particle) -- confirm against the loader.
+    ptfile.write("8  4 \n")
+    # Particle count, half of the box length, then the box centre.
+    ptfile.write(str(Npt)+'  ' + str((length)/2) + '  '+ str(centre[0])
+        + '  '+ str(centre[1])+ '  '+ str(centre[2])  +"\n"  )
+    for i in range(Npt):
+        rho = 2*random.random()-1
+        str1 = str(x[i,0])+ '  ' + str(x[i,1])+'  ' +str(x[i,2])+ '  '  + str(rho) +"\n"
+        ptfile.write(str1)