diff --git a/Addons/BenchEfficiency/bordeaux_0116.pdf b/Addons/BenchEfficiency/bordeaux_0116.pdf deleted file mode 100644 index c7785df796322f7b6779f10deff6bef7ef3bb436..0000000000000000000000000000000000000000 Binary files a/Addons/BenchEfficiency/bordeaux_0116.pdf and /dev/null differ diff --git a/Addons/BenchEfficiency/execAllHomogeneous.sh b/Addons/BenchEfficiency/execAllHomogeneous.sh index 483add05b41f306b9bc971e6f943703284e1d028..a4d5899711d3699d8ae5d1d9d223bf67aacf659b 100644 --- a/Addons/BenchEfficiency/execAllHomogeneous.sh +++ b/Addons/BenchEfficiency/execAllHomogeneous.sh @@ -13,14 +13,14 @@ cpu=1 STARPU_NCPUS=$cpu STARPU_NCUDA=0 -logoutput=`./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_SEQ` +logoutput=`./Tests/Release/testBlockedUniformBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_SEQ` if [[ $VERBOSE ]] ; then echo $logoutput fi -$SCALFMM_STARPU_DIR/bin/starpu_fxt_tool -i "/tmp/prof_file_"$USER"_0" +$TUTORIAL_STARPU_DIR/bin/starpu_fxt_tool -i "/tmp/prof_file_"$USER"_0" rec_name="$SCALFMM_RES_DIR/trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_BS_CPU_SEQ-CPU_$cpu.rec" mv trace.rec $rec_name -python $SCALFMM_STARPU_DIR/bin/starpu_trace_state_stats.py -t $rec_name > $rec_name.time +python $TUTORIAL_STARPU_DIR/bin/starpu_trace_state_stats.py -t $rec_name > $rec_name.time for (( cpu=1 ; cpu<=$SCALFMM_MAX_NB_CPU ; cpu++)) ; do @@ -29,13 +29,13 @@ for (( cpu=1 ; cpu<=$SCALFMM_MAX_NB_CPU ; cpu++)) ; do STARPU_NCPUS=$cpu STARPU_NCUDA=0 - logoutput=`./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR` + logoutput=`./Tests/Release/testBlockedUniformBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR` if [[ $VERBOSE ]] ; then echo $logoutput fi - $SCALFMM_STARPU_DIR/bin/starpu_fxt_tool -i "/tmp/prof_file_"$USER"_0" + $TUTORIAL_STARPU_DIR/bin/starpu_fxt_tool -i "/tmp/prof_file_"$USER"_0" 
rec_name="$SCALFMM_RES_DIR/trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_BS_CPU_PAR-CPU_$cpu.rec" mv trace.rec $rec_name - python $SCALFMM_STARPU_DIR/bin/starpu_trace_state_stats.py -t $rec_name > $rec_name.time + python $TUTORIAL_STARPU_DIR/bin/starpu_trace_state_stats.py -t $rec_name > $rec_name.time done diff --git a/Addons/BenchEfficiency/global-eff.data b/Addons/BenchEfficiency/global-eff.data deleted file mode 100644 index b6c8743f9af5b92d32687ae620f4b810a3bdd682..0000000000000000000000000000000000000000 --- a/Addons/BenchEfficiency/global-eff.data +++ /dev/null @@ -1,25 +0,0 @@ -0 granularity-eff tasks-eff runtime-eff pipeline-eff -1 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 -2 9.588832e-01 9.588832e-01 9.972215e-01 9.999844e-01 -3 9.984195e-01 9.984195e-01 9.992539e-01 9.999840e-01 -4 9.936055e-01 9.936055e-01 9.992505e-01 9.999843e-01 -5 9.859209e-01 9.859209e-01 9.991938e-01 9.999840e-01 -6 9.913540e-01 9.913540e-01 9.992224e-01 9.999848e-01 -7 9.980442e-01 9.980442e-01 9.993216e-01 9.999841e-01 -8 9.932070e-01 9.932070e-01 9.993356e-01 9.999844e-01 -9 9.953908e-01 9.953908e-01 9.993136e-01 9.999852e-01 -10 9.930517e-01 9.930517e-01 9.991280e-01 9.999848e-01 -11 9.937148e-01 9.937148e-01 9.992802e-01 9.999838e-01 -12 9.895039e-01 9.895039e-01 9.992958e-01 9.999842e-01 -13 9.934571e-01 9.934571e-01 9.992770e-01 9.999845e-01 -14 9.939346e-01 9.939346e-01 9.993242e-01 9.999845e-01 -15 9.929928e-01 9.929928e-01 9.993077e-01 9.999849e-01 -16 9.946804e-01 9.946804e-01 9.993051e-01 9.999838e-01 -17 9.959137e-01 9.959137e-01 9.992893e-01 9.999839e-01 -18 9.652375e-01 9.652375e-01 9.961152e-01 9.999832e-01 -19 9.937258e-01 9.937258e-01 9.992987e-01 9.999845e-01 -20 9.949256e-01 9.949256e-01 9.992757e-01 9.999843e-01 -21 9.886613e-01 9.886613e-01 9.992616e-01 9.999838e-01 -22 9.921982e-01 9.921982e-01 9.992499e-01 9.999842e-01 -23 1.001717e+00 1.001717e+00 9.992881e-01 9.999846e-01 -24 9.957642e-01 9.957642e-01 9.992461e-01 9.999836e-01 diff --git 
a/Addons/BenchEfficiency/global-eff.png b/Addons/BenchEfficiency/global-eff.png deleted file mode 100644 index fc24c14787e6bbc9276edf022a5f81b405a91268..0000000000000000000000000000000000000000 Binary files a/Addons/BenchEfficiency/global-eff.png and /dev/null differ diff --git a/Addons/BenchEfficiency/mergetimefile.cpp b/Addons/BenchEfficiency/mergetimefile.cpp index 8e781fea5cb538651c7c2fd5e89cd5ede55edaa4..6dcdc96f733833ebb39d516c18224030cfe9c570 100644 --- a/Addons/BenchEfficiency/mergetimefile.cpp +++ b/Addons/BenchEfficiency/mergetimefile.cpp @@ -68,7 +68,7 @@ struct LineData{ } } if(words.size() != 4){ - printf("Error line is no composed of 4 words\n"); + printf("Error line is no composed of 4 words, has %lu for %s\n", words.size(), line); exit(111); } name = ReduceName(words[0].substr(1, words[0].size() - 2)); @@ -186,30 +186,32 @@ int main(int argc, char** argv){ } while((sizeLine = getline((char**)&line, &sizeLine, timeFile)) != -1){ - LineData dt(line); - // Task, Runtime, Other - if(dt.type == "Task"){ - if(dt.name != "execute_on_all_wrapper"){ - timeTasks[idxFile][dt.name] += dt.duration; - allTaskNames.insert(dt.name); - times[idxFile].tt += dt.duration; + if(strncmp(line, "WARNING", 7) != 0){ + LineData dt(line); + // Task, Runtime, Other + if(dt.type == "Task"){ + if(dt.name != "execute_on_all_wrapper"){ + timeTasks[idxFile][dt.name] += dt.duration; + allTaskNames.insert(dt.name); + times[idxFile].tt += dt.duration; + } } - } - else if(dt.type == "Runtime"){ - if(dt.name == "Scheduling" - || dt.name == "FetchingInput" - || dt.name == "PushingOutput"){ - times[idxFile].tr += dt.duration; + else if(dt.type == "Runtime"){ + if(dt.name == "Scheduling" + || dt.name == "FetchingInput" + || dt.name == "PushingOutput"){ + times[idxFile].tr += dt.duration; + } } - } - else if(dt.type == "Other"){ - if(dt.name == "Idle"){ - times[idxFile].ti += dt.duration; + else if(dt.type == "Other"){ + if(dt.name == "Idle"){ + times[idxFile].ti += dt.duration; + } + } + 
else { + printf("Arg do not know type %s\n", dt.type.c_str()); + //return 3; } - } - else { - printf("Arg do not know type %s\n", dt.type.c_str()); - return 3; } } diff --git a/Addons/BenchEfficiency/par-bs-search.png b/Addons/BenchEfficiency/par-bs-search.png deleted file mode 100644 index 4243937f718415f59047a70a48cb51391dcd223e..0000000000000000000000000000000000000000 Binary files a/Addons/BenchEfficiency/par-bs-search.png and /dev/null differ diff --git a/Addons/BenchEfficiency/scalfmm.html b/Addons/BenchEfficiency/scalfmm.html deleted file mode 100644 index 3ac064e14379adf390033d3a6632416598dcf5b0..0000000000000000000000000000000000000000 --- a/Addons/BenchEfficiency/scalfmm.html +++ /dev/null @@ -1,594 +0,0 @@ -<!DOCTYPE html> -<html> -<head> - <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> - <title>scalfmm.html</title> - <meta name="generator" content="Haroopad 0.13.1" /> - <meta name="viewport" content="width=device-width, initial-scale=1.0"> - - <style>div.oembedall-githubrepos{border:1px solid #DDD;border-radius:4px;list-style-type:none;margin:0 0 10px;padding:8px 10px 0;font:13.34px/1.4 helvetica,arial,freesans,clean,sans-serif;width:452px;background-color:#fff}div.oembedall-githubrepos .oembedall-body{background:-moz-linear-gradient(center top,#FAFAFA,#EFEFEF);background:-webkit-gradient(linear,left top,left bottom,from(#FAFAFA),to(#EFEFEF));border-bottom-left-radius:4px;border-bottom-right-radius:4px;border-top:1px solid #EEE;margin-left:-10px;margin-top:8px;padding:5px 10px;width:100%}div.oembedall-githubrepos h3{font-size:14px;margin:0;padding-left:18px;white-space:nowrap}div.oembedall-githubrepos p.oembedall-description{color:#444;font-size:12px;margin:0 0 3px}div.oembedall-githubrepos p.oembedall-updated-at{color:#888;font-size:11px;margin:0}div.oembedall-githubrepos ul.oembedall-repo-stats{border:none;float:right;font-size:11px;font-weight:700;padding-left:15px;position:relative;z-index:5;margin:0}div.oembedall-githubrepos 
ul.oembedall-repo-stats li{border:none;color:#666;display:inline-block;list-style-type:none;margin:0!important}div.oembedall-githubrepos ul.oembedall-repo-stats li a{background-color:transparent;border:none;color:#666!important;background-position:5px -2px;background-repeat:no-repeat;border-left:1px solid #DDD;display:inline-block;height:21px;line-height:21px;padding:0 5px 0 23px}div.oembedall-githubrepos ul.oembedall-repo-stats li:first-child a{border-left:medium none;margin-right:-3px}div.oembedall-githubrepos ul.oembedall-repo-stats li a:hover{background:5px -27px no-repeat #4183C4;color:#FFF!important;text-decoration:none}div.oembedall-githubrepos ul.oembedall-repo-stats li:first-child a:hover{border-bottom-left-radius:3px;border-top-left-radius:3px}ul.oembedall-repo-stats li:last-child a:hover{border-bottom-right-radius:3px;border-top-right-radius:3px}span.oembedall-closehide{background-color:#aaa;border-radius:2px;cursor:pointer;margin-right:3px}div.oembedall-container{margin-top:5px;text-align:left}.oembedall-ljuser{font-weight:700}.oembedall-ljuser img{vertical-align:bottom;border:0;padding-right:1px}.oembedall-stoqembed{border-bottom:1px dotted #999;float:left;overflow:hidden;width:730px;line-height:1;background:#FFF;color:#000;font-family:Arial,Liberation Sans,DejaVu Sans,sans-serif;font-size:80%;text-align:left;margin:0;padding:0}.oembedall-stoqembed a{color:#07C;text-decoration:none;margin:0;padding:0}.oembedall-stoqembed a:hover{text-decoration:underline}.oembedall-stoqembed a:visited{color:#4A6B82}.oembedall-stoqembed h3{font-family:Trebuchet MS,Liberation Sans,DejaVu Sans,sans-serif;font-size:130%;font-weight:700;margin:0;padding:0}.oembedall-stoqembed .oembedall-reputation-score{color:#444;font-size:120%;font-weight:700;margin-right:2px}.oembedall-stoqembed .oembedall-user-info{height:35px;width:185px}.oembedall-stoqembed .oembedall-user-info .oembedall-user-gravatar32{float:left;height:32px;width:32px}.oembedall-stoqembed .oembedall-user-info 
.oembedall-user-details{float:left;margin-left:5px;overflow:hidden;white-space:nowrap;width:145px}.oembedall-stoqembed .oembedall-question-hyperlink{font-weight:700}.oembedall-stoqembed .oembedall-stats{background:#EEE;margin:0 0 0 7px;padding:4px 7px 6px;width:58px}.oembedall-stoqembed .oembedall-statscontainer{float:left;margin-right:8px;width:86px}.oembedall-stoqembed .oembedall-votes{color:#555;padding:0 0 7px;text-align:center}.oembedall-stoqembed .oembedall-vote-count-post{font-size:240%;color:#808185;display:block;font-weight:700}.oembedall-stoqembed .oembedall-views{color:#999;padding-top:4px;text-align:center}.oembedall-stoqembed .oembedall-status{margin-top:-3px;padding:4px 0;text-align:center;background:#75845C;color:#FFF}.oembedall-stoqembed .oembedall-status strong{color:#FFF;display:block;font-size:140%}.oembedall-stoqembed .oembedall-summary{float:left;width:635px}.oembedall-stoqembed .oembedall-excerpt{line-height:1.2;margin:0;padding:0 0 5px}.oembedall-stoqembed .oembedall-tags{float:left;line-height:18px}.oembedall-stoqembed .oembedall-tags a:hover{text-decoration:none}.oembedall-stoqembed .oembedall-post-tag{background-color:#E0EAF1;border-bottom:1px solid #3E6D8E;border-right:1px solid #7F9FB6;color:#3E6D8E;font-size:90%;line-height:2.4;margin:2px 2px 2px 0;padding:3px 4px;text-decoration:none;white-space:nowrap}.oembedall-stoqembed .oembedall-post-tag:hover{background-color:#3E6D8E;border-bottom:1px solid #37607D;border-right:1px solid #37607D;color:#E0EAF1}.oembedall-stoqembed .oembedall-fr{float:right}.oembedall-stoqembed .oembedall-statsarrow{background-image:url(http://cdn.sstatic.net/stackoverflow/img/sprites.png?v=3);background-repeat:no-repeat;overflow:hidden;background-position:0 -435px;float:right;height:13px;margin-top:12px;width:7px}.oembedall-facebook1{border:1px solid #1A3C6C;padding:0;font:13.34px/1.4 verdana;width:500px}.oembedall-facebook2{background-color:#627add}.oembedall-facebook2 
a{color:#e8e8e8;text-decoration:none}.oembedall-facebookBody{background-color:#fff;vertical-align:top;padding:5px}.oembedall-facebookBody .contents{display:inline-block;width:100%}.oembedall-facebookBody div img{float:left;margin-right:5px}div.oembedall-lanyard{-webkit-box-shadow:none;-webkit-transition-delay:0s;-webkit-transition-duration:.4000000059604645s;-webkit-transition-property:width;-webkit-transition-timing-function:cubic-bezier(0.42,0,.58,1);background-attachment:scroll;background-clip:border-box;background-color:transparent;background-image:none;background-origin:padding-box;border-width:0;box-shadow:none;color:#112644;display:block;float:left;font-family:'Trebuchet MS',Trebuchet,sans-serif;font-size:16px;height:253px;line-height:19px;margin:0;max-width:none;min-height:0;outline:#112644 0;overflow-x:visible;overflow-y:visible;padding:0;position:relative;text-align:left;vertical-align:baseline;width:804px}div.oembedall-lanyard .tagline{font-size:1.5em}div.oembedall-lanyard .wrapper{overflow:hidden;clear:both}div.oembedall-lanyard .split{float:left;display:inline}div.oembedall-lanyard .prominent-place .flag:active,div.oembedall-lanyard .prominent-place .flag:focus,div.oembedall-lanyard .prominent-place .flag:hover,div.oembedall-lanyard .prominent-place .flag:link,div.oembedall-lanyard .prominent-place .flag:visited{float:left;display:block;width:48px;height:48px;position:relative;top:-5px;margin-right:10px}div.oembedall-lanyard .place-context{font-size:.889em}div.oembedall-lanyard .prominent-place .sub-place{display:block}div.oembedall-lanyard .prominent-place{font-size:1.125em;line-height:1.1em;font-weight:400}div.oembedall-lanyard .main-date{color:#8CB4E0;font-weight:700;line-height:1.1}div.oembedall-lanyard .first{width:48.57%;margin:0 0 0 2.857%}.mermaid .label{color:#333}.node circle,.node polygon,.node rect{fill:#cde498;stroke:#13540c;stroke-width:1px}.edgePath .path{stroke:green;stroke-width:1.5px}.cluster 
rect{fill:#cdffb2;rx:40;stroke:#6eaa49;stroke-width:1px}.cluster text{fill:#333}.actor{stroke:#13540c;fill:#cde498}text.actor{fill:#000;stroke:none}.actor-line{stroke:grey}.messageLine0{stroke-width:1.5;stroke-dasharray:"2 2";marker-end:"url(#arrowhead)";stroke:#333}.messageLine1{stroke-width:1.5;stroke-dasharray:"2 2";stroke:#333}#arrowhead{fill:#333}#crosshead path{fill:#333!important;stroke:#333!important}.messageText{fill:#333;stroke:none}.labelBox{stroke:#326932;fill:#cde498}.labelText,.loopText{fill:#000;stroke:none}.loopLine{stroke-width:2;stroke-dasharray:"2 2";marker-end:"url(#arrowhead)";stroke:#326932}.note{stroke:#6eaa49;fill:#fff5ad}.noteText{fill:#000;stroke:none;font-family:'trebuchet ms',verdana,arial;font-size:14px}.section{stroke:none;opacity:.2}.section0,.section2{fill:#6eaa49}.section1,.section3{fill:#fff;opacity:.2}.sectionTitle0,.sectionTitle1,.sectionTitle2,.sectionTitle3{fill:#333}.sectionTitle{text-anchor:start;font-size:11px;text-height:14px}.grid .tick{stroke:lightgrey;opacity:.3;shape-rendering:crispEdges}.grid 
path{stroke-width:0}.today{fill:none;stroke:red;stroke-width:2px}.task{stroke-width:2}.taskText{text-anchor:middle;font-size:11px}.taskTextOutsideRight{fill:#000;text-anchor:start;font-size:11px}.taskTextOutsideLeft{fill:#000;text-anchor:end;font-size:11px}.taskText0,.taskText1,.taskText2,.taskText3{fill:#fff}.task0,.task1,.task2,.task3{fill:#487e3a;stroke:#13540c}.taskTextOutside0,.taskTextOutside1,.taskTextOutside2,.taskTextOutside3{fill:#000}.active0,.active1,.active2,.active3{fill:#cde498;stroke:#13540c}.activeText0,.activeText1,.activeText2,.activeText3{fill:#000!important}.done0,.done1,.done2,.done3{stroke:grey;fill:lightgrey;stroke-width:2}.doneText0,.doneText1,.doneText2,.doneText3{fill:#000!important}.crit0,.crit1,.crit2,.crit3{stroke:#f88;fill:red;stroke-width:2}.activeCrit0,.activeCrit1,.activeCrit2,.activeCrit3{stroke:#f88;fill:#cde498;stroke-width:2}.doneCrit0,.doneCrit1,.doneCrit2,.doneCrit3{stroke:#f88;fill:lightgrey;stroke-width:2;cursor:pointer;shape-rendering:crispEdges}.activeCritText0,.activeCritText1,.activeCritText2,.activeCritText3,.doneCritText0,.doneCritText1,.doneCritText2,.doneCritText3{fill:#000!important}.titleText{text-anchor:middle;font-size:18px;fill:#000}text{font-family:'trebuchet ms',verdana,arial;font-size:14px}html{height:100%}body{margin:0!important;padding:5px 20px 26px!important;background-color:#fff;font-family:"Lucida Grande","Segoe UI","Apple SD Gothic Neo","Malgun Gothic","Lucida Sans 
Unicode",Helvetica,Arial,sans-serif;font-size:.9em;overflow-x:hidden;overflow-y:auto}br,h1,h2,h3,h4,h5,h6{clear:both}hr.page{background:url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAYAAAAECAYAAACtBE5DAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAAyJpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuMC1jMDYwIDYxLjEzNDc3NywgMjAxMC8wMi8xMi0xNzozMjowMCAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIiB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIgeG1sbnM6c3RSZWY9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9zVHlwZS9SZXNvdXJjZVJlZiMiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENTNSBNYWNpbnRvc2giIHhtcE1NOkluc3RhbmNlSUQ9InhtcC5paWQ6OENDRjNBN0E2NTZBMTFFMEI3QjRBODM4NzJDMjlGNDgiIHhtcE1NOkRvY3VtZW50SUQ9InhtcC5kaWQ6OENDRjNBN0I2NTZBMTFFMEI3QjRBODM4NzJDMjlGNDgiPiA8eG1wTU06RGVyaXZlZEZyb20gc3RSZWY6aW5zdGFuY2VJRD0ieG1wLmlpZDo4Q0NGM0E3ODY1NkExMUUwQjdCNEE4Mzg3MkMyOUY0OCIgc3RSZWY6ZG9jdW1lbnRJRD0ieG1wLmRpZDo4Q0NGM0E3OTY1NkExMUUwQjdCNEE4Mzg3MkMyOUY0OCIvPiA8L3JkZjpEZXNjcmlwdGlvbj4gPC9yZGY6UkRGPiA8L3g6eG1wbWV0YT4gPD94cGFja2V0IGVuZD0iciI/PqqezsUAAAAfSURBVHjaYmRABcYwBiM2QSA4y4hNEKYDQxAEAAIMAHNGAzhkPOlYAAAAAElFTkSuQmCC) repeat-x;border:0;height:3px;padding:0}hr.underscore{border-top-style:dashed!important}body >:first-child{margin-top:0!important}img.plugin{box-shadow:0 1px 3px rgba(0,0,0,.1);border-radius:3px}iframe{border:0}figure{-webkit-margin-before:0;-webkit-margin-after:0;-webkit-margin-start:0;-webkit-margin-end:0}kbd{border:1px solid #aaa;-moz-border-radius:2px;-webkit-border-radius:2px;border-radius:2px;-moz-box-shadow:1px 2px 2px #ddd;-webkit-box-shadow:1px 2px 2px #ddd;box-shadow:1px 2px 2px 
#ddd;background-color:#f9f9f9;background-image:-moz-linear-gradient(top,#eee,#f9f9f9,#eee);background-image:-o-linear-gradient(top,#eee,#f9f9f9,#eee);background-image:-webkit-linear-gradient(top,#eee,#f9f9f9,#eee);background-image:linear-gradient(top,#eee,#f9f9f9,#eee);padding:1px 3px;font-family:inherit;font-size:.85em}.oembeded .oembed_photo{display:inline-block}img[data-echo]{margin:25px 0;width:100px;height:100px;background:url(../img/ajax.gif) center center no-repeat #fff}.spinner{display:inline-block;width:10px;height:10px;margin-bottom:-.1em;border:2px solid rgba(0,0,0,.5);border-top-color:transparent;border-radius:100%;-webkit-animation:spin 1s infinite linear;animation:spin 1s infinite linear}.spinner:after{content:'';display:block;width:0;height:0;position:absolute;top:-6px;left:0;border:4px solid transparent;border-bottom-color:rgba(0,0,0,.5);-webkit-transform:rotate(45deg);transform:rotate(45deg)}@-webkit-keyframes spin{to{-webkit-transform:rotate(360deg)}}@keyframes spin{to{transform:rotate(360deg)}}p.toc{margin:0!important}p.toc ul{padding-left:10px}p.toc>ul{padding:10px;margin:0 10px;display:inline-block;border:1px solid #ededed;border-radius:5px}p.toc li,p.toc ul{list-style-type:none}p.toc li{width:100%;padding:0;overflow:hidden}p.toc li a::after{content:"."}p.toc li a:before{content:"• "}p.toc h5{text-transform:uppercase}p.toc .title{float:left;padding-right:3px}p.toc .number{margin:0;float:right;padding-left:3px;background:#fff;display:none}input.task-list-item{margin-left:-1.62em}.markdown{font-family:"Hiragino Sans GB","Microsoft YaHei",STHeiti,SimSun,"Lucida Grande","Lucida Sans Unicode","Lucida Sans",'Segoe UI',AppleSDGothicNeo-Medium,'Malgun Gothic',Verdana,Tahoma,sans-serif;padding:20px}.markdown a{text-decoration:none;vertical-align:baseline}.markdown a:hover{text-decoration:underline}.markdown h1{font-size:2.2em;font-weight:700;margin:1.5em 0 1em}.markdown h2{font-size:1.8em;font-weight:700;margin:1.275em 0 .85em}.markdown 
h3{font-size:1.6em;font-weight:700;margin:1.125em 0 .75em}.markdown h4{font-size:1.4em;font-weight:700;margin:.99em 0 .66em}.markdown h5{font-size:1.2em;font-weight:700;margin:.855em 0 .57em}.markdown h6{font-size:1em;font-weight:700;margin:.75em 0 .5em}.markdown h1+p,.markdown h1:first-child,.markdown h2+p,.markdown h2:first-child,.markdown h3+p,.markdown h3:first-child,.markdown h4+p,.markdown h4:first-child,.markdown h5+p,.markdown h5:first-child,.markdown h6+p,.markdown h6:first-child{margin-top:0}.markdown hr{border:1px solid #ccc}.markdown p{margin:1em 0;word-wrap:break-word}.markdown ol{list-style-type:decimal}.markdown li{display:list-item;line-height:1.4em}.markdown blockquote{margin:1em 20px}.markdown blockquote>:first-child{margin-top:0}.markdown blockquote>:last-child{margin-bottom:0}.markdown blockquote cite:before{content:'\2014 \00A0'}.markdown .code{border-radius:3px;word-wrap:break-word}.markdown pre{border-radius:3px;word-wrap:break-word;border:1px solid #ccc;overflow:auto;padding:.5em}.markdown pre code{border:0;display:block}.markdown pre>code{font-family:Consolas,Inconsolata,Courier,monospace;font-weight:700;white-space:pre;margin:0}.markdown code{border-radius:3px;word-wrap:break-word;border:1px solid #ccc;padding:0 5px;margin:0 2px}.markdown img{max-width:100%}.markdown mark{color:#000;background-color:#fcf8e3}.markdown table{padding:0;border-collapse:collapse;border-spacing:0;margin-bottom:16px}.markdown table tr td,.markdown table tr th{border:1px solid #ccc;margin:0;padding:6px 13px}.markdown table tr th{font-weight:700}.markdown table tr th>:first-child{margin-top:0}.markdown table tr th>:last-child{margin-bottom:0}.markdown table tr td>:first-child{margin-top:0}.markdown table tr td>:last-child{margin-bottom:0}@import url(http://fonts.googleapis.com/css?family=Roboto+Condensed:300italic,400italic,700italic,400,300,700);.haroopad{padding:20px;color:#222;font-size:15px;font-family:"Roboto Condensed",Tauri,"Hiragino Sans GB","Microsoft 
YaHei",STHeiti,SimSun,"Lucida Grande","Lucida Sans Unicode","Lucida Sans",'Segoe UI',AppleSDGothicNeo-Medium,'Malgun Gothic',Verdana,Tahoma,sans-serif;background:#fff;line-height:1.6;-webkit-font-smoothing:antialiased}.haroopad a{color:#3269a0}.haroopad a:hover{color:#4183c4}.haroopad h2{border-bottom:1px solid #e6e6e6}.haroopad h6{color:#777}.haroopad hr{border:1px solid #e6e6e6}.haroopad blockquote>code,.haroopad h1>code,.haroopad h2>code,.haroopad h3>code,.haroopad h4>code,.haroopad h5>code,.haroopad h6>code,.haroopad li>code,.haroopad p>code,.haroopad td>code{font-family:Consolas,"Liberation Mono",Menlo,Courier,monospace;font-size:85%;background-color:rgba(0,0,0,.02);padding:.2em .5em;border:1px solid #efefef}.haroopad pre>code{font-size:1em;letter-spacing:-1px;font-weight:700}.haroopad blockquote{border-left:4px solid #e6e6e6;padding:0 15px;color:#777}.haroopad table{background-color:#fafafa}.haroopad table tr td,.haroopad table tr th{border:1px solid #e6e6e6}.haroopad table tr:nth-child(2n){background-color:#f2f2f2}.hljs{display:block;overflow-x:auto;padding:.5em;background:#282b2e;-webkit-text-size-adjust:none}.css .hljs-id,.hljs-change,.hljs-flow,.hljs-keyword,.hljs-literal,.hljs-winutils,.nginx .hljs-title,.tex .hljs-special{color:#93c763}.hljs-number{color:#ffcd22}.hljs{color:#e0e2e4}.css .hljs-pseudo,.css .hljs-tag{color:#d0d2b5}.hljs .hljs-constant,.hljs-attribute{color:#668bb0}.xml .hljs-attribute{color:#b3b689}.xml .hljs-tag .hljs-value{color:#e8e2b7}.hljs-class .hljs-title,.hljs-code,.hljs-header{color:#fff}.hljs-class,.hljs-hexcolor{color:#93c763}.hljs-regexp{color:#d39745}.hljs-at_rule,.hljs-at_rule .hljs-keyword{color:#a082bd}.hljs-doctype{color:#557182}.apache .hljs-cbracket,.apache .hljs-tag,.django .hljs-filter .hljs-argument,.django .hljs-template_tag,.django 
.hljs-variable,.hljs-addition,.hljs-attr_selector,.hljs-built_in,.hljs-bullet,.hljs-emphasis,.hljs-envvar,.hljs-javadoc,.hljs-link_url,.hljs-pragma,.hljs-preprocessor,.hljs-prompt,.hljs-pseudo,.hljs-stream,.hljs-subst,.hljs-tag,.hljs-tag .hljs-title,.hljs-type,.ruby .hljs-class .hljs-parent,.smalltalk .hljs-array,.smalltalk .hljs-class,.smalltalk .hljs-localvars,.tex .hljs-command{color:#8cbbad}.hljs-string{color:#ec7600}.apache .hljs-sqbracket,.hljs-annotation,.hljs-blockquote,.hljs-comment,.hljs-decorator,.hljs-deletion,.hljs-horizontal_rule,.hljs-pi,.hljs-shebang,.tex .hljs-formula{color:#818e96}.apache .hljs-tag,.bash .hljs-variable,.css .hljs-id,.diff .hljs-header,.hljs-at_rule .hljs-keyword,.hljs-chunk,.hljs-dartdoc,.hljs-header,.hljs-keyword,.hljs-literal,.hljs-phpdoc,.hljs-request,.hljs-status,.hljs-title,.hljs-type,.hljs-winutils,.rsl .hljs-built_in,.smalltalk .hljs-class,.tex .hljs-special,.vbscript .hljs-built_in{font-weight:700}.coffeescript .javascript,.javascript .xml,.tex .hljs-formula,.xml .css,.xml .hljs-cdata,.xml .javascript,.xml .vbscript{opacity:.5}.MathJax_Hover_Frame{border-radius:.25em;-webkit-border-radius:.25em;-moz-border-radius:.25em;-khtml-border-radius:.25em;box-shadow:0 0 15px #83A;-webkit-box-shadow:0 0 15px #83A;-moz-box-shadow:0 0 15px #83A;-khtml-box-shadow:0 0 15px #83A;border:1px solid #A6D!important;display:inline-block;position:absolute}.MathJax_Hover_Arrow{position:absolute;width:15px;height:11px;cursor:pointer}#MathJax_About{position:fixed;left:50%;width:auto;text-align:center;border:3px outset;padding:1em 2em;background-color:#DDD;color:#000;cursor:default;font-family:message-box;font-size:120%;font-style:normal;text-indent:0;text-transform:none;line-height:normal;letter-spacing:normal;word-spacing:normal;word-wrap:normal;white-space:nowrap;float:none;z-index:201;border-radius:15px;-webkit-border-radius:15px;-moz-border-radius:15px;-khtml-border-radius:15px;box-shadow:0 10px 20px gray;-webkit-box-shadow:0 10px 20px 
gray;-moz-box-shadow:0 10px 20px gray;-khtml-box-shadow:0 10px 20px gray;filter:progid:DXImageTransform.Microsoft.dropshadow(OffX=2, OffY=2, Color='gray', Positive='true')}.MathJax_Menu{position:absolute;background-color:#fff;color:#000;width:auto;padding:5px 0;border:1px solid #CCC;margin:0;cursor:default;font:menu;text-align:left;text-indent:0;text-transform:none;line-height:normal;letter-spacing:normal;word-spacing:normal;word-wrap:normal;white-space:nowrap;float:none;z-index:201;border-radius:5px;-webkit-border-radius:5px;-moz-border-radius:5px;-khtml-border-radius:5px;box-shadow:0 10px 20px gray;-webkit-box-shadow:0 10px 20px gray;-moz-box-shadow:0 10px 20px gray;-khtml-box-shadow:0 10px 20px gray;filter:progid:DXImageTransform.Microsoft.dropshadow(OffX=2, OffY=2, Color='gray', Positive='true')}.MathJax_MenuItem{padding:1px 2em;background:0 0}.MathJax_MenuArrow{position:absolute;right:.5em;color:#666}.MathJax_MenuActive .MathJax_MenuArrow{color:#fff}.MathJax_MenuArrow.RTL{left:.5em;right:auto}.MathJax_MenuCheck{position:absolute;left:.7em}.MathJax_MenuCheck.RTL{right:.7em;left:auto}.MathJax_MenuRadioCheck{position:absolute;left:.7em}.MathJax_MenuRadioCheck.RTL{right:.7em;left:auto}.MathJax_MenuLabel{padding:1px 2em 3px 1.33em;font-style:italic}.MathJax_MenuRule{border-top:1px solid #DDD;margin:4px 3px}.MathJax_MenuDisabled{color:GrayText}.MathJax_MenuActive{background-color:#606872;color:#fff}.MathJax_Menu_Close{position:absolute;width:31px;height:31px;top:-15px;left:-15px}#MathJax_Zoom{position:absolute;background-color:#F0F0F0;overflow:auto;display:block;z-index:301;padding:.5em;border:1px solid #000;margin:0;font-weight:400;font-style:normal;text-align:left;text-indent:0;text-transform:none;line-height:normal;letter-spacing:normal;word-spacing:normal;word-wrap:normal;white-space:nowrap;float:none;box-shadow:5px 5px 15px #AAA;-webkit-box-shadow:5px 5px 15px #AAA;-moz-box-shadow:5px 5px 15px #AAA;-khtml-box-shadow:5px 5px 15px 
#AAA;filter:progid:DXImageTransform.Microsoft.dropshadow(OffX=2, OffY=2, Color='gray', Positive='true')}#MathJax_ZoomOverlay{position:absolute;left:0;top:0;z-index:300;display:inline-block;width:100%;height:100%;border:0;padding:0;margin:0;background-color:#fff;opacity:0;filter:alpha(opacity=0)}#MathJax_ZoomFrame{position:relative;display:inline-block;height:0;width:0}#MathJax_ZoomEventTrap{position:absolute;left:0;top:0;z-index:302;display:inline-block;border:0;padding:0;margin:0;background-color:#fff;opacity:0;filter:alpha(opacity=0)}.MathJax_Preview{color:#888}#MathJax_Message{position:fixed;left:1px;bottom:2px;background-color:#E6E6E6;border:1px solid #959595;margin:0;padding:2px 8px;z-index:102;color:#000;font-size:80%;width:auto;white-space:nowrap}#MathJax_MSIE_Frame{position:absolute;top:0;left:0;width:0;z-index:101;border:0;margin:0;padding:0}.MathJax_Error{color:#C00;font-style:italic}footer{position:fixed;font-size:.8em;text-align:right;bottom:0;margin-left:-25px;height:20px;width:100%}</style> -</head> -<body class="markdown haroopad"> -<h1 id="scalfmm-with-starpu+cuda"><a name="scalfmm-with-starpu+cuda" href="#scalfmm-with-starpu+cuda"></a>ScalFMM with StarPU+CUDA</h1><p>In this tutorial, we provide the commands to install ScalFMM and the needed tools in order to compute parallel efficiencies.<br>We first show how to obtain the homogeneous efficencies and then the heterogeneous ones (not done yet).</p><h2 id="installing-the-libraries"><a name="installing-the-libraries" href="#installing-the-libraries"></a>Installing the libraries</h2><p>For some installation steps, we provide a “valid-if” test which shows if the previous command has been done correctly or not.<br>In case of success <code>STEP-OK</code> will be print-out.<br>In addition, if a library is already installed on the system, it is possible to set the output variables directly and test with the “valid-if” command if it will work.</p><p>It is possible to follow these steps only to compile 
ScalFMM above StarPU and so we marked the installation of execution-trace tools as <strong>Optional</strong>.<br>However, we highly recommend installing them and following all the steps, since they are needed to obtain the efficiencies.<br>But if one wants to execute without any overhead, it may be necessary to remove the usage of FXT.</p><h3 id="pre-requiste:"><a name="pre-requiste:" href="#pre-requiste:"></a>Prerequisites:</h3><p>In order to follow this tutorial, the following applications must be installed:</p><ul> -<li>autoconf (>= 2.69)</li><li>gawk (Awk >= 4.0.1)</li><li>make (>= 3.81) </li><li>cmake (>= 3.2.2)</li><li>gcc/g++ (>= 4.9) and the gcc/g++ names should point to the correct binaries</li><li>BLAS/LAPACK (The configure of ScalFMM is different if the MKL is used or not, but with the MKL it is recommended to set environment variable <code>MKLROOT</code>)</li><li>CUDA (>= 7) and <code>CUDA_PATH</code> must be set. In our case, <code>CUDA_PATH=/usr/local/cuda-7.5/</code></li><li><strong>Optional</strong> Vite (from <code>sudo apt-get install vite</code> or see <a href="http://vite.gforge.inria.fr/download.php"></a><a href="http://vite.gforge.inria.fr/download.php">http://vite.gforge.inria.fr/download.php</a>)</li><li><strong>Optional</strong> Qt5 library to be able to change the colors of the execution traces in order to visualize the different FMM operators</li><li>gnuplot to generate the figures</li></ul><blockquote> -<p>[Remark] Some installations of CUDA do not have the libcuda file.<br>In this case, one needs to create a link : <code>sudo ln /usr/local/cuda-7.5/lib64/libcudart.so /usr/local/cuda-7.5/lib64/libcuda.so</code></p> -<p>[Plafrim-Developers] </p> -<p>For those who use this tutorial on Plafrim (or a similar cluster), we provide extra information.</p> -<p>To allocate a heterogeneous node : <code>salloc -N 1 --time=03:00:00 --exclusive -p court_sirocco -CHaswell --gres=gpu:4 -x sirocco06</code></p> -<p>Then, find it using <code>squeue</code> 
and access it by <code>ssh</code>.</p> -<p>We have run this tutorial with the modules : <code>module load compiler/gcc/4.9.2 cuda75/toolkit/7.5.18 intel/mkl/64/11.2/2016.0.0 build/cmake/3.2.1</code></p> -</blockquote><h3 id="working-directory"><a name="working-directory" href="#working-directory"></a>Working directory</h3><p>The variable <code>SCALFMM_TEST_DIR</code> is used to specify the working directory where all the tools are going to be installed:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">export SCALFMM_TEST_DIR=~/scalfmm_test -cd $SCALFMM_TEST_DIR -</code></pre>"><span class="hljs-built_in">export</span> SCALFMM_TEST_DIR=~/scalfmm_<span class="hljs-built_in">test</span> -<span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span> -</code></pre><p>In order to be able to stop the tutorial in the middle and restart later, we will register the variables in a file that should be source to restart later:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash"># function scalfmmRegisterVariable() { echo "export $1=${!1}" &gt;&gt; "$SCALFMM_TEST_DIR/environment.source"; } -echo "function scalfmmRegisterVariable() { echo \"export \$1=\${!1}\" &gt;&gt; \"$SCALFMM_TEST_DIR/environment.source\"; }" &gt; "$SCALFMM_TEST_DIR/environment.source" -source "$SCALFMM_TEST_DIR/environment.source" -</code></pre>"><span class="hljs-comment"># function scalfmmRegisterVariable() { echo "export $1=${!1}" >> "$SCALFMM_TEST_DIR/environment.source"; }</span> -<span class="hljs-built_in">echo</span> <span class="hljs-string">"function scalfmmRegisterVariable() { echo \"export \$1=\${!1}\" >> \"<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/environment.source\"; }"</span> > <span class="hljs-string">"<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/environment.source"</span> -<span class="hljs-built_in">source</span> <span class="hljs-string">"<span 
class="hljs-variable">$SCALFMM_TEST_DIR</span>/environment.source"</span> -</code></pre><p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_TEST_DIR</code></p><p>Valid-if:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">if [[ -n $SCALFMM_TEST_DIR ]] &amp;&amp; [[ -d $SCALFMM_TEST_DIR ]] ; then - echo "STEP-OK" -fi -</code></pre>"><span class="hljs-keyword">if</span> [[ -n <span class="hljs-variable">$SCALFMM_TEST_DIR</span> ]] && [[ <span class="hljs-operator">-d</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span> ]] ; <span class="hljs-keyword">then</span> - <span class="hljs-built_in">echo</span> "STEP-OK" -<span class="hljs-keyword">fi</span> -</code></pre><ul> -<li>Restarting the tutorial</li></ul><p>To restart the tutorial, one needs to re-define the working directory and to source the saved file before resuming:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">export SCALFMM_TEST_DIR=~/scalfmm_test -if [[ ! -d $SCALFMM_TEST_DIR ]] ; then - mkdir $SCALFMM_TEST_DIR -else - source "$SCALFMM_TEST_DIR/environment.source" -fi -cd $SCALFMM_TEST_DIR -</code></pre>"><span class="hljs-built_in">export</span> SCALFMM_TEST_DIR=~/scalfmm_<span class="hljs-built_in">test</span> -<span class="hljs-keyword">if</span> [[ ! 
<span class="hljs-operator">-d</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span> ]] ; <span class="hljs-keyword">then</span> - mkdir <span class="hljs-variable">$SCALFMM_TEST_DIR</span> -<span class="hljs-keyword">else</span> - <span class="hljs-built_in">source</span> <span class="hljs-string">"<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/environment.source"</span> -<span class="hljs-keyword">fi</span> -<span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span> -</code></pre><h3 id="downloading-the-packages-(in-advance)"><a name="downloading-the-packages-(in-advance)" href="#downloading-the-packages-(in-advance)"></a>Downloading the Packages (in Advance)</h3><p>If the computational node does not have access to internet, we provide a command to download the needed packages (otherwise the next commands still include just in time download):</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">cd $SCALFMM_TEST_DIR -wget https://www.open-mpi.org/software/hwloc/v1.11/downloads/hwloc-1.11.2.tar.gz -wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.11.tar.gz # Optional -wget http://www.fftw.org/fftw-3.3.4.tar.gz -svn export svn://scm.gforge.inria.fr/svnroot/starpu/trunk starpu -git clone --depth=1 https://scm.gforge.inria.fr/anonscm/git/scalfmm-public/scalfmm-public.git -</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span> -wget https://www.open-mpi.org/software/hwloc/v1.<span class="hljs-number">11</span>/downloads/hwloc-<span class="hljs-number">1.11</span>.<span class="hljs-number">2</span>.tar.gz -wget http://download.savannah.gnu.org/releases/fkt/fxt-<span class="hljs-number">0.2</span>.<span class="hljs-number">11</span>.tar.gz <span class="hljs-comment"># Optional</span> -wget http://www.fftw.org/fftw-<span class="hljs-number">3.3</span>.<span class="hljs-number">4</span>.tar.gz -svn <span class="hljs-built_in">export</span> 
svn://scm.gforge.inria.fr/svnroot/starpu/trunk starpu -git <span class="hljs-built_in">clone</span> --depth=<span class="hljs-number">1</span> https://scm.gforge.inria.fr/anonscm/git/scalfmm-public/scalfmm-public.git -</code></pre><h3 id="hwloc"><a name="hwloc" href="#hwloc"></a>HWLOC</h3><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">cd $SCALFMM_TEST_DIR -if [[ ! -f hwloc-1.11.2.tar.gz ]] ; then - wget https://www.open-mpi.org/software/hwloc/v1.11/downloads/hwloc-1.11.2.tar.gz -fi -tar xvf hwloc-1.11.2.tar.gz -cd hwloc-1.11.2/ -export SCALFMM_HWLOC_DIR=$SCALFMM_TEST_DIR/hwlocinstall -./configure --prefix=$SCALFMM_HWLOC_DIR -make install -</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span> -<span class="hljs-keyword">if</span> [[ ! <span class="hljs-operator">-f</span> hwloc-<span class="hljs-number">1.11</span>.<span class="hljs-number">2</span>.tar.gz ]] ; <span class="hljs-keyword">then</span> - wget https://www.open-mpi.org/software/hwloc/v1.<span class="hljs-number">11</span>/downloads/hwloc-<span class="hljs-number">1.11</span>.<span class="hljs-number">2</span>.tar.gz -<span class="hljs-keyword">fi</span> -tar xvf hwloc-<span class="hljs-number">1.11</span>.<span class="hljs-number">2</span>.tar.gz -<span class="hljs-built_in">cd</span> hwloc-<span class="hljs-number">1.11</span>.<span class="hljs-number">2</span>/ -<span class="hljs-built_in">export</span> SCALFMM_HWLOC_DIR=<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/hwlocinstall -./configure --prefix=<span class="hljs-variable">$SCALFMM_HWLOC_DIR</span> -make install -</code></pre><p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_HWLOC_DIR</code></p><p>Valid-if:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">if [[ -n $SCALFMM_HWLOC_DIR ]] &amp;&amp; [[ -d $SCALFMM_HWLOC_DIR/lib/ ]] &amp;&amp; [[ -f $SCALFMM_HWLOC_DIR/lib/libhwloc.so ]]; then - echo "STEP-OK" 
-fi -</code></pre>"><span class="hljs-keyword">if</span> [[ -n <span class="hljs-variable">$SCALFMM_HWLOC_DIR</span> ]] && [[ <span class="hljs-operator">-d</span> <span class="hljs-variable">$SCALFMM_HWLOC_DIR</span>/lib/ ]] && [[ <span class="hljs-operator">-f</span> <span class="hljs-variable">$SCALFMM_HWLOC_DIR</span>/lib/libhwloc.so ]]; <span class="hljs-keyword">then</span> - <span class="hljs-built_in">echo</span> <span class="hljs-string">"STEP-OK"</span> -<span class="hljs-keyword">fi</span> -</code></pre><h3 id="fxt-(__optional__)"><a name="fxt-(__optional__)" href="#fxt-(__optional__)"></a>FXT (<strong>Optional</strong>)</h3><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">cd $SCALFMM_TEST_DIR -if [[ ! -f fxt-0.2.11.tar.gz ]] ; then - wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.11.tar.gz -fi -tar xvf fxt-0.2.11.tar.gz -cd fxt-0.2.11/ -export SCALFMM_FXT_DIR=$SCALFMM_TEST_DIR/fxtinstall -./configure --prefix=$SCALFMM_FXT_DIR -make install -</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span> -<span class="hljs-keyword">if</span> [[ ! 
<span class="hljs-operator">-f</span> fxt-<span class="hljs-number">0.2</span>.<span class="hljs-number">11</span>.tar.gz ]] ; <span class="hljs-keyword">then</span> - wget http://download.savannah.gnu.org/releases/fkt/fxt-<span class="hljs-number">0.2</span>.<span class="hljs-number">11</span>.tar.gz -<span class="hljs-keyword">fi</span> -tar xvf fxt-<span class="hljs-number">0.2</span>.<span class="hljs-number">11</span>.tar.gz -<span class="hljs-built_in">cd</span> fxt-<span class="hljs-number">0.2</span>.<span class="hljs-number">11</span>/ -<span class="hljs-built_in">export</span> SCALFMM_FXT_DIR=<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/fxtinstall -./configure --prefix=<span class="hljs-variable">$SCALFMM_FXT_DIR</span> -make install -</code></pre><p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_FXT_DIR</code></p><p>Valid-if:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">if [[ -n $SCALFMM_FXT_DIR ]] &amp;&amp; [[ -d $SCALFMM_FXT_DIR/lib/ ]] &amp;&amp; [[ -f $SCALFMM_FXT_DIR/lib/libfxt.so ]]; then - echo "STEP-OK" -fi -</code></pre>"><span class="hljs-keyword">if</span> [[ -n <span class="hljs-variable">$SCALFMM_FXT_DIR</span> ]] && [[ <span class="hljs-operator">-d</span> <span class="hljs-variable">$SCALFMM_FXT_DIR</span>/lib/ ]] && [[ <span class="hljs-operator">-f</span> <span class="hljs-variable">$SCALFMM_FXT_DIR</span>/lib/libfxt.so ]]; <span class="hljs-keyword">then</span> - <span class="hljs-built_in">echo</span> <span class="hljs-string">"STEP-OK"</span> -<span class="hljs-keyword">fi</span> -</code></pre><h3 id="fftw-(if-no-mkl-fft)"><a name="fftw-(if-no-mkl-fft)" href="#fftw-(if-no-mkl-fft)"></a>FFTW (If No MKL-FFT)</h3><p>For those who do not use MKL FFT interface, they have to install FFTW (float/double):</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">cd $SCALFMM_TEST_DIR -if [[ ! 
-f fftw-3.3.4.tar.gz ]] ; then - wget http://www.fftw.org/fftw-3.3.4.tar.gz -fi -tar xvf fftw-3.3.4.tar.gz -cd fftw-3.3.4/ -export SCALFMM_FFTW_DIR=$SCALFMM_TEST_DIR/fftinstall -./configure --prefix=$SCALFMM_FFTW_DIR -make install -./configure --prefix=$SCALFMM_FFTW_DIR --enable-float -make install -</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span> -<span class="hljs-keyword">if</span> [[ ! <span class="hljs-operator">-f</span> fftw-<span class="hljs-number">3.3</span>.<span class="hljs-number">4</span>.tar.gz ]] ; <span class="hljs-keyword">then</span> - wget http://www.fftw.org/fftw-<span class="hljs-number">3.3</span>.<span class="hljs-number">4</span>.tar.gz -<span class="hljs-keyword">fi</span> -tar xvf fftw-<span class="hljs-number">3.3</span>.<span class="hljs-number">4</span>.tar.gz -<span class="hljs-built_in">cd</span> fftw-<span class="hljs-number">3.3</span>.<span class="hljs-number">4</span>/ -<span class="hljs-built_in">export</span> SCALFMM_FFTW_DIR=<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/fftinstall -./configure --prefix=<span class="hljs-variable">$SCALFMM_FFTW_DIR</span> -make install -./configure --prefix=<span class="hljs-variable">$SCALFMM_FFTW_DIR</span> --enable-float -make install -</code></pre><p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_FFTW_DIR</code></p><p>Valid-if:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">if [[ -n $SCALFMM_FFTW_DIR ]] &amp;&amp; [[ -d $SCALFMM_FFTW_DIR/lib/ ]] &amp;&amp; [[ -f $SCALFMM_FFTW_DIR/lib/libfftw3.a ]] &amp;&amp; [[ -f $SCALFMM_FFTW_DIR/lib/libfftw3f.a ]]; then - echo "STEP-OK" -fi -</code></pre>"><span class="hljs-keyword">if</span> [[ -n <span class="hljs-variable">$SCALFMM_FFTW_DIR</span> ]] && [[ <span class="hljs-operator">-d</span> <span class="hljs-variable">$SCALFMM_FFTW_DIR</span>/lib/ ]] && [[ <span class="hljs-operator">-f</span> <span 
class="hljs-variable">$SCALFMM_FFTW_DIR</span>/lib/libfftw3.a ]] && [[ <span class="hljs-operator">-f</span> <span class="hljs-variable">$SCALFMM_FFTW_DIR</span>/lib/libfftw3f.a ]]; <span class="hljs-keyword">then</span> - <span class="hljs-built_in">echo</span> <span class="hljs-string">"STEP-OK"</span> -<span class="hljs-keyword">fi</span> -</code></pre><h3 id="starpu"><a name="starpu" href="#starpu"></a>StarPU</h3><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">cd $SCALFMM_TEST_DIR -if [[ ! -d starpu ]] ; then - svn export svn://scm.gforge.inria.fr/svnroot/starpu/trunk starpu -fi -cd starpu/ -export SCALFMM_STARPU_DIR=$SCALFMM_TEST_DIR/starpuinstall -./autogen.sh -./configure --prefix=$SCALFMM_STARPU_DIR --with-fxt=$SCALFMM_FXT_DIR --with-hwloc=$SCALFMM_HWLOC_DIR --with-cuda-dir=$CUDA_PATH --disable-opencl -make install -</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span> -<span class="hljs-keyword">if</span> [[ ! 
<span class="hljs-operator">-d</span> starpu ]] ; <span class="hljs-keyword">then</span> - svn <span class="hljs-built_in">export</span> svn://scm.gforge.inria.fr/svnroot/starpu/trunk starpu -<span class="hljs-keyword">fi</span> -<span class="hljs-built_in">cd</span> starpu/ -<span class="hljs-built_in">export</span> SCALFMM_STARPU_DIR=<span class="hljs-variable">$SCALFMM_TEST_DIR</span>/starpuinstall -./autogen.sh -./configure --prefix=<span class="hljs-variable">$SCALFMM_STARPU_DIR</span> --with-fxt=<span class="hljs-variable">$SCALFMM_FXT_DIR</span> --with-hwloc=<span class="hljs-variable">$SCALFMM_HWLOC_DIR</span> --with-cuda-dir=<span class="hljs-variable">$CUDA_PATH</span> --disable-opencl -make install -</code></pre><blockquote> -<p><strong>Optional</strong> In case you do not want to use trace (FXT) please remove the <code>--with-fxt=$SCALFMM_FXT_DIR</code> parameter from the command</p> -</blockquote><p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_STARPU_DIR</code></p><p>Valid-if:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">if [[ -n $SCALFMM_STARPU_DIR ]] &amp;&amp; [[ -d $SCALFMM_STARPU_DIR/lib/ ]] &amp;&amp; [[ -f $SCALFMM_STARPU_DIR/lib/libstarpu.so ]] ; then - echo "STEP-OK" -fi -</code></pre>"><span class="hljs-keyword">if</span> [[ -n <span class="hljs-variable">$SCALFMM_STARPU_DIR</span> ]] && [[ <span class="hljs-operator">-d</span> <span class="hljs-variable">$SCALFMM_STARPU_DIR</span>/lib/ ]] && [[ <span class="hljs-operator">-f</span> <span class="hljs-variable">$SCALFMM_STARPU_DIR</span>/lib/libstarpu.so ]] ; <span class="hljs-keyword">then</span> - <span class="hljs-built_in">echo</span> <span class="hljs-string">"STEP-OK"</span> -<span class="hljs-keyword">fi</span> -</code></pre><h3 id="scalfmm"><a name="scalfmm" href="#scalfmm"></a>ScalFMM</h3><h4 id="configure"><a name="configure" href="#configure"></a>Configure</h4><ul> -<li>Getting the source from the last commit:<pre class="bash 
hljs"><code class="bash" data-origin="<pre><code class="bash">cd $SCALFMM_TEST_DIR -if [[ ! -d scalfmm-public ]] ; then - git clone --depth=1 https://scm.gforge.inria.fr/anonscm/git/scalfmm-public/scalfmm-public.git -fi -cd scalfmm-public/ -export SCALFMM_SOURCE_DIR=`pwd` -cd Build/ -export SCALFMM_BUILD_DIR=`pwd` -</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_TEST_DIR</span> -<span class="hljs-keyword">if</span> [[ ! <span class="hljs-operator">-d</span> scalfmm-public ]] ; <span class="hljs-keyword">then</span> - git <span class="hljs-built_in">clone</span> --depth=<span class="hljs-number">1</span> https://scm.gforge.inria.fr/anonscm/git/scalfmm-public/scalfmm-public.git -<span class="hljs-keyword">fi</span> -<span class="hljs-built_in">cd</span> scalfmm-public/ -<span class="hljs-built_in">export</span> SCALFMM_SOURCE_DIR=`<span class="hljs-built_in">pwd</span>` -<span class="hljs-built_in">cd</span> Build/ -<span class="hljs-built_in">export</span> SCALFMM_BUILD_DIR=`<span class="hljs-built_in">pwd</span>` -</code></pre> -</li></ul><p><em>Output variables:</em> <code>scalfmmRegisterVariable SCALFMM_BUILD_DIR</code> <code>scalfmmRegisterVariable SCALFMM_SOURCE_DIR</code></p><ul> -<li>Configure (No MKL):<pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \ - -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \ - -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=OFF \ - -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \ - -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \ - -DHWLOC_DIR=$SCALFMM_HWLOC_DIR -DSTARPU_DIR=$SCALFMM_STARPU_DIR \ - -DSCALFMM_USE_FFT=ON -DFFT_DIR=$SCALFMM_FFT_DIR -</code></pre>">cmake .. 
-DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \ - -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \ - -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=OFF \ - -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \ - -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \ - -DHWLOC_DIR=<span class="hljs-variable">$SCALFMM_HWLOC_DIR</span> -DSTARPU_DIR=<span class="hljs-variable">$SCALFMM_STARPU_DIR</span> \ - -DSCALFMM_USE_FFT=ON -DFFT_DIR=<span class="hljs-variable">$SCALFMM_FFT_DIR</span> -</code></pre> -</li><li>Configure (MKL BLAS/LAPACK and FFTW):<pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \ - -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \ - -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=ON \ - -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \ - -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \ - -DHWLOC_DIR=$SCALFMM_HWLOC_DIR -DSTARPU_DIR=$SCALFMM_STARPU_DIR \ - -DSCALFMM_USE_FFT=ON -DFFT_DIR=$SCALFMM_FFT_DIR -</code></pre>">cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \ - -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \ - -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=ON \ - -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \ - -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \ - -DHWLOC_DIR=<span class="hljs-variable">$SCALFMM_HWLOC_DIR</span> -DSTARPU_DIR=<span class="hljs-variable">$SCALFMM_STARPU_DIR</span> \ - -DSCALFMM_USE_FFT=ON -DFFT_DIR=<span class="hljs-variable">$SCALFMM_FFT_DIR</span> -</code></pre> -</li><li>Configure (MKL BLAS/LAPACK/FFT and No FFTW):</li></ul><blockquote> -<p>[Plafrim-Developers] Should use that one</p> -</blockquote><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">cmake .. 
-DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \ - -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \ - -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=ON \ - -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \ - -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \ - -DHWLOC_DIR=$SCALFMM_HWLOC_DIR -DSTARPU_DIR=$SCALFMM_STARPU_DIR \ - -DSCALFMM_USE_FFT=ON -DSCALFMM_USE_MKL_AS_FFTW=ON -</code></pre>">cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \ - -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \ - -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=ON \ - -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \ - -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \ - -DHWLOC_DIR=<span class="hljs-variable">$SCALFMM_HWLOC_DIR</span> -DSTARPU_DIR=<span class="hljs-variable">$SCALFMM_STARPU_DIR</span> \ - -DSCALFMM_USE_FFT=ON -DSCALFMM_USE_MKL_AS_FFTW=ON -</code></pre><p>Valid-if:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">cmake .. ; if [[ "$?" == "0" ]] ; then echo "STEP-OK" ; fi -</code></pre>">cmake .. ; <span class="hljs-keyword">if</span> [[ <span class="hljs-string">"$?"</span> == <span class="hljs-string">"0"</span> ]] ; <span class="hljs-keyword">then</span> <span class="hljs-built_in">echo</span> <span class="hljs-string">"STEP-OK"</span> ; <span class="hljs-keyword">fi</span> -</code></pre><h4 id="build"><a name="build" href="#build"></a>Build</h4><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">cd $SCALFMM_BUILD_DIR -make testBlockedUnifCudaBench -</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_BUILD_DIR</span> -make <span class="hljs-built_in">test</span>BlockedUnifCudaBench -</code></pre><p>Valid-if:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">ls ./Tests/Release/testBlockedUnifCudaBench ; if [[ "$?" 
== "0" ]] ; then echo "STEP-OK" ; fi -</code></pre>">ls ./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench ; <span class="hljs-keyword">if</span> [[ <span class="hljs-string">"$?"</span> == <span class="hljs-string">"0"</span> ]] ; <span class="hljs-keyword">then</span> <span class="hljs-built_in">echo</span> <span class="hljs-string">"STEP-OK"</span> ; <span class="hljs-keyword">fi</span> -</code></pre><h4 id="first-execution"><a name="first-execution" href="#first-execution"></a>First Execution</h4><p>In this section we compute a simulation and look at the resulting trace.<br>ScalFMM binary parameters and descriptions:</p><ul> -<li>Passing <code>--help</code> as parameter provides the possible/valid parameters</li><li>Simulation properties are chosen by :<ul> -<li><code>-h</code> : height of the tree</li><li><code>-bs</code> : granularity/size of the group</li><li><code>-nb</code> : number of particles generated</li></ul> -</li><li>Execution properties are chosen by the StarPU environment variables :<ul> -<li><code>STARPU_NCPUS</code> : the number of CPU workers</li><li><code>STARPU_NCUDA</code> : the number of GPU workers (for heterogeneous binary)</li></ul> -</li><li>By default the application will not compare the FMM interactions against the direct method (which is N^2) and so it is recommended to avoid the validation for large test cases. 
But to get the accuracy one must pass the parameter <code>-validation</code></li><li><code>-p2p-m2l-cuda-only</code> : to compute the P2P and the M2L only on GPU (the rest on the CPU)</li></ul><p>Examples:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">export STARPU_NCPUS=12 -export STARPU_NCUDA=2 -./Tests/Release/testBlockedUnifCudaBench -nb 30000000 -h 7 -bs 800 -</code></pre>"><span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-number">12</span> -<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">2</span> -./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench -nb <span class="hljs-number">30000000</span> -h <span class="hljs-number">7</span> -bs <span class="hljs-number">800</span> -</code></pre><p>Last part of the output should be:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash"> Start FGroupTaskStarPUAlgorithm - directPass in 0.0406482s - inblock in 0.000780428s - outblock in 0.0398674s - bottomPass in 0.00586269s - upwardPass in 0.00265723s - transferPass in 0.00323571s - inblock in 0.000124817s - outblock in 0.00298331s - downardPass in 0.00257975s - transferPass in 0.0652285s - inblock in 0.00164774s - outblock in 0.0635799s - L2P in 0.0115733s - Submitting the tasks took 0.139101s - Moving data to the host took 0.0578765s -@EXEC TIME = 14.6321s -</code></pre>"> Start FGroupTaskStarPUAlgorithm - directPass <span class="hljs-keyword">in</span> <span class="hljs-number">0.0406482</span>s - inblock <span class="hljs-keyword">in</span> <span class="hljs-number">0.000780428</span>s - outblock <span class="hljs-keyword">in</span> <span class="hljs-number">0.0398674</span>s - bottomPass <span class="hljs-keyword">in</span> <span class="hljs-number">0.00586269</span>s - upwardPass <span class="hljs-keyword">in</span> <span class="hljs-number">0.00265723</span>s - transferPass <span class="hljs-keyword">in</span> <span 
class="hljs-number">0.00323571</span>s - inblock <span class="hljs-keyword">in</span> <span class="hljs-number">0.000124817</span>s - outblock <span class="hljs-keyword">in</span> <span class="hljs-number">0.00298331</span>s - downardPass <span class="hljs-keyword">in</span> <span class="hljs-number">0.00257975</span>s - transferPass <span class="hljs-keyword">in</span> <span class="hljs-number">0.0652285</span>s - inblock <span class="hljs-keyword">in</span> <span class="hljs-number">0.00164774</span>s - outblock <span class="hljs-keyword">in</span> <span class="hljs-number">0.0635799</span>s - L2P <span class="hljs-keyword">in</span> <span class="hljs-number">0.0115733</span>s - Submitting the tasks took <span class="hljs-number">0.139101</span>s - Moving data to the host took <span class="hljs-number">0.0578765</span>s -@EXEC TIME = <span class="hljs-number">14.6321</span>s -</code></pre><ul> -<li>Visualize the execution trace (<strong>Optional</strong>)</li></ul><p>Convert the fxt file</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">$SCALFMM_STARPU_DIR/bin/starpu_fxt_tool -i "/tmp/prof_file_"$USER"_0" -</code></pre>"><span class="hljs-variable">$SCALFMM_STARPU_DIR</span>/bin/starpu_fxt_tool -i <span class="hljs-string">"/tmp/prof_file_"</span><span class="hljs-variable">$USER</span><span class="hljs-string">"_0"</span> -</code></pre><p>Then visualize the output with <code>vite</code> (maybe by copying the paje.trace file locally)</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">vite ./paje.trace -</code></pre>">vite ./paje.trace -</code></pre><p>Should be like:<br><img src="/home/bbramas/Projects/ScalfmmGit/scalfmm/Addons/BenchEfficiency/trace-example.png" alt="Trace"></p><p>We can convert the color of the trace by (requiere Qt5 library):</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">$SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/pajecolor paje.trace 
$SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/paintmodel.fmm.colors -vite ./paje.trace.painted -</code></pre>"><span class="hljs-variable">$SCALFMM_SOURCE_DIR</span>/Addons/BenchEfficiency/pajecolor paje.trace <span class="hljs-variable">$SCALFMM_SOURCE_DIR</span>/Addons/BenchEfficiency/paintmodel.fmm.colors -vite ./paje.trace.painted -</code></pre><p>Should be like:<br><img src="/home/bbramas/Projects/ScalfmmGit/scalfmm/Addons/BenchEfficiency/trace-example-colors.png" alt="Trace"></p><ul> -<li>Get execution times</li></ul><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">python $SCALFMM_STARPU_DIR/bin/starpu_trace_state_stats.py -t trace.rec -</code></pre>">python <span class="hljs-variable">$SCALFMM_STARPU_DIR</span>/bin/starpu_trace_state_stats.py -t trace.rec -</code></pre><p>Should give something like:</p><pre><code data-origin="<pre><code>"Name","Count","Type","Duration" -"Initializing",14,"Runtime",7153.096196 -"Overhead",57010,"Runtime",376.473463 -"Idle",14355,"Other",12.815899 -"Scheduling",28441,"Runtime",238.367394 -"Sleeping",610,"Other",13786.513208 -"FetchingInput",14341,"Runtime",13918.805814 -"execute_on_all_wrapper",30,"Task",21.288802 -"Executing",414,"Runtime",26852.864578 -"PushingOutput",14341,"Runtime",284.96123 -"P2P-out",3846,"Task",60378.266619 -"Callback",13559,"Runtime",4.210633 -"P2P",328,"Task",15383.426991 -"M2L-level-5",41,"Task",2354.702554 -"M2L-level-6",328,"Task",18349.915495 -"Deinitializing",14,"Runtime",109.87483 -"M2L-level-4",6,"Task",275.088295 -"P2M",328,"Task",11312.022842 -"M2M-level-5",328,"Task",829.9055 -"M2M-level-4",41,"Task",93.130498 -"M2L-out-level-5",638,"Task",1914.900053 -"M2M-level-3",6,"Task",11.053067 -"M2M-level-2",1,"Task",1.363157 -"M2L-out-level-4",22,"Task",159.580457 -"L2L-level-4",41,"Task",84.554065 -"L2L-level-5",328,"Task",1087.717767 -"M2L-out-level-6",7692,"Task",18322.518045 -"L2P",328,"Task",27146.256793 -"M2L-level-2",1,"Task",2.661235 
-"L2L-level-3",6,"Task",11.346978 -"M2L-level-3",1,"Task",47.612555 -"L2L-level-2",1,"Task",1.471873 -</code></pre>">"Name","Count","Type","Duration" -"Initializing",14,"Runtime",7153.096196 -"Overhead",57010,"Runtime",376.473463 -"Idle",14355,"Other",12.815899 -"Scheduling",28441,"Runtime",238.367394 -"Sleeping",610,"Other",13786.513208 -"FetchingInput",14341,"Runtime",13918.805814 -"execute_on_all_wrapper",30,"Task",21.288802 -"Executing",414,"Runtime",26852.864578 -"PushingOutput",14341,"Runtime",284.96123 -"P2P-out",3846,"Task",60378.266619 -"Callback",13559,"Runtime",4.210633 -"P2P",328,"Task",15383.426991 -"M2L-level-5",41,"Task",2354.702554 -"M2L-level-6",328,"Task",18349.915495 -"Deinitializing",14,"Runtime",109.87483 -"M2L-level-4",6,"Task",275.088295 -"P2M",328,"Task",11312.022842 -"M2M-level-5",328,"Task",829.9055 -"M2M-level-4",41,"Task",93.130498 -"M2L-out-level-5",638,"Task",1914.900053 -"M2M-level-3",6,"Task",11.053067 -"M2M-level-2",1,"Task",1.363157 -"M2L-out-level-4",22,"Task",159.580457 -"L2L-level-4",41,"Task",84.554065 -"L2L-level-5",328,"Task",1087.717767 -"M2L-out-level-6",7692,"Task",18322.518045 -"L2P",328,"Task",27146.256793 -"M2L-level-2",1,"Task",2.661235 -"L2L-level-3",6,"Task",11.346978 -"M2L-level-3",1,"Task",47.612555 -"L2L-level-2",1,"Task",1.471873 -</code></pre><p>Most of the script are in the addon directories</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">export SCALFMM_AB=$SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/ -</code></pre>"><span class="hljs-built_in">export</span> SCALFMM_AB=<span class="hljs-variable">$SCALFMM_SOURCE_DIR</span>/Addons/BenchEfficiency/ -</code></pre><p><em>Output variable:</em> <code>scalfmmRegisterVariable SCALFMM_AB</code></p><h2 id="homogeneous-efficiencies"><a name="homogeneous-efficiencies" href="#homogeneous-efficiencies"></a>Homogeneous Efficiencies</h2><p>Here we compute the efficiencies for a given test case on CPU only.</p><p>Go in the build dir and create 
output dir</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">cd $SCALFMM_BUILD_DIR -export SCALFMM_RES_DIR=$SCALFMM_BUILD_DIR/homogeneous -mkdir $SCALFMM_RES_DIR -</code></pre>"><span class="hljs-built_in">cd</span> <span class="hljs-variable">$SCALFMM_BUILD_DIR</span> -<span class="hljs-built_in">export</span> SCALFMM_RES_DIR=<span class="hljs-variable">$SCALFMM_BUILD_DIR</span>/homogeneous -mkdir <span class="hljs-variable">$SCALFMM_RES_DIR</span> -</code></pre><p><em>Output variable:</em> <code>scalfmmRegisterVariable SCALFMM_RES_DIR</code> </p><p>Set up the configuration variables:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">export SCALFMM_NB=10000000 -export SCALFMM_H=7 -export SCALFMM_MIN_BS=100 -export SCALFMM_MAX_BS=10000 -export SCALFMM_MAX_NB_CPU=24 -</code></pre>"><span class="hljs-built_in">export</span> SCALFMM_NB=<span class="hljs-number">10000000</span> -<span class="hljs-built_in">export</span> SCALFMM_H=<span class="hljs-number">7</span> -<span class="hljs-built_in">export</span> SCALFMM_MIN_BS=<span class="hljs-number">100</span> -<span class="hljs-built_in">export</span> SCALFMM_MAX_BS=<span class="hljs-number">10000</span> -<span class="hljs-built_in">export</span> SCALFMM_MAX_NB_CPU=<span class="hljs-number">24</span> -</code></pre><p>Find best granularity in sequential and in parallel:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">export STARPU_NCPUS=1 -export STARPU_NCUDA=0 -export SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh "./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs" $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmmExtractKey.sh "@BEST BS" ` -if [[ `which gnuplot | wc -l` == "1" ]] ; then - gnuplot -e "filename='seq-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot -fi - -export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -export STARPU_NCUDA=0 -export SCALFMM_BS_CPU_PAR=`$SCALFMM_AB/scalfmmFindBs.sh 
"./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs" $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` -if [[ `which gnuplot | wc -l` == "1" ]] ; then - gnuplot -e "filename='par-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot -fi -</code></pre>"><span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-number">1</span> -<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span> -<span class="hljs-built_in">export</span> SCALFMM_BS_CPU_SEQ=`<span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.sh <span class="hljs-string">"./Tests/Release/testBlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs"</span> <span class="hljs-variable">$SCALFMM_MIN_BS</span> <span class="hljs-variable">$SCALFMM_MAX_BS</span> | <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmExtractKey.sh <span class="hljs-string">"@BEST BS"</span> ` -<span class="hljs-keyword">if</span> [[ `<span class="hljs-built_in">which</span> gnuplot | wc <span class="hljs-operator">-l</span>` == <span class="hljs-string">"1"</span> ]] ; <span class="hljs-keyword">then</span> - gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='seq-bs-search'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.gplot -<span class="hljs-keyword">fi</span> - -<span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-variable">$SCALFMM_MAX_NB_CPU</span> -<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span> -<span class="hljs-built_in">export</span> SCALFMM_BS_CPU_PAR=`<span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.sh <span class="hljs-string">"./Tests/Release/testBlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs"</span> <span class="hljs-variable">$SCALFMM_MIN_BS</span> 
<span class="hljs-variable">$SCALFMM_MAX_BS</span> | <span class="hljs-variable">$SCALFMM_AB</span>/scalfmm_extract_key <span class="hljs-string">"@BEST BS"</span> ` -<span class="hljs-keyword">if</span> [[ `<span class="hljs-built_in">which</span> gnuplot | wc <span class="hljs-operator">-l</span>` == <span class="hljs-string">"1"</span> ]] ; <span class="hljs-keyword">then</span> - gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='par-bs-search'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.gplot -<span class="hljs-keyword">fi</span> -</code></pre><p>In our case we get 9710 and 5385.</p><p><em>Output variable:</em> <code>scalfmmRegisterVariable SCALFMM_BS_CPU_SEQ</code> <code>scalfmmRegisterVariable SCALFMM_BS_CPU_PAR</code></p><p>We can look to the work that has been done to find the best granularity:<br><img src="/home/bbramas/Projects/ScalfmmGit/scalfmm/Addons/BenchEfficiency/seq-bs-search.png" alt="In sequential"><br><img src="/home/bbramas/Projects/ScalfmmGit/scalfmm/Addons/BenchEfficiency/par-bs-search.png" alt="In parallel"></p><p>Then we compute the efficiency using both granulirities and keep the .rec files:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">export SCALFMM_MAX_NB_CPU=24 -export STARPU_NCUDA=0 -source "$SCALFMM_AB/execAllHomogeneous.sh" -</code></pre>"><span class="hljs-built_in">export</span> SCALFMM_MAX_NB_CPU=<span class="hljs-number">24</span> -<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span> -<span class="hljs-built_in">source</span> <span class="hljs-string">"<span class="hljs-variable">$SCALFMM_AB</span>/execAllHomogeneous.sh"</span> -</code></pre><p>We should end with all the .rec files and their corresponding time files and <code>ls "$SCALFMM_RES_DIR"</code> should return something like:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code 
class="bash">trace-nb_10000000-h_7-bs_5385-CPU_10.rec trace-nb_10000000-h_7-bs_5385-CPU_16.rec.time trace-nb_10000000-h_7-bs_5385-CPU_22.rec trace-nb_10000000-h_7-bs_5385-CPU_5.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_10.rec.time trace-nb_10000000-h_7-bs_5385-CPU_17.rec trace-nb_10000000-h_7-bs_5385-CPU_22.rec.time trace-nb_10000000-h_7-bs_5385-CPU_6.rec -trace-nb_10000000-h_7-bs_5385-CPU_11.rec trace-nb_10000000-h_7-bs_5385-CPU_17.rec.time trace-nb_10000000-h_7-bs_5385-CPU_23.rec trace-nb_10000000-h_7-bs_5385-CPU_6.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_11.rec.time trace-nb_10000000-h_7-bs_5385-CPU_18.rec trace-nb_10000000-h_7-bs_5385-CPU_23.rec.time trace-nb_10000000-h_7-bs_5385-CPU_7.rec -trace-nb_10000000-h_7-bs_5385-CPU_12.rec trace-nb_10000000-h_7-bs_5385-CPU_18.rec.time trace-nb_10000000-h_7-bs_5385-CPU_24.rec trace-nb_10000000-h_7-bs_5385-CPU_7.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_12.rec.time trace-nb_10000000-h_7-bs_5385-CPU_19.rec trace-nb_10000000-h_7-bs_5385-CPU_24.rec.time trace-nb_10000000-h_7-bs_5385-CPU_8.rec -trace-nb_10000000-h_7-bs_5385-CPU_13.rec trace-nb_10000000-h_7-bs_5385-CPU_19.rec.time trace-nb_10000000-h_7-bs_5385-CPU_2.rec trace-nb_10000000-h_7-bs_5385-CPU_8.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_13.rec.time trace-nb_10000000-h_7-bs_5385-CPU_1.rec trace-nb_10000000-h_7-bs_5385-CPU_2.rec.time trace-nb_10000000-h_7-bs_5385-CPU_9.rec -trace-nb_10000000-h_7-bs_5385-CPU_14.rec trace-nb_10000000-h_7-bs_5385-CPU_1.rec.time trace-nb_10000000-h_7-bs_5385-CPU_3.rec trace-nb_10000000-h_7-bs_5385-CPU_9.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_14.rec.time trace-nb_10000000-h_7-bs_5385-CPU_20.rec trace-nb_10000000-h_7-bs_5385-CPU_3.rec.time trace-nb_10000000-h_7-bs_9710-CPU_1.rec -trace-nb_10000000-h_7-bs_5385-CPU_15.rec trace-nb_10000000-h_7-bs_5385-CPU_20.rec.time trace-nb_10000000-h_7-bs_5385-CPU_4.rec trace-nb_10000000-h_7-bs_9710-CPU_1.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_15.rec.time 
trace-nb_10000000-h_7-bs_5385-CPU_21.rec trace-nb_10000000-h_7-bs_5385-CPU_4.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_16.rec trace-nb_10000000-h_7-bs_5385-CPU_21.rec.time trace-nb_10000000-h_7-bs_5385-CPU_5.rec -</code></pre>">trace-nb_10000000-h_7-bs_5385-CPU_10.rec trace-nb_10000000-h_7-bs_5385-CPU_16.rec.time trace-nb_10000000-h_7-bs_5385-CPU_22.rec trace-nb_10000000-h_7-bs_5385-CPU_5.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_10.rec.time trace-nb_10000000-h_7-bs_5385-CPU_17.rec trace-nb_10000000-h_7-bs_5385-CPU_22.rec.time trace-nb_10000000-h_7-bs_5385-CPU_6.rec -trace-nb_10000000-h_7-bs_5385-CPU_11.rec trace-nb_10000000-h_7-bs_5385-CPU_17.rec.time trace-nb_10000000-h_7-bs_5385-CPU_23.rec trace-nb_10000000-h_7-bs_5385-CPU_6.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_11.rec.time trace-nb_10000000-h_7-bs_5385-CPU_18.rec trace-nb_10000000-h_7-bs_5385-CPU_23.rec.time trace-nb_10000000-h_7-bs_5385-CPU_7.rec -trace-nb_10000000-h_7-bs_5385-CPU_12.rec trace-nb_10000000-h_7-bs_5385-CPU_18.rec.time trace-nb_10000000-h_7-bs_5385-CPU_24.rec trace-nb_10000000-h_7-bs_5385-CPU_7.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_12.rec.time trace-nb_10000000-h_7-bs_5385-CPU_19.rec trace-nb_10000000-h_7-bs_5385-CPU_24.rec.time trace-nb_10000000-h_7-bs_5385-CPU_8.rec -trace-nb_10000000-h_7-bs_5385-CPU_13.rec trace-nb_10000000-h_7-bs_5385-CPU_19.rec.time trace-nb_10000000-h_7-bs_5385-CPU_2.rec trace-nb_10000000-h_7-bs_5385-CPU_8.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_13.rec.time trace-nb_10000000-h_7-bs_5385-CPU_1.rec trace-nb_10000000-h_7-bs_5385-CPU_2.rec.time trace-nb_10000000-h_7-bs_5385-CPU_9.rec -trace-nb_10000000-h_7-bs_5385-CPU_14.rec trace-nb_10000000-h_7-bs_5385-CPU_1.rec.time trace-nb_10000000-h_7-bs_5385-CPU_3.rec trace-nb_10000000-h_7-bs_5385-CPU_9.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_14.rec.time trace-nb_10000000-h_7-bs_5385-CPU_20.rec trace-nb_10000000-h_7-bs_5385-CPU_3.rec.time trace-nb_10000000-h_7-bs_9710-CPU_1.rec 
-trace-nb_10000000-h_7-bs_5385-CPU_15.rec trace-nb_10000000-h_7-bs_5385-CPU_20.rec.time trace-nb_10000000-h_7-bs_5385-CPU_4.rec trace-nb_10000000-h_7-bs_9710-CPU_1.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_15.rec.time trace-nb_10000000-h_7-bs_5385-CPU_21.rec trace-nb_10000000-h_7-bs_5385-CPU_4.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_16.rec trace-nb_10000000-h_7-bs_5385-CPU_21.rec.time trace-nb_10000000-h_7-bs_5385-CPU_5.rec -</code></pre><p>We then compute the efficiencies from these files</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">g++ -std=c++11 $SCALFMM_AB/mergetimefile.cpp -o $SCALFMM_AB/mergetimefile.exe -$SCALFMM_AB/mergetimefile.exe \ - "$SCALFMM_RES_DIR/trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_BS_CPU_SEQ-CPU_1.rec.time" \ - "$SCALFMM_RES_DIR/trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_BS_CPU_PAR-CPU_%d.rec.time"\ - $SCALFMM_MAX_NB_CPU -</code></pre>">g++ -std=c++<span class="hljs-number">11</span> <span class="hljs-variable">$SCALFMM_AB</span>/mergetimefile.cpp -o <span class="hljs-variable">$SCALFMM_AB</span>/mergetimefile.exe -<span class="hljs-variable">$SCALFMM_AB</span>/mergetimefile.exe \ - <span class="hljs-string">"<span class="hljs-variable">$SCALFMM_RES_DIR</span>/trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_BS_CPU_SEQ</span>-CPU_1.rec.time"</span> \ - <span class="hljs-string">"<span class="hljs-variable">$SCALFMM_RES_DIR</span>/trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_BS_CPU_PAR</span>-CPU_%d.rec.time"</span>\ - <span class="hljs-variable">$SCALFMM_MAX_NB_CPU</span> -</code></pre><p>We end-up with the global efficiencies (for the application) but also for the different operators.</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">Create global-eff.data -Create 
task-eff.data -Create task-gr-eff.dat -</code></pre>">Create global-eff.data -Create task-eff.data -Create task-gr-eff.dat -</code></pre><p>We can plot each of them</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">gnuplot -e "filename='global-eff'" $SCALFMM_AB/scalfmmPlotAll.gplot -gnuplot -e "filename='task-eff'" $SCALFMM_AB/scalfmmPlotAll.gplot -gnuplot -e "filename='task-gr-eff'" $SCALFMM_AB/scalfmmPlotAll.gplot -</code></pre>">gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='global-eff'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmPlotAll.gplot -gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='task-eff'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmPlotAll.gplot -gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='task-gr-eff'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmPlotAll.gplot -</code></pre><p>In our case it gives:<br><img src="/home/bbramas/Projects/ScalfmmGit/scalfmm/Addons/BenchEfficiency/global-eff.png" alt="global-eff"><br><img src="/home/bbramas/Projects/ScalfmmGit/scalfmm/Addons/BenchEfficiency/task-eff.png" alt="task-eff"><br><img src="/home/bbramas/Projects/ScalfmmGit/scalfmm/Addons/BenchEfficiency/task-gr-eff.png" alt="task-gr-eff"></p><h2 id="heterogeneous"><a name="heterogeneous" href="#heterogeneous"></a>Heterogeneous</h2><p><strong>NOT FINISHED!!!!</strong></p><p>For test case <code>-nb 10000000</code> (10 million) and <code>-h 6</code> (height of the tree equal to 6),<br>we first want to know the best granularity <code>-bs</code>.</p><p>This parameter will certainly not be the same for sequential/parallel/heterogenous configurations.</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">export SCALFMM_NB=10000000 -export SCALFMM_H=7 -export SCALFMM_MIN_BS=100 -export SCALFMM_MAX_BS=3000 -export SCALFMM_MAX_NB_CPU=24 -export 
SCALFMM_MAX_NB_GPU=4 -</code></pre>"><span class="hljs-built_in">export</span> SCALFMM_NB=<span class="hljs-number">10000000</span> -<span class="hljs-built_in">export</span> SCALFMM_H=<span class="hljs-number">7</span> -<span class="hljs-built_in">export</span> SCALFMM_MIN_BS=<span class="hljs-number">100</span> -<span class="hljs-built_in">export</span> SCALFMM_MAX_BS=<span class="hljs-number">3000</span> -<span class="hljs-built_in">export</span> SCALFMM_MAX_NB_CPU=<span class="hljs-number">24</span> -<span class="hljs-built_in">export</span> SCALFMM_MAX_NB_GPU=<span class="hljs-number">4</span> -</code></pre><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">export STARPU_NCPUS=1 -export STARPU_NCUDA=0 -export SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` -if [[ `which gnuplot | wc -l` == "1" ]] ; then - gnuplot -e "filename='seq-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot -fi - -export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -export STARPU_NCUDA=0 -export SCALFMM_BS_CPU_PAR=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` -if [[ `which gnuplot | wc -l` == "1" ]] ; then - gnuplot -e "filename='par-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot -fi - -export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU -export SCALFMM_BS_CPU_GPU=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` -if [[ `which gnuplot | wc -l` == "1" ]] ; then - gnuplot -e "filename='cpugpu-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot -fi -</code></pre>"><span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-number">1</span> -<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span> -<span 
class="hljs-built_in">export</span> SCALFMM_BS_CPU_SEQ=`<span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.sh -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> <span class="hljs-variable">$SCALFMM_MIN_BS</span> <span class="hljs-variable">$SCALFMM_MAX_BS</span> | <span class="hljs-variable">$SCALFMM_AB</span>/scalfmm_extract_key <span class="hljs-string">"@BEST BS"</span> ` -<span class="hljs-keyword">if</span> [[ `<span class="hljs-built_in">which</span> gnuplot | wc <span class="hljs-operator">-l</span>` == <span class="hljs-string">"1"</span> ]] ; <span class="hljs-keyword">then</span> - gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='seq-bs-search'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.gplot -<span class="hljs-keyword">fi</span> - -<span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-variable">$SCALFMM_MAX_NB_CPU</span> -<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span> -<span class="hljs-built_in">export</span> SCALFMM_BS_CPU_PAR=`<span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.sh -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> <span class="hljs-variable">$SCALFMM_MIN_BS</span> <span class="hljs-variable">$SCALFMM_MAX_BS</span> | <span class="hljs-variable">$SCALFMM_AB</span>/scalfmm_extract_key <span class="hljs-string">"@BEST BS"</span> ` -<span class="hljs-keyword">if</span> [[ `<span class="hljs-built_in">which</span> gnuplot | wc <span class="hljs-operator">-l</span>` == <span class="hljs-string">"1"</span> ]] ; <span class="hljs-keyword">then</span> - gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='par-bs-search'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.gplot -<span class="hljs-keyword">fi</span> - -<span class="hljs-built_in">export</span> 
STARPU_NCPUS=<span class="hljs-variable">$SCALFMM_MAX_NB_CPU</span> -<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-variable">$SCALFMM_MAX_NB_GPU</span> -<span class="hljs-built_in">export</span> SCALFMM_BS_CPU_GPU=`<span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.sh -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> <span class="hljs-variable">$SCALFMM_MIN_BS</span> <span class="hljs-variable">$SCALFMM_MAX_BS</span> | <span class="hljs-variable">$SCALFMM_AB</span>/scalfmm_extract_key <span class="hljs-string">"@BEST BS"</span> ` -<span class="hljs-keyword">if</span> [[ `<span class="hljs-built_in">which</span> gnuplot | wc <span class="hljs-operator">-l</span>` == <span class="hljs-string">"1"</span> ]] ; <span class="hljs-keyword">then</span> - gnuplot <span class="hljs-operator">-e</span> <span class="hljs-string">"filename='cpugpu-bs-search'"</span> <span class="hljs-variable">$SCALFMM_AB</span>/scalfmmFindBs.gplot -<span class="hljs-keyword">fi</span> -</code></pre><p>Then, we can execute three best configurations, and keep .rec for each of them:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">export STARPU_NCPUS=1 -export STARPU_NCUDA=0 -./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_CPU_SEQ -export SCALFMM_SEQ_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" -mv trace.rec $SCALFMM_SEQ_REC - -export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -export STARPU_NCUDA=0 -./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR -export SCALFMM_PAR_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" -mv trace.rec $SCALFMM_PAR_REC - -export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU -./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs 
$SCALFMM_BS_CPU_GPU -export SCALFMM_PAR_CPU_GPU_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" -mv trace.rec $SCALFMM_PAR_CPU_GPU_REC -</code></pre>"><span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-number">1</span> -<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span> -./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs <span class="hljs-variable">$SCALFMM_CPU_SEQ</span> -<span class="hljs-built_in">export</span> SCALFMM_SEQ_REC=<span class="hljs-string">"trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_CPU_SEQ</span>-CPU_<span class="hljs-variable">$STARPU_NCPUS</span>-GPU_<span class="hljs-variable">$STARPU_NCUDA</span>.rec"</span> -mv trace.rec <span class="hljs-variable">$SCALFMM_SEQ_REC</span> - -<span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-variable">$SCALFMM_MAX_NB_CPU</span> -<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span> -./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs <span class="hljs-variable">$SCALFMM_BS_CPU_PAR</span> -<span class="hljs-built_in">export</span> SCALFMM_PAR_REC=<span class="hljs-string">"trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_CPU_SEQ</span>-CPU_<span class="hljs-variable">$STARPU_NCPUS</span>-GPU_<span class="hljs-variable">$STARPU_NCUDA</span>.rec"</span> -mv trace.rec <span class="hljs-variable">$SCALFMM_PAR_REC</span> - -<span class="hljs-built_in">export</span> STARPU_NCPUS=<span 
class="hljs-variable">$SCALFMM_MAX_NB_CPU</span> -<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-variable">$SCALFMM_MAX_NB_GPU</span> -./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs <span class="hljs-variable">$SCALFMM_BS_CPU_GPU</span> -<span class="hljs-built_in">export</span> SCALFMM_PAR_CPU_GPU_REC=<span class="hljs-string">"trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_CPU_SEQ</span>-CPU_<span class="hljs-variable">$STARPU_NCPUS</span>-GPU_<span class="hljs-variable">$STARPU_NCUDA</span>.rec"</span> -mv trace.rec <span class="hljs-variable">$SCALFMM_PAR_CPU_GPU_REC</span> -</code></pre><p>And we also want the GPU tasks only on GPU</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU -./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU -p2p-m2l-cuda-only -export SCALFMM_PAR_GPU_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA-GPUONLY.rec" -mv trace.rec $SCALFMM_PAR_GPU_REC -</code></pre>"><span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-variable">$SCALFMM_MAX_NB_CPU</span> -<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-variable">$SCALFMM_MAX_NB_GPU</span> -./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs <span class="hljs-variable">$SCALFMM_BS_CPU_GPU</span> -p2p-m2l-cuda-only -<span class="hljs-built_in">export</span> SCALFMM_PAR_GPU_REC=<span class="hljs-string">"trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span 
class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_CPU_SEQ</span>-CPU_<span class="hljs-variable">$STARPU_NCPUS</span>-GPU_<span class="hljs-variable">$STARPU_NCUDA</span>-GPUONLY.rec"</span> -mv trace.rec <span class="hljs-variable">$SCALFMM_PAR_GPU_REC</span> -</code></pre><p>And we want the sequential version with parallel granularity:</p><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">export STARPU_NCPUS=1 -export STARPU_NCUDA=0 - -./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR -SCALFMM_SEQ_CPU_BS_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" -mv trace.rec $SCALFMM_SEQ_CPU_BS_REC - -./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU -SCALFMM_SEQ_GPU_BS_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" -mv trace.rec $SCALFMM_SEQ_GPU_BS_REC -</code></pre>"><span class="hljs-built_in">export</span> STARPU_NCPUS=<span class="hljs-number">1</span> -<span class="hljs-built_in">export</span> STARPU_NCUDA=<span class="hljs-number">0</span> - -./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -bs <span class="hljs-variable">$SCALFMM_BS_CPU_PAR</span> -SCALFMM_SEQ_CPU_BS_REC=<span class="hljs-string">"trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_CPU_SEQ</span>-CPU_<span class="hljs-variable">$STARPU_NCPUS</span>-GPU_<span class="hljs-variable">$STARPU_NCUDA</span>.rec"</span> -mv trace.rec <span class="hljs-variable">$SCALFMM_SEQ_CPU_BS_REC</span> - -./Tests/Release/<span class="hljs-built_in">test</span>BlockedUnifCudaBench -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span 
class="hljs-variable">$SCALFMM_H</span> -bs <span class="hljs-variable">$SCALFMM_BS_CPU_GPU</span> -SCALFMM_SEQ_GPU_BS_REC=<span class="hljs-string">"trace-nb_<span class="hljs-variable">$SCALFMM_NB</span>-h_<span class="hljs-variable">$SCALFMM_H</span>-bs_<span class="hljs-variable">$SCALFMM_CPU_SEQ</span>-CPU_<span class="hljs-variable">$STARPU_NCPUS</span>-GPU_<span class="hljs-variable">$STARPU_NCUDA</span>.rec"</span> -mv trace.rec <span class="hljs-variable">$SCALFMM_SEQ_GPU_BS_REC</span> -</code></pre><p>From these files, we are able to get the different efficencies.</p><h2 id="post-processing-and-plot"><a name="post-processing-and-plot" href="#post-processing-and-plot"></a>Post-processing and Plot</h2><p>From the file:</p><ul> -<li><code>$SCALFMM_SEQ_REC</code> : the resulting file from the sequential execution with best sequential granularity</li><li><code>$SCALFMM_PAR_REC</code> : the resulting file from a parallel execution (no GPU) with best parallel granularity</li><li><code>$SCALFMM_PAR_CPU_GPU_REC</code> : the resulting file from a parallel execution (hybrid) with best parallel-hybrid granularity</li><li><code>$SCALFMM_PAR_GPU_REC</code> : the resulting file with all possible tasks on GPU with best parallel-hybrid granularity</li><li><code>$SCALFMM_SEQ_CPU_BS_REC</code> : the resulting file from sequential execution with best parallel granularity</li><li><code>$SCALFMM_SEQ_GPU_BS_REC</code> : the resulting file from sequential execution with best parallel-hybrid granularity</li></ul><p>Getting all the efficency<br>Solving the linear programming problem</p><p>Plotting the results</p><h2 id="automatization"><a name="automatization" href="#automatization"></a>Automatization</h2><pre class="bash hljs"><code class="bash" data-origin="<pre><code class="bash">SCALFMM_NB=10000000 -SCALFMM_H=7 -SCALFMM_MIN_BS=100 -SCALFMM_MAX_BS=3000 -SCALFMM_MAX_NB_CPU=24 -SCALFMM_MAX_NB_GPU=4 - -scalfmm_generate_efficiency -nb $SCALFMM_NB -h $SCALFMM_H -start 
$SCALFMM_MIN_BS -end $SCALFMM_MAX_BS -</code></pre>">SCALFMM_NB=<span class="hljs-number">10000000</span> -SCALFMM_H=<span class="hljs-number">7</span> -SCALFMM_MIN_BS=<span class="hljs-number">100</span> -SCALFMM_MAX_BS=<span class="hljs-number">3000</span> -SCALFMM_MAX_NB_CPU=<span class="hljs-number">24</span> -SCALFMM_MAX_NB_GPU=<span class="hljs-number">4</span> - -scalfmm_generate_efficiency -nb <span class="hljs-variable">$SCALFMM_NB</span> -h <span class="hljs-variable">$SCALFMM_H</span> -start <span class="hljs-variable">$SCALFMM_MIN_BS</span> -end <span class="hljs-variable">$SCALFMM_MAX_BS</span> -</code></pre> - -<footer style="position:fixed; font-size:.8em; text-align:right; bottom:0px; margin-left:-25px; height:20px; width:100%;">generated by <a href="http://pad.haroopress.com" target="_blank">haroopad</a></footer> -</body> -</html> diff --git a/Addons/BenchEfficiency/scalfmm.md b/Addons/BenchEfficiency/scalfmm.md deleted file mode 100644 index 0cf00f00e179fba2cfd4d579d4aa97712394876d..0000000000000000000000000000000000000000 --- a/Addons/BenchEfficiency/scalfmm.md +++ /dev/null @@ -1,585 +0,0 @@ -ScalFMM with StarPU+CUDA -======================== - -In this tutorial, we provide the commands to install ScalFMM and the needed tools in order to compute parallel efficiencies. -We first show how to obtain the homogeneous efficencies and then the heterogeneous ones (not done yet). - -## Installing the libraries - -For some installation steps, we provide a "valid-if" test which shows if the previous command has been done correctly or not. -In case of success `STEP-OK` will be print-out. -In addition, if a library is already installed on the system, it is possible to set the output variables directly and test with the "valid-if" command if it will work. - -It is possible to follow these steps only to compile ScalFMM above StarPU and so we marked the installation of execution-trace tools as __Optional__. 
-However, we highly recommend installing them and following all the steps, since they make it possible to compute the efficiencies. -But if one wants to execute without any overhead, the usage of FXT may need to be removed. - -### Prerequisites: -In order to follow this tutorial, the following applications must be installed: - -* autoconf (>= 2.69) -* gawk (Awk >= 4.0.1) -* make (>= 3.81) -* cmake (>= 3.2.2) -* gcc/g++ (>= 4.9) and the gcc/g++ names should point to the correct binaries -* BLAS/LAPACK (The configuration of ScalFMM is different if the MKL is used or not, but with the MKL it is recommended to set the environment variable `MKLROOT`) -* CUDA (>= 7) and `CUDA_PATH` must be set. In our case, `CUDA_PATH=/usr/local/cuda-7.5/` -* __Optional__ Vite (from `sudo apt-get install vite` or see [http://vite.gforge.inria.fr/download.php](http://vite.gforge.inria.fr/download.php)) -* __Optional__ Qt5 library to be able to change the colors of the execution traces in order to visualize the different FMM operators -* gnuplot to generate the figures - -> [Remark] Some installations of CUDA do not have the libcuda file. -> In this case, one needs to create a link : `sudo ln /usr/local/cuda-7.5/lib64/libcudart.so /usr/local/cuda-7.5/lib64/libcuda.so` - -> [Plafrim-Developers] -> -> For those who use this tutorial on Plafrim (or a similar cluster), we provide extra information. -> -> To allocate a heterogeneous node : `salloc -N 1 --time=03:00:00 --exclusive -p court_sirocco -CHaswell --gres=gpu:4 -x sirocco06` -> -> Then, find it using `squeue` and access it by `ssh`.
-> -> We have run this tutorial with the modules : `module load compiler/gcc/4.9.2 cuda75/toolkit/7.5.18 intel/mkl/64/11.2/2016.0.0 build/cmake/3.2.1` - -### Working directory - -The variable `SCALFMM_TEST_DIR` is used to specify the working directory where all the tools are going to be installed: -```bash -export SCALFMM_TEST_DIR=~/scalfmm_test -cd $SCALFMM_TEST_DIR -``` - -In order to be able to stop the tutorial in the middle and restart later, we will register the variables in a file that should be source to restart later: -```bash -# function scalfmmRegisterVariable() { echo "export $1=${!1}" >> "$SCALFMM_TEST_DIR/environment.source"; } -echo "function scalfmmRegisterVariable() { echo \"export \$1=\${!1}\" >> \"$SCALFMM_TEST_DIR/environment.source\"; }" > "$SCALFMM_TEST_DIR/environment.source" -source "$SCALFMM_TEST_DIR/environment.source" -``` - -*Output variables:* `scalfmmRegisterVariable SCALFMM_TEST_DIR` - -Valid-if -```bash -if [[ -n $SCALFMM_TEST_DIR ]] && [[ -d $SCALFMM_TEST_DIR ]] ; then - echo “STEP-OK” -fi -``` - -- Restarting the tutorial - -To restart the tutorial, one needs to re-define the working directory and to source the save file before to resume: -```bash -export SCALFMM_TEST_DIR=~/scalfmm_test -if [[ ! 
-d $SCALFMM_TEST_DIR ]] ; then - mkdir $SCALFMM_TEST_DIR -else - source "$SCALFMM_TEST_DIR/environment.source" -fi -cd $SCALFMM_TEST_DIR -``` - -### Downloading the Packages (in Advance) - -If the computational node does not have access to internet, we provide a command to download the needed packages (otherwise the next commands still include just in time download): -```bash -cd $SCALFMM_TEST_DIR -wget https://www.open-mpi.org/software/hwloc/v1.11/downloads/hwloc-1.11.2.tar.gz -wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.11.tar.gz # Optional -wget http://www.fftw.org/fftw-3.3.4.tar.gz -svn export svn://scm.gforge.inria.fr/svnroot/starpu/trunk starpu -git clone --depth=1 https://scm.gforge.inria.fr/anonscm/git/scalfmm-public/scalfmm-public.git -``` - -### HWLOC -```bash -cd $SCALFMM_TEST_DIR -if [[ ! -f hwloc-1.11.2.tar.gz ]] ; then - wget https://www.open-mpi.org/software/hwloc/v1.11/downloads/hwloc-1.11.2.tar.gz -fi -tar xvf hwloc-1.11.2.tar.gz -cd hwloc-1.11.2/ -export SCALFMM_HWLOC_DIR=$SCALFMM_TEST_DIR/hwlocinstall -./configure --prefix=$SCALFMM_HWLOC_DIR -make install -``` - -*Output variables:* `scalfmmRegisterVariable SCALFMM_HWLOC_DIR` - -Valid-if: -```bash -if [[ -n $SCALFMM_HWLOC_DIR ]] && [[ -d $SCALFMM_HWLOC_DIR/lib/ ]] && [[ -f $SCALFMM_HWLOC_DIR/lib/libhwloc.so ]]; then - echo "STEP-OK" -fi -``` - -### FXT (__Optional__) -```bash -cd $SCALFMM_TEST_DIR -if [[ ! 
-f fxt-0.2.11.tar.gz ]] ; then - wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.11.tar.gz -fi -tar xvf fxt-0.2.11.tar.gz -cd fxt-0.2.11/ -export SCALFMM_FXT_DIR=$SCALFMM_TEST_DIR/fxtinstall -./configure --prefix=$SCALFMM_FXT_DIR -make install -``` - -*Output variables:* `scalfmmRegisterVariable SCALFMM_FXT_DIR` - -Valid-if: -```bash -if [[ -n $SCALFMM_FXT_DIR ]] && [[ -d $SCALFMM_FXT_DIR/lib/ ]] && [[ -f $SCALFMM_FXT_DIR/lib/libfxt.so ]]; then - echo "STEP-OK" -fi -``` - -### FFTW (If No MKL-FFT) -For those who do not use MKL FFT interface, they have to install FFTW (float/double): -```bash -cd $SCALFMM_TEST_DIR -if [[ ! -f fftw-3.3.4.tar.gz ]] ; then - wget http://www.fftw.org/fftw-3.3.4.tar.gz -fi -tar xvf fftw-3.3.4.tar.gz -cd fftw-3.3.4/ -export SCALFMM_FFTW_DIR=$SCALFMM_TEST_DIR/fftinstall -./configure --prefix=$SCALFMM_FFTW_DIR -make install -./configure --prefix=$SCALFMM_FFTW_DIR --enable-float -make install -``` - -*Output variables:* `scalfmmRegisterVariable SCALFMM_FFTW_DIR` - -Valid-if: -```bash -if [[ -n $SCALFMM_FFTW_DIR ]] && [[ -d $SCALFMM_FFTW_DIR/lib/ ]] && [[ -f $SCALFMM_FFTW_DIR/lib/libfftw3.a ]] && [[ -f $SCALFMM_FFTW_DIR/lib/libfftw3f.a ]]; then - echo "STEP-OK" -fi -``` - -### StarPU -```bash -cd $SCALFMM_TEST_DIR -if [[ ! 
-d starpu ]] ; then - svn export svn://scm.gforge.inria.fr/svnroot/starpu/trunk starpu -fi -cd starpu/ -export SCALFMM_STARPU_DIR=$SCALFMM_TEST_DIR/starpuinstall -./autogen.sh -./configure --prefix=$SCALFMM_STARPU_DIR --with-fxt=$SCALFMM_FXT_DIR --with-hwloc=$SCALFMM_HWLOC_DIR --with-cuda-dir=$CUDA_PATH --disable-opencl -make install -``` -> __Optional__ In case you do not want to use trace (FXT) please remove the `--with-fxt=$SCALFMM_FXT_DIR` parameter from the command - -*Output variables:* `scalfmmRegisterVariable SCALFMM_STARPU_DIR` - -Valid-if: -```bash -if [[ -n $SCALFMM_STARPU_DIR ]] && [[ -d $SCALFMM_STARPU_DIR/lib/ ]] && [[ -f $SCALFMM_STARPU_DIR/lib/libstarpu.so ]] ; then - echo "STEP-OK" -fi -``` - -### ScalFMM - -#### Configure -+ Getting the source from the last commit: -```bash -cd $SCALFMM_TEST_DIR -if [[ ! -d scalfmm-public ]] ; then - git clone --depth=1 https://scm.gforge.inria.fr/anonscm/git/scalfmm-public/scalfmm-public.git -fi -cd scalfmm-public/ -export SCALFMM_SOURCE_DIR=`pwd` -cd Build/ -export SCALFMM_BUILD_DIR=`pwd` -``` - -*Output variables:* `scalfmmRegisterVariable SCALFMM_BUILD_DIR` `scalfmmRegisterVariable SCALFMM_SOURCE_DIR` - -+ Configure (No MKL): -```bash -cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \ - -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \ - -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=OFF \ - -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \ - -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \ - -DHWLOC_DIR=$SCALFMM_HWLOC_DIR -DSTARPU_DIR=$SCALFMM_STARPU_DIR \ - -DSCALFMM_USE_FFT=ON -DFFT_DIR=$SCALFMM_FFT_DIR -``` -+ Configure (MKL BLAS/LAPACK and FFTW): -```bash -cmake .. 
-DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \ - -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \ - -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=ON \ - -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \ - -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \ - -DHWLOC_DIR=$SCALFMM_HWLOC_DIR -DSTARPU_DIR=$SCALFMM_STARPU_DIR \ - -DSCALFMM_USE_FFT=ON -DFFT_DIR=$SCALFMM_FFT_DIR -``` -+ Configure (MKL BLAS/LAPACK/FFT and No FFTW): - -> [Plafrim-Developers] Should use that one - -```bash -cmake .. -DSCALFMM_BUILD_DEBUG=OFF -DSCALFMM_USE_MPI=OFF \ - -DSCALFMM_BUILD_TESTS=ON -DSCALFMM_BUILD_UTESTS=OFF \ - -DSCALFMM_USE_BLAS=ON -DSCALFMM_USE_MKL_AS_BLAS=ON \ - -DSCALFMM_USE_LOG=ON -DSCALFMM_USE_STARPU=ON \ - -DSCALFMM_USE_CUDA=ON -DSCALFMM_USE_OPENCL=OFF \ - -DHWLOC_DIR=$SCALFMM_HWLOC_DIR -DSTARPU_DIR=$SCALFMM_STARPU_DIR \ - -DSCALFMM_USE_FFT=ON -DSCALFMM_USE_MKL_AS_FFTW=ON -``` - -Valid-if: -```bash -cmake .. ; if [[ "$?" == "0" ]] ; then echo "STEP-OK" ; fi -``` - -#### Build - -```bash -cd $SCALFMM_BUILD_DIR -make testBlockedUnifCudaBench -``` - -Valid-if: -```bash -ls ./Tests/Release/testBlockedUnifCudaBench ; if [[ "$?" == "0" ]] ; then echo "STEP-OK" ; fi -``` - -#### First Execution - -In this section we compute a simulation and look at the resulting trace. -ScalFMM binary parameters and descriptions: - -* Passing `--help` as parameter provide the possible/valid parameters -* Simulation properties are choosen by : - * `-h` : height of the tree - * `-bs` : granularity/size of the group - * `-nb` : number of particles generated -* Execution properties are choosen by the StarPU environment variables : - * `STARPU_NCPUS` : the number of CPU workers - * `STARPU_NCUDA` : the number of GPU workers (for heterogeneous binary) -* By default the application will not compare the FMM interactions against the direct method (which is N^2) and so it is recommended to avoid the validation for large test cases. 
But to get the accuracy one must pass the parameter `-validation` -* `-p2p-m2l-cuda-only` : to compute the P2P and the M2L only on GPU (the rest on the CPU) - -Examples: - -```bash -export STARPU_NCPUS=12 -export STARPU_NCUDA=2 -./Tests/Release/testBlockedUnifCudaBench -nb 30000000 -h 7 -bs 800 -``` - -Last part of the output should be: -```bash - Start FGroupTaskStarPUAlgorithm - directPass in 0.0406482s - inblock in 0.000780428s - outblock in 0.0398674s - bottomPass in 0.00586269s - upwardPass in 0.00265723s - transferPass in 0.00323571s - inblock in 0.000124817s - outblock in 0.00298331s - downardPass in 0.00257975s - transferPass in 0.0652285s - inblock in 0.00164774s - outblock in 0.0635799s - L2P in 0.0115733s - Submitting the tasks took 0.139101s - Moving data to the host took 0.0578765s -@EXEC TIME = 14.6321s -``` - -+ Visualize the execution trace (__Optional__) - -Convert the fxt file -```bash -$SCALFMM_STARPU_DIR/bin/starpu_fxt_tool -i "/tmp/prof_file_"$USER"_0" -``` -Then visualize the output with `vite` (maybe by copying the paje.trace file locally) -```bash -vite ./paje.trace -``` - -Should be like: - - -We can convert the color of the trace by (requiere Qt5 library): - -```bash -$SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/pajecolor paje.trace $SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/paintmodel.fmm.colors -vite ./paje.trace.painted -``` - -Should be like: - - -+ Get execution times - -```bash -python $SCALFMM_STARPU_DIR/bin/starpu_trace_state_stats.py -t trace.rec -``` - -Should give something like: -``` -"Name","Count","Type","Duration" -"Initializing",14,"Runtime",7153.096196 -"Overhead",57010,"Runtime",376.473463 -"Idle",14355,"Other",12.815899 -"Scheduling",28441,"Runtime",238.367394 -"Sleeping",610,"Other",13786.513208 -"FetchingInput",14341,"Runtime",13918.805814 -"execute_on_all_wrapper",30,"Task",21.288802 -"Executing",414,"Runtime",26852.864578 -"PushingOutput",14341,"Runtime",284.96123 -"P2P-out",3846,"Task",60378.266619 
-"Callback",13559,"Runtime",4.210633 -"P2P",328,"Task",15383.426991 -"M2L-level-5",41,"Task",2354.702554 -"M2L-level-6",328,"Task",18349.915495 -"Deinitializing",14,"Runtime",109.87483 -"M2L-level-4",6,"Task",275.088295 -"P2M",328,"Task",11312.022842 -"M2M-level-5",328,"Task",829.9055 -"M2M-level-4",41,"Task",93.130498 -"M2L-out-level-5",638,"Task",1914.900053 -"M2M-level-3",6,"Task",11.053067 -"M2M-level-2",1,"Task",1.363157 -"M2L-out-level-4",22,"Task",159.580457 -"L2L-level-4",41,"Task",84.554065 -"L2L-level-5",328,"Task",1087.717767 -"M2L-out-level-6",7692,"Task",18322.518045 -"L2P",328,"Task",27146.256793 -"M2L-level-2",1,"Task",2.661235 -"L2L-level-3",6,"Task",11.346978 -"M2L-level-3",1,"Task",47.612555 -"L2L-level-2",1,"Task",1.471873 -``` - -Most of the script are in the addon directories -```bash -export SCALFMM_AB=$SCALFMM_SOURCE_DIR/Addons/BenchEfficiency/ -``` - -*Output variable:* `scalfmmRegisterVariable SCALFMM_AB` - -## Homogeneous Efficiencies - -Here we compute the efficiencies for a given test case on CPU only. 
- -Go in the build dir and create output dir -```bash -cd $SCALFMM_BUILD_DIR -export SCALFMM_RES_DIR=$SCALFMM_BUILD_DIR/homogeneous -mkdir $SCALFMM_RES_DIR -``` -*Output variable:* `scalfmmRegisterVariable SCALFMM_RES_DIR` - -Set up the configuration variables: -```bash -export SCALFMM_NB=10000000 -export SCALFMM_H=7 -export SCALFMM_MIN_BS=100 -export SCALFMM_MAX_BS=10000 -export SCALFMM_MAX_NB_CPU=24 -``` - -Find best granularity in sequential and in parallel: -```bash -export STARPU_NCPUS=1 -export STARPU_NCUDA=0 -export SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh "./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs" $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmmExtractKey.sh "@BEST BS" ` -if [[ `which gnuplot | wc -l` == "1" ]] ; then - gnuplot -e "filename='seq-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot -fi - -export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -export STARPU_NCUDA=0 -export SCALFMM_BS_CPU_PAR=`$SCALFMM_AB/scalfmmFindBs.sh "./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs" $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` -if [[ `which gnuplot | wc -l` == "1" ]] ; then - gnuplot -e "filename='par-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot -fi -``` -In our case we get 9710 and 5385. 
- -*Output variable:* `scalfmmRegisterVariable SCALFMM_BS_CPU_SEQ` `scalfmmRegisterVariable SCALFMM_BS_CPU_PAR` - -We can look to the work that has been done to find the best granularity: - - - - -Then we compute the efficiency using both granulirities and keep the .rec files: -```bash -export SCALFMM_MAX_NB_CPU=24 -export STARPU_NCUDA=0 -source "$SCALFMM_AB/execAllHomogeneous.sh" -``` - -We should end with all the .rec files and their corresponding time files and `ls "$SCALFMM_RES_DIR"` should return something like: -```bash -trace-nb_10000000-h_7-bs_5385-CPU_10.rec trace-nb_10000000-h_7-bs_5385-CPU_16.rec.time trace-nb_10000000-h_7-bs_5385-CPU_22.rec trace-nb_10000000-h_7-bs_5385-CPU_5.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_10.rec.time trace-nb_10000000-h_7-bs_5385-CPU_17.rec trace-nb_10000000-h_7-bs_5385-CPU_22.rec.time trace-nb_10000000-h_7-bs_5385-CPU_6.rec -trace-nb_10000000-h_7-bs_5385-CPU_11.rec trace-nb_10000000-h_7-bs_5385-CPU_17.rec.time trace-nb_10000000-h_7-bs_5385-CPU_23.rec trace-nb_10000000-h_7-bs_5385-CPU_6.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_11.rec.time trace-nb_10000000-h_7-bs_5385-CPU_18.rec trace-nb_10000000-h_7-bs_5385-CPU_23.rec.time trace-nb_10000000-h_7-bs_5385-CPU_7.rec -trace-nb_10000000-h_7-bs_5385-CPU_12.rec trace-nb_10000000-h_7-bs_5385-CPU_18.rec.time trace-nb_10000000-h_7-bs_5385-CPU_24.rec trace-nb_10000000-h_7-bs_5385-CPU_7.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_12.rec.time trace-nb_10000000-h_7-bs_5385-CPU_19.rec trace-nb_10000000-h_7-bs_5385-CPU_24.rec.time trace-nb_10000000-h_7-bs_5385-CPU_8.rec -trace-nb_10000000-h_7-bs_5385-CPU_13.rec trace-nb_10000000-h_7-bs_5385-CPU_19.rec.time trace-nb_10000000-h_7-bs_5385-CPU_2.rec trace-nb_10000000-h_7-bs_5385-CPU_8.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_13.rec.time trace-nb_10000000-h_7-bs_5385-CPU_1.rec trace-nb_10000000-h_7-bs_5385-CPU_2.rec.time trace-nb_10000000-h_7-bs_5385-CPU_9.rec -trace-nb_10000000-h_7-bs_5385-CPU_14.rec 
trace-nb_10000000-h_7-bs_5385-CPU_1.rec.time trace-nb_10000000-h_7-bs_5385-CPU_3.rec trace-nb_10000000-h_7-bs_5385-CPU_9.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_14.rec.time trace-nb_10000000-h_7-bs_5385-CPU_20.rec trace-nb_10000000-h_7-bs_5385-CPU_3.rec.time trace-nb_10000000-h_7-bs_9710-CPU_1.rec -trace-nb_10000000-h_7-bs_5385-CPU_15.rec trace-nb_10000000-h_7-bs_5385-CPU_20.rec.time trace-nb_10000000-h_7-bs_5385-CPU_4.rec trace-nb_10000000-h_7-bs_9710-CPU_1.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_15.rec.time trace-nb_10000000-h_7-bs_5385-CPU_21.rec trace-nb_10000000-h_7-bs_5385-CPU_4.rec.time -trace-nb_10000000-h_7-bs_5385-CPU_16.rec trace-nb_10000000-h_7-bs_5385-CPU_21.rec.time trace-nb_10000000-h_7-bs_5385-CPU_5.rec -``` - -We then compute the efficiencies from these files -```bash -g++ -std=c++11 $SCALFMM_AB/mergetimefile.cpp -o $SCALFMM_AB/mergetimefile.exe -$SCALFMM_AB/mergetimefile.exe \ - "$SCALFMM_RES_DIR/trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_BS_CPU_SEQ-CPU_1.rec.time" \ - "$SCALFMM_RES_DIR/trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_BS_CPU_PAR-CPU_%d.rec.time"\ - $SCALFMM_MAX_NB_CPU -``` - -We end-up with the global efficiencies (for the application) but also for the different operators. -```bash -Create global-eff.data -Create task-eff.data -Create task-gr-eff.dat -``` - -We can plot each of them -```bash -gnuplot -e "filename='global-eff'" $SCALFMM_AB/scalfmmPlotAll.gplot -gnuplot -e "filename='task-eff'" $SCALFMM_AB/scalfmmPlotAll.gplot -gnuplot -e "filename='task-gr-eff'" $SCALFMM_AB/scalfmmPlotAll.gplot -``` - -In our case it gives: - - - - - -## Heterogeneous - -__NOT FINISHED!!!!__ - -For test case `-nb 10000000` (10 million) and `-h 6` (height of the tree equal to 6), -we first want to know the best granularity `-bs`. - -This parameter will certainly not be the same for sequential/parallel/heterogenous configurations. 
- -```bash -export SCALFMM_NB=10000000 -export SCALFMM_H=7 -export SCALFMM_MIN_BS=100 -export SCALFMM_MAX_BS=3000 -export SCALFMM_MAX_NB_CPU=24 -export SCALFMM_MAX_NB_GPU=4 -``` - -```bash -export STARPU_NCPUS=1 -export STARPU_NCUDA=0 -export SCALFMM_BS_CPU_SEQ=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` -if [[ `which gnuplot | wc -l` == "1" ]] ; then - gnuplot -e "filename='seq-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot -fi - -export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -export STARPU_NCUDA=0 -export SCALFMM_BS_CPU_PAR=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` -if [[ `which gnuplot | wc -l` == "1" ]] ; then - gnuplot -e "filename='par-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot -fi - -export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU -export SCALFMM_BS_CPU_GPU=`$SCALFMM_AB/scalfmmFindBs.sh -nb $SCALFMM_NB -h $SCALFMM_H $SCALFMM_MIN_BS $SCALFMM_MAX_BS | $SCALFMM_AB/scalfmm_extract_key "@BEST BS" ` -if [[ `which gnuplot | wc -l` == "1" ]] ; then - gnuplot -e "filename='cpugpu-bs-search'" $SCALFMM_AB/scalfmmFindBs.gplot -fi -``` - -Then, we can execute three best configurations, and keep .rec for each of them: -```bash -export STARPU_NCPUS=1 -export STARPU_NCUDA=0 -./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_CPU_SEQ -export SCALFMM_SEQ_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" -mv trace.rec $SCALFMM_SEQ_REC - -export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -export STARPU_NCUDA=0 -./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR -export SCALFMM_PAR_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" -mv trace.rec $SCALFMM_PAR_REC - -export 
STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU -./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU -export SCALFMM_PAR_CPU_GPU_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" -mv trace.rec $SCALFMM_PAR_CPU_GPU_REC -``` - -And we also want the GPU tasks only on GPU -```bash -export STARPU_NCPUS=$SCALFMM_MAX_NB_CPU -export STARPU_NCUDA=$SCALFMM_MAX_NB_GPU -./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU -p2p-m2l-cuda-only -export SCALFMM_PAR_GPU_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA-GPUONLY.rec" -mv trace.rec $SCALFMM_PAR_GPU_REC -``` - -And we want the sequential version with parallel granularity: -```bash -export STARPU_NCPUS=1 -export STARPU_NCUDA=0 - -./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_PAR -SCALFMM_SEQ_CPU_BS_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" -mv trace.rec $SCALFMM_SEQ_CPU_BS_REC - -./Tests/Release/testBlockedUnifCudaBench -nb $SCALFMM_NB -h $SCALFMM_H -bs $SCALFMM_BS_CPU_GPU -SCALFMM_SEQ_GPU_BS_REC="trace-nb_$SCALFMM_NB-h_$SCALFMM_H-bs_$SCALFMM_CPU_SEQ-CPU_$STARPU_NCPUS-GPU_$STARPU_NCUDA.rec" -mv trace.rec $SCALFMM_SEQ_GPU_BS_REC -``` - -From these files, we are able to get the different efficencies. 
- -## Post-processing and Plot - -From the file: - -+ `$SCALFMM_SEQ_REC` : the resulting file from the sequential execution with best sequential granularity -+ `$SCALFMM_PAR_REC` : the resulting file from a parallel execution (no GPU) with best parallel granularity -+ `$SCALFMM_PAR_CPU_GPU_REC` : the resulting file from a parallel execution (hybrid) with best parallel-hybrid granularity -+ `$SCALFMM_PAR_GPU_REC` : the resulting file with all possible tasks on GPU with best parallel-hybrid granularity -+ `$SCALFMM_SEQ_CPU_BS_REC` : the resulting file from sequential execution with best parallel granularity -+ `$SCALFMM_SEQ_GPU_BS_REC` : the resulting file from sequential execution with best parallel-hybrid granularity - -Getting all the efficency -Solving the linear programming problem - -Plotting the results - - -## Automatization - -```bash -SCALFMM_NB=10000000 -SCALFMM_H=7 -SCALFMM_MIN_BS=100 -SCALFMM_MAX_BS=3000 -SCALFMM_MAX_NB_CPU=24 -SCALFMM_MAX_NB_GPU=4 - -scalfmm_generate_efficiency -nb $SCALFMM_NB -h $SCALFMM_H -start $SCALFMM_MIN_BS -end $SCALFMM_MAX_BS -``` \ No newline at end of file diff --git a/Addons/BenchEfficiency/scalfmmExtractKey.sh b/Addons/BenchEfficiency/scalfmmExtractKey.sh index a1909bcb4db4a77e7238db47dba78721c8045043..dbdc7f0e20309561a11cf009161fc305bbb18765 100644 --- a/Addons/BenchEfficiency/scalfmmExtractKey.sh +++ b/Addons/BenchEfficiency/scalfmmExtractKey.sh @@ -6,5 +6,5 @@ if [[ $# -ne 1 ]] ; then fi input=$(cat) -res=`echo "$input" | grep "$3" | cut -d'=' -f2 | cut -d's' -f1` +res=`echo "$input" | grep "$1" | cut -d'=' -f2 | cut -d' ' -f2` echo $res diff --git a/Addons/BenchEfficiency/seq-bs-search.png b/Addons/BenchEfficiency/seq-bs-search.png deleted file mode 100644 index 3bc1ece1cf46f09a17f2b95f59040589fdd91bd9..0000000000000000000000000000000000000000 Binary files a/Addons/BenchEfficiency/seq-bs-search.png and /dev/null differ diff --git a/Addons/BenchEfficiency/task-eff.data b/Addons/BenchEfficiency/task-eff.data deleted file 
mode 100644 index d921d6a94df131fea43c5d9e12cdb61f290322a2..0000000000000000000000000000000000000000 --- a/Addons/BenchEfficiency/task-eff.data +++ /dev/null @@ -1,25 +0,0 @@ -0 L2L M2M P2M L2P M2L-out M2L P2P-out P2P -1 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 -2 9.565659e-01 9.665736e-01 1.031103e+00 1.004286e+00 9.715094e-01 9.208541e-01 9.697996e-01 9.763831e-01 -3 9.263068e-01 1.024516e+00 1.029574e+00 9.889095e-01 9.937418e-01 9.954310e-01 1.001689e+00 1.000994e+00 -4 1.005226e+00 1.006333e+00 1.033745e+00 1.010624e+00 9.534195e-01 9.864280e-01 9.895790e-01 9.995851e-01 -5 8.615300e-01 9.844517e-01 9.938413e-01 1.009990e+00 9.569465e-01 9.791331e-01 9.887700e-01 9.975625e-01 -6 8.535893e-01 9.410083e-01 1.014109e+00 1.018876e+00 9.739749e-01 9.860534e-01 9.782539e-01 9.964238e-01 -7 1.046813e+00 9.975072e-01 1.037954e+00 1.003486e+00 9.786087e-01 9.933857e-01 1.004895e+00 9.965736e-01 -8 9.995985e-01 1.013025e+00 9.895591e-01 1.013030e+00 9.652670e-01 9.907845e-01 1.000561e+00 9.971405e-01 -9 1.039365e+00 1.013929e+00 1.047827e+00 9.852421e-01 9.711139e-01 9.898517e-01 9.980679e-01 9.993222e-01 -10 9.181035e-01 9.952685e-01 1.031850e+00 1.012496e+00 9.670203e-01 9.852214e-01 9.859215e-01 9.985014e-01 -11 8.717502e-01 9.889525e-01 1.028373e+00 1.011922e+00 9.699808e-01 9.888136e-01 9.826419e-01 9.981512e-01 -12 9.452144e-01 1.040015e+00 1.013514e+00 9.762884e-01 9.389195e-01 9.915452e-01 9.996240e-01 9.998256e-01 -13 1.022490e+00 1.021529e+00 1.014210e+00 9.896566e-01 9.668669e-01 9.898209e-01 1.011145e+00 9.991000e-01 -14 9.383201e-01 9.923898e-01 1.030084e+00 1.009296e+00 9.748870e-01 9.858361e-01 1.005721e+00 9.971995e-01 -15 9.387378e-01 9.986737e-01 1.032522e+00 9.967096e-01 9.675984e-01 9.877332e-01 1.003181e+00 9.974178e-01 -16 9.377196e-01 9.853747e-01 1.043778e+00 1.003874e+00 9.786853e-01 9.873092e-01 1.003464e+00 9.958178e-01 -17 9.293735e-01 1.034251e+00 1.038271e+00 1.003177e+00 
9.700248e-01 9.915540e-01 9.899480e-01 9.984129e-01 -18 9.081814e-01 9.992797e-01 1.018655e+00 9.982681e-01 9.627375e-01 9.752319e-01 9.739917e-01 9.297086e-01 -19 9.471672e-01 9.763513e-01 1.026148e+00 1.013503e+00 9.656781e-01 9.868543e-01 9.891711e-01 9.992051e-01 -20 9.376034e-01 1.008523e+00 1.015422e+00 9.988900e-01 9.763451e-01 9.917410e-01 1.016855e+00 9.974959e-01 -21 9.649789e-01 9.941223e-01 1.023371e+00 9.720318e-01 9.427889e-01 9.864717e-01 1.011408e+00 1.001528e+00 -22 8.085859e-01 1.003002e+00 1.024132e+00 1.015483e+00 9.586926e-01 9.888563e-01 9.829068e-01 9.982469e-01 -23 9.843031e-01 1.009513e+00 1.041257e+00 1.012564e+00 1.009160e+00 9.949415e-01 9.970272e-01 9.964763e-01 -24 9.408696e-01 9.847445e-01 1.030481e+00 9.726508e-01 9.691133e-01 9.975819e-01 1.022271e+00 1.000680e+00 diff --git a/Addons/BenchEfficiency/task-eff.png b/Addons/BenchEfficiency/task-eff.png deleted file mode 100644 index d8e15fae86dc95d544ca3ca3f1fe85b624b5b32a..0000000000000000000000000000000000000000 Binary files a/Addons/BenchEfficiency/task-eff.png and /dev/null differ diff --git a/Addons/BenchEfficiency/task-gr-eff.data b/Addons/BenchEfficiency/task-gr-eff.data deleted file mode 100644 index d921d6a94df131fea43c5d9e12cdb61f290322a2..0000000000000000000000000000000000000000 --- a/Addons/BenchEfficiency/task-gr-eff.data +++ /dev/null @@ -1,25 +0,0 @@ -0 L2L M2M P2M L2P M2L-out M2L P2P-out P2P -1 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 -2 9.565659e-01 9.665736e-01 1.031103e+00 1.004286e+00 9.715094e-01 9.208541e-01 9.697996e-01 9.763831e-01 -3 9.263068e-01 1.024516e+00 1.029574e+00 9.889095e-01 9.937418e-01 9.954310e-01 1.001689e+00 1.000994e+00 -4 1.005226e+00 1.006333e+00 1.033745e+00 1.010624e+00 9.534195e-01 9.864280e-01 9.895790e-01 9.995851e-01 -5 8.615300e-01 9.844517e-01 9.938413e-01 1.009990e+00 9.569465e-01 9.791331e-01 9.887700e-01 9.975625e-01 -6 8.535893e-01 9.410083e-01 1.014109e+00 1.018876e+00 
9.739749e-01 9.860534e-01 9.782539e-01 9.964238e-01 -7 1.046813e+00 9.975072e-01 1.037954e+00 1.003486e+00 9.786087e-01 9.933857e-01 1.004895e+00 9.965736e-01 -8 9.995985e-01 1.013025e+00 9.895591e-01 1.013030e+00 9.652670e-01 9.907845e-01 1.000561e+00 9.971405e-01 -9 1.039365e+00 1.013929e+00 1.047827e+00 9.852421e-01 9.711139e-01 9.898517e-01 9.980679e-01 9.993222e-01 -10 9.181035e-01 9.952685e-01 1.031850e+00 1.012496e+00 9.670203e-01 9.852214e-01 9.859215e-01 9.985014e-01 -11 8.717502e-01 9.889525e-01 1.028373e+00 1.011922e+00 9.699808e-01 9.888136e-01 9.826419e-01 9.981512e-01 -12 9.452144e-01 1.040015e+00 1.013514e+00 9.762884e-01 9.389195e-01 9.915452e-01 9.996240e-01 9.998256e-01 -13 1.022490e+00 1.021529e+00 1.014210e+00 9.896566e-01 9.668669e-01 9.898209e-01 1.011145e+00 9.991000e-01 -14 9.383201e-01 9.923898e-01 1.030084e+00 1.009296e+00 9.748870e-01 9.858361e-01 1.005721e+00 9.971995e-01 -15 9.387378e-01 9.986737e-01 1.032522e+00 9.967096e-01 9.675984e-01 9.877332e-01 1.003181e+00 9.974178e-01 -16 9.377196e-01 9.853747e-01 1.043778e+00 1.003874e+00 9.786853e-01 9.873092e-01 1.003464e+00 9.958178e-01 -17 9.293735e-01 1.034251e+00 1.038271e+00 1.003177e+00 9.700248e-01 9.915540e-01 9.899480e-01 9.984129e-01 -18 9.081814e-01 9.992797e-01 1.018655e+00 9.982681e-01 9.627375e-01 9.752319e-01 9.739917e-01 9.297086e-01 -19 9.471672e-01 9.763513e-01 1.026148e+00 1.013503e+00 9.656781e-01 9.868543e-01 9.891711e-01 9.992051e-01 -20 9.376034e-01 1.008523e+00 1.015422e+00 9.988900e-01 9.763451e-01 9.917410e-01 1.016855e+00 9.974959e-01 -21 9.649789e-01 9.941223e-01 1.023371e+00 9.720318e-01 9.427889e-01 9.864717e-01 1.011408e+00 1.001528e+00 -22 8.085859e-01 1.003002e+00 1.024132e+00 1.015483e+00 9.586926e-01 9.888563e-01 9.829068e-01 9.982469e-01 -23 9.843031e-01 1.009513e+00 1.041257e+00 1.012564e+00 1.009160e+00 9.949415e-01 9.970272e-01 9.964763e-01 -24 9.408696e-01 9.847445e-01 1.030481e+00 9.726508e-01 9.691133e-01 9.975819e-01 1.022271e+00 1.000680e+00 diff 
--git a/Addons/BenchEfficiency/task-gr-eff.png b/Addons/BenchEfficiency/task-gr-eff.png deleted file mode 100644 index c748b76826f08ea7f7f09769eba322f0dce0cf3e..0000000000000000000000000000000000000000 Binary files a/Addons/BenchEfficiency/task-gr-eff.png and /dev/null differ diff --git a/Addons/BenchEfficiency/trace-example-colors.png b/Addons/BenchEfficiency/trace-example-colors.png deleted file mode 100644 index dcefa9fb53660927f1509d64f89254ee03e60dec..0000000000000000000000000000000000000000 Binary files a/Addons/BenchEfficiency/trace-example-colors.png and /dev/null differ diff --git a/Addons/BenchEfficiency/trace-example.png b/Addons/BenchEfficiency/trace-example.png deleted file mode 100644 index 5e466b94ed15a4d0905484425a75de9d390f45d7..0000000000000000000000000000000000000000 Binary files a/Addons/BenchEfficiency/trace-example.png and /dev/null differ diff --git a/Addons/CKernelApi/Src/FInterEngine.hpp b/Addons/CKernelApi/Src/FInterEngine.hpp index 499e74e1a6fb4942799b36ccc58fd7574c601d95..6d304220cc07a5a4bc16228e7b65704a3d6e136e 100644 --- a/Addons/CKernelApi/Src/FInterEngine.hpp +++ b/Addons/CKernelApi/Src/FInterEngine.hpp @@ -115,12 +115,12 @@ public: }else{ if(type==SOURCE){ for(FSize idPart = 0; idPart<NbPositions ; ++idPart){ - octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleTypeSource,idPart); + octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleType::FParticleTypeSource,idPart); } FScalFMMEngine<FReal>::nbPart += NbPositions; }else{ for(FSize idPart = 0; idPart<NbPositions ; ++idPart){ - octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleTypeTarget,idPart); + octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleType::FParticleTypeTarget,idPart); } FScalFMMEngine<FReal>::nbPart += NbPositions; } @@ -138,12 +138,12 @@ public: }else{ if(type==SOURCE){ for(FSize idPart = 0; idPart<NbPositions ; ++idPart){ - octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleTypeSource,idPart); + 
octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleType::FParticleTypeSource,idPart); } FScalFMMEngine<FReal>::nbPart += NbPositions; }else{ for(FSize idPart = 0; idPart<NbPositions ; ++idPart){ - octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleTypeTarget,idPart); + octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleType::FParticleTypeTarget,idPart); } FScalFMMEngine<FReal>::nbPart += NbPositions; } diff --git a/Addons/CKernelApi/Src/FUserKernelEngine.hpp b/Addons/CKernelApi/Src/FUserKernelEngine.hpp index 27a0243bdbd2865d1174ec1018720c931bd95959..a4fb90b9164d67205c035fede02f1c4868edf28b 100644 --- a/Addons/CKernelApi/Src/FUserKernelEngine.hpp +++ b/Addons/CKernelApi/Src/FUserKernelEngine.hpp @@ -388,12 +388,12 @@ public: }else{ if(type==SOURCE){ for(FSize idPart = 0; idPart<NbPositions ; ++idPart){ - octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleTypeSource,idPart); + octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleType::FParticleTypeSource,idPart); } FScalFMMEngine<FReal>::nbPart += NbPositions; }else{ for(FSize idPart = 0; idPart<NbPositions ; ++idPart){ - octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleTypeTarget,idPart); + octree->insert(FPoint<FReal>(X[idPart],Y[idPart],Z[idPart]),FParticleType::FParticleTypeTarget,idPart); } FScalFMMEngine<FReal>::nbPart += NbPositions; } @@ -411,12 +411,12 @@ public: }else{ if(type==SOURCE){ for(FSize idPart = 0; idPart<NbPositions ; ++idPart){ - octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleTypeSource,idPart); + octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleType::FParticleTypeSource,idPart); } FScalFMMEngine<FReal>::nbPart += NbPositions; }else{ for(FSize idPart = 0; idPart<NbPositions ; ++idPart){ - octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleTypeTarget,idPart); + octree->insert(FPoint<FReal>(&XYZ[3*idPart]),FParticleType::FParticleTypeTarget,idPart); } 
FScalFMMEngine<FReal>::nbPart += NbPositions; } diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b71fb71acdf6b68151022d9a22248b680088781..ecba3000c7fafc93b3927d9d19135ce5268bb6b9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -321,10 +321,16 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/ ############################################################################## # if( SCALFMM_USE_BLAS ) + # include(FortranCInterface) + # # Define a Fortran interface file (FCMangle.hpp) + # FortranCInterface_HEADER( ${CMAKE_CURRENT_SOURCE_DIR}/Src/FCMangle.hpp + # MACRO_NAMESPACE "PM_" + # SYMBOL_NAMESPACE "PM_" + # SYMBOLS init testPPM:init) message(STATUS "CMAKE_CXX_COMPILER_ID STREQUAL ${CMAKE_CXX_COMPILER_ID}") - + option( SCALFMM_USE_MKL_AS_BLAS "Set to ON to use MKL CBLAS" OFF ) - + if( SCALFMM_USE_MKL_AS_BLAS ) set(BLA_VENDOR "Intel10_64lp_seq") find_package(BLASEXT QUIET) # not REQUIRED @@ -346,7 +352,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/ list(APPEND BLASLAPACK_LIBRARIES "${BLAS_LIBRARIES}") endif() endif() - + if(BLAS_FOUND) set(SCALFMM_LIBRARIES "${SCALFMM_LIBRARIES};${BLASLAPACK_LIBRARIES}") if(BLAS_LIBRARY_DIRS) @@ -357,6 +363,35 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/ # the RPATH to be used when installing list(APPEND CMAKE_INSTALL_RPATH "${LAPACK_LIBRARY_DIRS}") endif() + # check blas and lapack symbols naming + set(CMAKE_REQUIRED_LIBRARIES "${BLAS_LIBRARIES}") + check_function_exists(dgemv_ DGEMV_ADD_) + set (SCALFMM_BLAS_UPCASE OFF) + set (SCALFMM_BLAS_NOCHANGE OFF) + message (STATUS "BLAS dgemv_ " ${DGEMV_ADD_} ${SCALFMM_BLAS_UPCASE}) + if (DGEMV_ADD_) + set (SCALFMM_BLAS_ADD_ ON) + message (STATUS "BLAS dgemv_ symbol found, SCALFMM_BLAS_ADD_ is ON") + else (DGEMV_ADD_) + set (SCALFMM_BLAS_ADD_ OFF) + check_function_exists(DGEMV DGEMV_UPCASE) + if (DGEMV_UPCASE) + set (SCALFMM_BLAS_UPCASE ON) + message (STATUS "BLAS DGEMV symbol 
found, SCALFMM_BLAS_UPCASE is ON") + else (DGEMV_UPCASE) + # set (SCALFMM_BLAS_UPCASE OFF) + check_function_exists(dgemv DGEMV_NOCHANGE) + if (DGEMV_NOCHANGE) + set (SCALFMM_BLAS_NOCHANGE ON) + message (STATUS "BLAS dgemv symbol found, SCALFMM_BLAS_NOCHANGE is ON") + # else (DGEMV_NOCHANGE) + # set (SCALFMM_BLAS_NOCHANGE OFF) + endif (DGEMV_NOCHANGE) + endif (DGEMV_UPCASE) + endif (DGEMV_ADD_) + if ( (NOT DGEMV_ADD_) AND (NOT DGEMV_UPCASE) AND (NOT DGEMV_NOCHANGE) ) + message(FATAL_ERROR "BLAS Fortran mangling cannot be properly detected") + endif () else() message(WARNING "BLAS has not been found, SCALFMM will continue to compile but some applications will be disabled.") message(WARNING "If you have BLAS set BLAS_LIBDIR, BLAS_INCDIR or BLAS_DIR (CMake variables using -D or environment variables).") @@ -612,7 +647,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/ OUTPUT_VARIABLE COMPILE_AVX_OUTPUT) if(${COMPILE_AVX}) message(STATUS "%%%%%%%%%%%% COMPILE_AVX = ${COMPILE_AVX} %%%%< ${AVX_FLAGS}") - + set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} ${AVX_FLAGS}") message(STATUS "%%%%%%%%%%%% SCALFMM_CXX_FLAGS = ${SCALFMM_CXX_FLAGS}") #set( SCALFMM_USE_SSE OFF FORCE) # ne marche pas @@ -725,7 +760,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/ endif(PKG_CONFIG_FOUND) endif(SCALFMM_USE_EZTRACE) - + ################################################################## # # To catch signals @@ -738,7 +773,6 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/ IF( NOT APPLE) SET(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} -rdynamic") ENDIF() - endif() ################################################################## # # @@ -806,6 +840,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/ ################################################################## # Add - doc # ################################################################## + 
message(STATUS "SCALFMM_BUILD_DOC = ${SCALFMM_BUILD_DOC}" ) if(SCALFMM_BUILD_DOC) add_subdirectory(Doc) endif() diff --git a/CMakeModules/morse/find/FindBLAS.cmake b/CMakeModules/morse/find/FindBLAS.cmake index cbf7769442aeb7f72f5cf59b83e106a0ba0fb177..073e2c1134ebf2b26e798996e24d616785d20ef2 100644 --- a/CMakeModules/morse/find/FindBLAS.cmake +++ b/CMakeModules/morse/find/FindBLAS.cmake @@ -279,6 +279,7 @@ macro(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list _thread) find_library(${_prefix}_${_library}_LIBRARY NAMES ${_library} HINTS ${_libdir} + NO_DEFAULT_PATH ) mark_as_advanced(${_prefix}_${_library}_LIBRARY) # Print status if not found @@ -293,6 +294,10 @@ macro(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list _thread) if(_libraries_work) # Test this combination of libraries. + if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND BLA_STATIC) + list(INSERT ${LIBRARIES} 0 "-Wl,--start-group") + list(APPEND ${LIBRARIES} "-Wl,--end-group") + endif() set(CMAKE_REQUIRED_LIBRARIES "${_flags};${${LIBRARIES}};${_thread}") set(CMAKE_REQUIRED_FLAGS "${BLAS_COMPILER_FLAGS}") if (BLAS_VERBOSE) @@ -901,7 +906,7 @@ if (BLA_VENDOR STREQUAL "IBMESSL" OR BLA_VENDOR STREQUAL "All") BLAS sgemm "" - "essl;blas" + "essl" "" ) endif() diff --git a/CMakeModules/morse/find/FindBLASEXT.cmake b/CMakeModules/morse/find/FindBLASEXT.cmake index f13b6c9fc64cd3be0c4bf13eb5d29ba52474e83a..86330f4224b9d706f35760cc03d810cf222b6cba 100644 --- a/CMakeModules/morse/find/FindBLASEXT.cmake +++ b/CMakeModules/morse/find/FindBLASEXT.cmake @@ -259,9 +259,17 @@ endif() # extract libs paths # remark: because it is not given by find_package(BLAS) set(BLAS_LIBRARY_DIRS "") +string(REPLACE " " ";" BLAS_LIBRARIES "${BLAS_LIBRARIES}") foreach(blas_lib ${BLAS_LIBRARIES}) - get_filename_component(a_blas_lib_dir "${blas_lib}" PATH) - list(APPEND BLAS_LIBRARY_DIRS "${a_blas_lib_dir}" ) + string(REPLACE "-L" "" blas_lib "${blas_lib}") + if (EXISTS "${blas_lib}") + list(APPEND BLAS_LIBRARY_DIRS 
"${blas_lib}" ) + else() + get_filename_component(a_blas_lib_dir "${blas_lib}" PATH) + if (EXISTS "${a_blas_lib_dir}") + list(APPEND BLAS_LIBRARY_DIRS "${a_blas_lib_dir}" ) + endif() + endif() endforeach() if (BLAS_LIBRARY_DIRS) list(REMOVE_DUPLICATES BLAS_LIBRARY_DIRS) diff --git a/CMakeModules/morse/find/FindFFTW.cmake b/CMakeModules/morse/find/FindFFTW.cmake index f187b7c80d788a03a9d4df2be2f337884346501a..f259c58feb78c7f1a576456b5e68f90d807d379e 100644 --- a/CMakeModules/morse/find/FindFFTW.cmake +++ b/CMakeModules/morse/find/FindFFTW.cmake @@ -172,13 +172,16 @@ find_package(PkgConfig QUIET) if( PKG_CONFIG_EXECUTABLE AND NOT FFTW_GIVEN_BY_USER ) if(FFTW_LOOK_FOR_FFTW_SIMPLE) - pkg_search_module(FFTW fftw3f) + pkg_search_module(FFTW3F fftw3f) + pkg_search_module(FFTW3 fftw3) elseif(FFTW_LOOK_FOR_FFTW_LONG) - pkg_search_module(FFTW fftw3) + pkg_search_module(FFTW3L fftw3l) + pkg_search_module(FFTW3 fftw3) elseif(FFTW_LOOK_FOR_FFTW_QUAD) - pkg_search_module(FFTW fftw3q) + pkg_search_module(FFTW3Q fftw3q) + pkg_search_module(FFTW3 fftw3) else() - pkg_search_module(FFTW fftw3) + pkg_search_module(FFTW3 fftw3) endif() if (NOT FFTW_FIND_QUIETLY) @@ -198,7 +201,19 @@ if( PKG_CONFIG_EXECUTABLE AND NOT FFTW_GIVEN_BY_USER ) set(FFTW_INCLUDE_DIRS_DEP "${FFTW_INCLUDE_DIRS}") set(FFTW_LIBRARY_DIRS_DEP "${FFTW_LIBRARY_DIRS}") - set(FFTW_LIBRARIES_DEP "${FFTW_LIBRARIES}") + set(FFTW_LIBRARIES_DEP) + if( FFTW3Q_LIBRARIES ) + list(APPEND FFTW_LIBRARIES_DEP "${FFTW3Q_LIBRARIES}") + endif() + if( FFTW3L_LIBRARIES ) + list(APPEND FFTW_LIBRARIES_DEP "${FFTW3L_LIBRARIES}") + endif() + if( FFTW3F_LIBRARIES ) + list(APPEND FFTW_LIBRARIES_DEP "${FFTW3F_LIBRARIES}") + endif() +if( FFTW3_LIBRARIES ) + list(APPEND FFTW_LIBRARIES_DEP "${FFTW3_LIBRARIES}") +endif() set(FFTW_WORKS TRUE) endif( PKG_CONFIG_EXECUTABLE AND NOT FFTW_GIVEN_BY_USER ) @@ -551,7 +566,13 @@ endif() # check that FFTW has been found # ------------------------------- include(FindPackageHandleStandardArgs) +if( (NOT 
PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT FFTW_FOUND) OR (FFTW_GIVEN_BY_USER) ) find_package_handle_standard_args(FFTW DEFAULT_MSG FFTW_LIBRARIES FFTW_INCLUDE_DIRS FFTW_WORKS) +else() +find_package_handle_standard_args(FFTW DEFAULT_MSG + FFTW_LIBRARIES + FFTW_WORKS) +endif() diff --git a/CMakeModules/morse/find/FindLAPACK.cmake b/CMakeModules/morse/find/FindLAPACK.cmake index 81e3869f731bc7c096a17b2d8749ad64a2332a50..668453dab6eb413fe24cf3a1ea90ee4b37dae8fd 100644 --- a/CMakeModules/morse/find/FindLAPACK.cmake +++ b/CMakeModules/morse/find/FindLAPACK.cmake @@ -154,7 +154,7 @@ macro(Check_Lapack_Libraries LIBRARIES _prefix _name _flags _list _blas _threads # N.B. _prefix is the prefix applied to the names of all cached variables that # are generated internally and marked advanced by this macro. - +set(_libdir ${ARGN}) set(_libraries_work TRUE) set(${LIBRARIES}) set(_combined_name) @@ -263,6 +263,7 @@ foreach(_library ${_list}) find_library(${_prefix}_${_library}_LIBRARY NAMES ${_library} HINTS ${_libdir} + NO_DEFAULT_PATH ) mark_as_advanced(${_prefix}_${_library}_LIBRARY) # Print status if not found @@ -277,6 +278,10 @@ endforeach(_library ${_list}) if(_libraries_work) # Test this combination of libraries. 
+ if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND BLA_STATIC) + list(INSERT ${LIBRARIES} 0 "-Wl,--start-group") + list(APPEND ${LIBRARIES} "-Wl,--end-group") + endif() if(UNIX AND BLA_STATIC) set(CMAKE_REQUIRED_LIBRARIES ${_flags} "-Wl,--start-group" ${${LIBRARIES}} ${_blas} "-Wl,--end-group" ${_threads}) else(UNIX AND BLA_STATIC) diff --git a/CMakeModules/morse/find/FindLAPACKEXT.cmake b/CMakeModules/morse/find/FindLAPACKEXT.cmake index dc608cc741221f20010cf11b5f42839ac7e3b0db..420b898d9f10b9c2f8588ca2ea700f639c4e14cf 100644 --- a/CMakeModules/morse/find/FindLAPACKEXT.cmake +++ b/CMakeModules/morse/find/FindLAPACKEXT.cmake @@ -211,9 +211,17 @@ endif() # extract libs paths # remark: because it is not given by find_package(LAPACK) set(LAPACK_LIBRARY_DIRS "") +string(REPLACE " " ";" LAPACK_LIBRARIES "${LAPACK_LIBRARIES}") foreach(lapack_lib ${LAPACK_LIBRARIES}) - get_filename_component(a_lapack_lib_dir "${lapack_lib}" PATH) - list(APPEND LAPACK_LIBRARY_DIRS "${a_lapack_lib_dir}" ) + string(REPLACE "-L" "" lapack_lib "${lapack_lib}") + if (EXISTS "${lapack_lib}") + list(APPEND LAPACK_LIBRARY_DIRS "${lapack_lib}" ) + else() + get_filename_component(a_lapack_lib_dir "${lapack_lib}" PATH) + if (EXISTS "${a_lapack_lib_dir}") + list(APPEND LAPACK_LIBRARY_DIRS "${a_lapack_lib_dir}" ) + endif() + endif() endforeach() if (LAPACK_LIBRARY_DIRS) list(REMOVE_DUPLICATES LAPACK_LIBRARY_DIRS) diff --git a/CMakeModules/morse/find/FindPASTIX.cmake b/CMakeModules/morse/find/FindPASTIX.cmake index a4c6e742dc9af5c5f58bf6d6d8e9f2ff60f0042f..f6f4c9573a51669bfc70bab976b60b6a343ced76 100644 --- a/CMakeModules/morse/find/FindPASTIX.cmake +++ b/CMakeModules/morse/find/FindPASTIX.cmake @@ -17,6 +17,7 @@ # # PASTIX depends on the following libraries: # - Threads, m, rt +# - MPI # - HWLOC # - BLAS # diff --git a/CMakeModules/morse/find/FindSTARPU.cmake b/CMakeModules/morse/find/FindSTARPU.cmake index 0e8d82382f6b4401df0441bc823ab674e8d38765..a2b1ed209b7444fc4c10f31d248cd2d78972795e 100644 --- 
a/CMakeModules/morse/find/FindSTARPU.cmake +++ b/CMakeModules/morse/find/FindSTARPU.cmake @@ -225,6 +225,10 @@ if(PKG_CONFIG_EXECUTABLE AND NOT STARPU_GIVEN_BY_USER) # "Perhaps the path to starpu headers is already present in your" # "C(PLUS)_INCLUDE_PATH environment variable.${ColourReset}") #endif() + set(STARPU_VERSION_STRING "${STARPU_SHM_VERSION}") + string(REPLACE "." ";" STARPU_VERSION_STRING_LIST ${STARPU_VERSION_STRING}) + list(GET STARPU_VERSION_STRING_LIST 0 STARPU_VERSION_MAJOR) + list(GET STARPU_VERSION_STRING_LIST 1 STARPU_VERSION_MINOR) else() message("${Magenta}Looking for STARPU - not found using PkgConfig." "Perhaps you should add the directory containing libstarpu.pc" @@ -461,14 +465,23 @@ if( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT STARPU_FOUND) find_path(STARPU_${starpu_hdr}_INCLUDE_DIRS NAMES ${starpu_hdr} HINTS ${STARPU_DIR} - PATH_SUFFIXES "include/starpu/${STARPU_VERSION_STRING}") + PATH_SUFFIXES "include" + "include/starpu/1.0" + "include/starpu/1.1" + "include/starpu/1.2" + "include/starpu/1.3") endforeach() else() foreach(starpu_hdr ${STARPU_hdrs_to_find}) set(STARPU_${starpu_hdr}_INCLUDE_DIRS "STARPU_${starpu_hdr}_INCLUDE_DIRS-NOTFOUND") find_path(STARPU_${starpu_hdr}_INCLUDE_DIRS NAMES ${starpu_hdr} - HINTS ${_inc_env}) + HINTS ${_inc_env} + PATH_SUFFIXES + "starpu/1.0" + "starpu/1.1" + "starpu/1.2" + "starpu/1.3") endforeach() endif() endif() diff --git a/Data/test20k.tsm.fma b/Data/test20k.tsm.fma index e46c371875748ab60d2704a31554d912d54f1ab5..5c0b2ea3b365b43fa316114fe37199b59c60ecb7 100644 --- a/Data/test20k.tsm.fma +++ b/Data/test20k.tsm.fma @@ -1,6 +1,4 @@ -8 4 -20000 -0.5 0.5 0.5 0.5 +20000 1 0.5 0.5 0.5 0.840188 0.394383 0.783099 0.01 1 0.911647 0.197551 0.335223 0.01 1 0.277775 0.55397 0.477397 0.01 1 @@ -20000,4 +19998,4 @@ 0.00448784 0.00539908 0.182474 0.01 0 0.0237434 0.139661 0.412617 0.01 1 0.514349 0.627817 0.0209046 0.01 1 -0.56572 0.990817 0.904442 0.01 0 \ No newline at end of file +0.56572 
0.990817 0.904442 0.01 0 diff --git a/Doc/CMakeLists.txt b/Doc/CMakeLists.txt index 0c71df2038f54e13e54647e176a721fe19355158..d0634f5264b2b1b9cce465f10b2374ec5e4b02c4 100644 --- a/Doc/CMakeLists.txt +++ b/Doc/CMakeLists.txt @@ -1,8 +1,9 @@ # add a target to generate API documentation with Doxygen find_package(Doxygen) if(DOXYGEN_FOUND) - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY) - add_custom_target( + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in + ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY) + add_custom_target( doc ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} @@ -11,5 +12,5 @@ if(DOXYGEN_FOUND) # INSTALL(FILES ${SCALFMM_BINARY_DIR}/Doc/scalfmm.tag DESTINATION doc/ ) # INSTALL(DIRECTORY ${SCALFMM_BINARY_DIR}/Doc/html DESTINATION doc/ ) else() - message( WARNING "You ask to enable the doc generation but Doxygen cannot be found." ) + message( FATAL_ERROR "You ask to enable the doc generation but Doxygen cannot be found." ) endif(DOXYGEN_FOUND) diff --git a/Doc/Site_dox/FInterpolationFMM.dox b/Doc/Site_dox/FInterpolationFMM.dox new file mode 100644 index 0000000000000000000000000000000000000000..8cd90f2ac5ff93cab811338fd62c2ebad583efbe --- /dev/null +++ b/Doc/Site_dox/FInterpolationFMM.dox @@ -0,0 +1,25 @@ +/*! \page interFMM Kernel Independent FMM + + * In this section, we briefly discuss the + + * \section general + + * \section MatrixKernel + * + * \subsection AddKernel How add a new Matrix Kernel + + * \subsection predKernel predefined Matrix Kernel + * Different kernels are predefined in ScalFMM. 
The kernels are located + * in FInterpMatrixKernel.hpp + *<ul> + * <li> Laplacian kernel K(x,y)= 1/r with r=|x-y| <--> class FInterpMatrixKernelR + * <li> Laplacian kernel K(x,y)=1/rh with rh=sqrt(L_i*(x_i-y_i)^2) <--> class FInterpMatrixKernelRH + * <li> K(x,y)=1/r^2 with r=|x-y| <--> class FInterpMatrixKernelRR + * <li> Lennard Jones K(x,y)=1/r^12 - 1/r^6 with r=|x-y| <--> class FInterpMatrixKernelLJ + * <li>Modified Laplacian kernel K(x,y)=1/r exp(-lambda r) with r=|x-y| <--> FInterpMatrixKernelML + * <li> K(x,y)=1/(r^2 + coreWidth) with r=|x-y| <--> FInterpMatrixKernelAPLUSRR + * </ul> + + + +*/ diff --git a/Doc/noDist/Notes/distribution.pdf b/Doc/noDist/Notes/distribution.pdf index 2f70d6c5854372d66fc3bd9b6a60a9363b8e2e39..3a94147abfbf8b57536bb69d7677c7ef8e247a71 100644 Binary files a/Doc/noDist/Notes/distribution.pdf and b/Doc/noDist/Notes/distribution.pdf differ diff --git a/Doc/noDist/Notes/distribution.tex b/Doc/noDist/Notes/distribution.tex index 04ec7836a1887d65629e7b2cec29f50ee2c184b1..c822fe6cd37083d4c6204803f11f5bf0d75a45a8 100644 --- a/Doc/noDist/Notes/distribution.tex +++ b/Doc/noDist/Notes/distribution.tex @@ -107,9 +107,9 @@ If you consider the \subsection{Plummer Model} This is a hard test case in astrophysics problem, and it models a globular cluster of stars, which is highly non uniform. It is called the plummer distribution. To construct such distribution, first we construct a uniform points distribution on the unit sphere. Second, the radius is chosen according to the plummer distribution (double power law in astrophysics). We consider $u$ a random number between 0 and 1, then the associated radius is given by \begin{equation*} -r = \sqrt{\frac{u^{2/3}}{u^{2/3}-1}} +r = 1.0/\sqrt{u^{-2/3}-1}, \end{equation*} - +and the total mass is one. Then, $m_i = \frac{1}{Npt}$.
\begin{figure}[h] \centering \begin{minipage}{0.45\textwidth}% @@ -140,6 +140,8 @@ The corresponding potential is \begin{equation} \Phi_P(r) = - \frac{G M}{\sqrt{r^2+a^2}} \end{equation} + +In N-body units, $G = M = 1$ and $a = 3\pi/16 \sim 0.589$ \subsection{Diagonal Model} %, shape end size=.5cm},decoration={shape start size=.5cm, shape end size=.125cm diff --git a/Examples/LagrangeInterpolationFMM.cpp b/Examples/LagrangeInterpolationFMM.cpp index efb3d1b19361729c12650c5b48a586ea6636a7cf..2562741529393cf234c8ecfc2dec0792bc01834d 100755 --- a/Examples/LagrangeInterpolationFMM.cpp +++ b/Examples/LagrangeInterpolationFMM.cpp @@ -28,17 +28,21 @@ #include <string> #include "ScalFmmConfig.h" +#include "Utils/FGlobal.hpp" -#include "Files/FFmaGenericLoader.hpp" +#include "Utils/FParameters.hpp" +#include "Utils/FParameterNames.hpp" +#include "Files/FFmaGenericLoader.hpp" +// UFMM #include "Kernels/Uniform/FUnifCell.hpp" #include "Kernels/Interpolation/FInterpMatrixKernel.hpp" #include "Kernels/Uniform/FUnifKernel.hpp" - +// Leaves #include "Components/FSimpleLeaf.hpp" #include "Kernels/P2P/FP2PParticleContainerIndexed.hpp" -#include "Utils/FParameters.hpp" + #include "Containers/FOctree.hpp" @@ -48,7 +52,6 @@ #include "Core/FFmmAlgorithm.hpp" #endif -#include "Utils/FParameterNames.hpp" #include <memory> diff --git a/Examples/changeFmaFormat.cpp b/Examples/changeFmaFormat.cpp index 800555c886b77d12de6b3a73304ed984d477916f..45932744e733e6d5090c0bd19c549872cf1868eb 100644 --- a/Examples/changeFmaFormat.cpp +++ b/Examples/changeFmaFormat.cpp @@ -11,16 +11,15 @@ #include <string> #include <cstdlib> // -#include "Files/FFmaGenericLoader.hpp" -#include "Files/FDlpolyLoader.hpp" // #include "Utils/FGlobal.hpp" -#include "Utils/FPoint.hpp" + #include "Utils/FParameters.hpp" -#include "Files/FGenerateDistribution.hpp" +#include "Utils/FParameterNames.hpp" + +#include "Files/FFmaGenericLoader.hpp" #include "Files/FExportWriter.hpp" -#include "Utils/FParameterNames.hpp" // /// 
\file changeFmaFormat.cpp @@ -88,7 +87,7 @@ int main(int argc, char ** argv){ // Generate file for visualization purpose // if(FParameters::existParameter(argc, argv, FParameterDefinitions::OutputVisuFile.options)){ - std::string outfilename(FParameters::getStr(argc,argv,FParameterDefinitions::OutputFile.options, "output.vtp")); + std::string outfilename(FParameters::getStr(argc,argv,FParameterDefinitions::OutputVisuFile.options, "output.vtp")); driverExportData(outfilename, particles , NbPoints,loader.getNbRecordPerline() ); } // diff --git a/Examples/fuseDistributions.cpp b/Examples/fuseDistributions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d5a6535cc3aa4e249836e23ace2db64b856aed39 --- /dev/null +++ b/Examples/fuseDistributions.cpp @@ -0,0 +1,287 @@ +/** + * \file + * \brief Fuses FMA files to create a new distribution + * + * \author Quentin Khan + * \copyright ScalFmm 2016 INRIA + * \copyright [CeCILL-C licence](http://www.cecill.info) + * + * + */ + +#include <algorithm> +#include <fstream> +#include <sstream> +#include <string> +#include <vector> + + +#include "Files/FFmaGenericLoader.hpp" + + +void usage(const std::string& progname) { + std::size_t start = progname.find_last_of('/'); + std::string name = progname.substr(start+1); + std::cout << + "usage: " << name << + " --file [[-s scale] [-c cx:cy:cz] [-g gx:gy:gz]] filename" + " -fout output_file" + " [--extra-length length]" + "\n" + "\n" + "Fuses multiple particle distributions into a bigger one." + "\n" + "\n" + "Options:\n" + " -fout output_file\n" + " The output file name, must hase .bfma or .fma extension\n" + "\n" + " --file [opts] filename [opts]\n" + " Add a .fma or .bfma distibution file. Multiple files may be specified by\n" + " adding more --file options. 'opts' is a combination of:\n" + " -s scale\n" + " Scale the distribution by 'scale' factor.\n" + " -c cx:cy:cz\n" + " Center the distribution at given coordinates. 
cx, cy and cz are\n" + " floating point numbers.\n" + " -g gx:gy:gz\n" + " Duplicate the distribution inside a grid of gx by gy by gz dimensions.\n" + " gx, gy and gz are integers. The grid center is governed by the -c \n" + " option.\n" + " -r rx:ry:rz\n" + " Rotate the distribution around its x, y and z axes. The rotation \n" + " center is the distribution center. rx, ry and rz are in radians.\n" + "\n" + " --extra-length length\n" + " Length to be added to the final box width.\n" + "\n" + " --help\n" + " Print this message." + "\n" + ; +} + + +using FReal = double; + +struct Particle { + FPoint<FReal> pos; + FReal val; +}; + +/// Distribution options +struct distribution { + /// Distribution filename + std::string filename = ""; + /// Distribution offset from center + FPoint<FReal> offset = {0,0,0}; + /// Distribution rotation around its center + FPoint<FReal> rot = {0,0,0}; + /// Distribution scale factor + FReal scale = 1; +}; + + + +struct parameters { + std::string output_filename; + std::vector<distribution> distributions; + FReal extra_length; +}; + +std::vector<distribution> subparse_file(const std::vector<std::string>& args, std::size_t& i) { + std::stringstream sstr; + // Grid layout + unsigned int gx = 1, gy = 1, gz = 1; + // Final distributions, one per grid part + std::vector<distribution> distributions; + // Distributions options + distribution dist; + + while(i < args.size() && args[i] != "--file") { + sstr.clear(); + if(false) { + } else if(args[i] == "-s") { + ++i; + sstr.str(args.at(i)); + sstr >> dist.scale; + } else if (args[i] == "-c") { + ++i; + char c; // Used to discard the ':' from argument format + sstr.str(args.at(i)); + sstr >> dist.offset[0] >> c >> dist.offset[1] >> c >> dist.offset[2]; + } else if(args[i] == "-g") { + ++i; + char c; // Used to discard the ':' from argument format + sstr.str(args.at(i)); + sstr >> gx >> c >> gy >> c >> gz; + } else if(args[i] == "-r") { + ++i; + char c; // Used to discard the ':' from argument format 
+ sstr.str(args.at(i)); + sstr >> dist.rot[0] >> c >> dist.rot[1] >> c >> dist.rot[2]; + } else { + if(dist.filename != "") { + break; // Leave i on the first argument that was not consumed + } + dist.filename = args[i]; + } + ++i; + } + + if(gx > 1 || gy > 1 || gz > 1) { + + // Compute offset of lowest left grid offset + FFmaGenericLoader<FReal> loader(dist.filename); + FReal box_width = loader.getBoxWidth() * dist.scale; + dist.offset[0] -= (gx-1) * box_width / 2; + dist.offset[1] -= (gy-1) * box_width / 2; + dist.offset[2] -= (gz-1) * box_width / 2; + + // Create one distribution for each part of the grid layout + for(unsigned int x = 0; x < gx; ++x) { + for(unsigned int y = 0; y < gy; ++y) { + for(unsigned int z = 0; z < gz; ++z) { + distribution tmp_dist = dist; + tmp_dist.offset[0] += x * box_width; + tmp_dist.offset[1] += y * box_width; + tmp_dist.offset[2] += z * box_width; + distributions.push_back(tmp_dist); + } + } + } + } else { + distributions.push_back(dist); + } + + return distributions; +} + + +parameters parse(const std::vector<std::string>& args) { + parameters params; + std::stringstream sstr; + for(std::size_t i = 1; i < args.size(); ++i) { + if(args[i] == "--help") { + usage(args[0]); + exit(0); + } else if(args[i] == "--file") { + ++i; + auto ds = subparse_file(args, i); + --i; // i is on the first unconsumed argument (e.g. the next --file); the loop's ++i must land on it + params.distributions.insert(params.distributions.end(), + ds.begin(), ds.end()); + } else if(args[i] == "--extra-length") { + ++i; + sstr.str(args.at(i)); + sstr >> params.extra_length; + } else if(args[i] == "-fout") { + ++i; + params.output_filename = args.at(i); + } else { + std::cerr << "Unknown or misplaced parameters: " << args[i] << '\n'; + } + } + return params; +} + + +void rotate(Particle& p, const distribution& dist) { + // Rotate around x axis (both outputs use the pre-rotation y and z) + if(dist.rot[0] > 1e-5 || dist.rot[0] < -1e-5) { + const FReal alpha = dist.rot[0], y = p.pos[1], z = p.pos[2]; + p.pos[1] = y * cos(alpha) - z * sin(alpha); + p.pos[2] = y * sin(alpha) + z * cos(alpha); + } + // Rotate around y axis (both outputs use the pre-rotation x and z) + if(dist.rot[1] > 1e-5 || dist.rot[1] <
-1e-5) { + const FReal alpha = dist.rot[1], x = p.pos[0], z = p.pos[2]; + p.pos[0] = x * cos(alpha) + z * sin(alpha); + p.pos[2] = -x * sin(alpha) + z * cos(alpha); + } + // Rotate around z axis (angle is rot[2]; both outputs use the pre-rotation x and y) + if(dist.rot[2] > 1e-5 || dist.rot[2] < -1e-5) { + const FReal alpha = dist.rot[2], x = p.pos[0], y = p.pos[1]; + p.pos[0] = x * cos(alpha) - y * sin(alpha); + p.pos[1] = x * sin(alpha) + y * cos(alpha); + } +} + + + + + + +int main(int argc, char** argv) { + auto params = parse({argv,argv+argc}); + + // Fail early if output file raises an error + FFmaGenericWriter<FReal> writer(params.output_filename); + + // Fuse particle distributions + std::vector<Particle> particles; + FReal axis_max = 0; + + for(distribution& dist : params.distributions) { + // Load particles into array + FFmaGenericLoader<FReal> loader(dist.filename); + const std::size_t count = loader.getParticleCount(); + // Particle array: x1, y1, z1, val1, x2, y2... + particles.reserve(particles.size() + count); + + FPoint<FReal> center = loader.getBoxCenter(); + + // Temp particle + Particle p; + for(std::size_t i = 0; i < count; ++i) { + loader.fillParticle(&p.pos, &p.val); + // Move distribution center to origin + p.pos -= center; + // Scale distribution + p.pos *= dist.scale; + // Rotate distribution + rotate(p, dist); + // Move to new position + p.pos += dist.offset; + // Add particle to list + particles.push_back(p); + + // Save particle x,y,z min/max to compute final box + axis_max = std::max(std::abs(p.pos[0]), axis_max); + axis_max = std::max(std::abs(p.pos[1]), axis_max); + axis_max = std::max(std::abs(p.pos[2]), axis_max); + } + } + + + // Write final distribution + FPoint<FReal> center(0,0,0); + // Compute final box width + FReal box_width = 2 * (axis_max + params.extra_length); + + // Write header + writer.writeHeader(center, box_width, particles.size(), 8, 4); + + // Write all particles + + // Buffer avoids duplicating particle vector + std::vector<FReal> buffer; + buffer.reserve(4*1024); // Avoid
reallocations, size is a multiple of 4 + + auto cur = particles.begin(); + auto sentinel = particles.end(); + + // Fill and write buffer until we're done + while(cur != sentinel) { + buffer.clear(); + while(buffer.size() != buffer.capacity() && cur != sentinel) { + buffer.push_back(cur->pos[0]); + buffer.push_back(cur->pos[1]); + buffer.push_back(cur->pos[2]); + buffer.push_back(cur->val); + ++cur; + } + writer.writeArrayOfReal(buffer.data(), 4, buffer.size()/4); + } + +} diff --git a/Examples/generateDistributions.cpp b/Examples/generateDistributions.cpp index e06911f4a613dbb866756e3c70791025ccd92246..ff3deb9234808c2d2a12a841140c5d7da2f935bb 100644 --- a/Examples/generateDistributions.cpp +++ b/Examples/generateDistributions.cpp @@ -5,12 +5,12 @@ * Author: Olivier Coulaud */ - +#include <algorithm> #include <iostream> #include <fstream> #include <sstream> #include <string> -// + #include "Utils/FGlobal.hpp" #include "Utils/FMath.hpp" #include "Utils/FPoint.hpp" @@ -20,240 +20,244 @@ #include "Utils/FParameterNames.hpp" -// -/// \file generateDistributions.cpp -//! -//! \brief generateDistributions: Driver to generate N points (non)uniformly distributed on a given geometry -//! -//! The goal of this driver is to generate uniform or non uniform points on the following geometries -//! -//! Uniform : cube, cuboid, sphere, prolate, -//! -//! Non uniform : ellipsoid, prolate -//! -//! You can set two kind of physical values depending of your problem. By default all values are between 0 and 1. -//! If you select the argument -charge (see bellow) the values are between -1 and 1. -//! The arguments available are -//! -//! <b> General arguments:</b> -//! \param -help (-h) to see the parameters available in this driver -//! \param -N The number of points in the distribution (default 20000) -//! \param -fout name: generic name for files (with extension) and save data -//! with following format in name.fma or name.bfma in -bin is set" -//! 
\param -fvisuout Filename for the visu file (vtk, vtp, cvs or cosmo). vtp is the default -//! \param -extraLength value extra length to add to the boxWidth (default 0.0) -//! <b> Geometry arguments:</b> -//! \param -unitCube uniform distribution on unit cube -//! \param -cube uniform distribution on a cube -//! \arg -length R - default value for R is 2.0 -//! \param -unitSphere uniform distribution on unit sphere -//! \param -sphere uniform distribution on sphere of radius given by -//! \arg -radius R - default value for R is 2.0 -//! \param -ellipsoid non uniform distribution on an ellipsoid of aspect ratio given by -//! \arg -size a:b:c with a, b and c > 0 -//! \param -prolate ellipsoid with aspect ratio a:a:c given by -//! \arg -size a:a:c with c > a > 0 -//! \param -plummer (Highly non uniform) plummer distribution (astrophysics) -//! \arg -radius R - default value 10.0" -//! -//! -//! <b> Physical values argument:</b> -//! \param -charge generate physical values between -1 and 1 otherwise generate between 0 and 1 -//! \param -zeromean the average of the physical values is zero -//! -//! -//! \b examples -//! -//! generateDistributions -prolate -size 2:2:4 -N 20000 -fout prolate -//! -//! or -//! -//! generateDistributions -cuboid 2:2:4 -N 100000 -fout cuboid.bfma -fvisuout cuboid.vtp -charge -zeromean -//! +/** + * \file + * + * \brief Generates points (non)uniformly distributed on a given geometry + * + * The goal of this driver is to generate uniform or non uniform points on the + * following geometries + * + * - Uniform : cube, cuboid, sphere, prolate, + * - Non uniform : ellipsoid, prolate + * + * You can set two kind of physical values depending of your problem. By + * default all values are between 0 and 1. If you select the argument -charge + * (see bellow) the values are between -1 and 1. 
The arguments available are + * + * <b> General arguments:</b> + * \param -help (-h) to see the parameters available in this driver + * \param -N The number of points in the distribution (default 20000) + * \param -fout name: generic name for files (with extension) and save data with + * following format in name.fma or name.bfma in -bin is set" + * \param -fvisuout Filename for the visu file (vtk, vtp, cvs or cosmo). vtp is + * the default + * \param -extraLength value extra length to add to the boxWidth (default 0.0) + * <b> Geometry arguments:</b> + * \param -unitCube uniform distribution in unit cube + * \param -cube uniform distribution in a cube + * \arg -size LX:LY:LZ - default value for R is 1.0:1.0:2.0 + * \param -unitSphere uniform distribution on unit sphere + * \param -sphere uniform distribution on sphere of radius given by + * \arg -radius R - default value for R is 2.0 + * \param -ball uniform distribution in ball of radius given by + * \arg -radius R - default value for R is 2.0 + * \param -ellipsoid non uniform distribution on an ellipsoid of aspect ratio + * given by + * \arg -size a:b:c with a, b and c > 0 + * \param -prolate ellipsoid with aspect ratio a:a:c given by + * \arg -size a:a:c with c > a > 0 + * \param -plummer (Highly non uniform) plummer distribution (astrophysics) + * \arg -radius R - default value 10.0" + * + * + * <b> Physical values argument:</b> + * \param -charge generate physical values between -1 and 1 otherwise generate between 0 and 1 + * \param -zeromean the average of the physical values is zero + * + * + * <b> examples</b> + * + * generateDistributions -prolate -size 2:2:4 -N 20000 -fout prolate + * + * or + * + * generateDistributions -cuboid 2:2:4 -N 100000 -fout cuboid.bfma -fvisuout cuboid.vtp -charge -zeromean + * + */ -int main(int argc, char ** argv){ - const FParameterNames LocalOptionEllipsoid = {{"-ellipsoid"} , - " non uniform distribution on an ellipsoid of aspect ratio given by -size a:b:c with a, b and c > 
0"}, - LocalOptionUnitCube ={ {"-unitCube"} , - " uniform distribution on unit cube"}, - LocalOptionCube ={ {"-cuboid"} , - " uniform distribution on rectangular cuboid of size -lengths a:b:c - default values are 1.0:1.0:2.0 "}, - LocalOptionSize ={{"-size"} , - " Size of the geometry a:b:c - default values are 1.0:1.0:2.0"}, - LocalOptionUnitSphere ={ {"-unitSphere"} , - " uniform distribution on unit sphere"}, - LocalOptionSphere ={ {"-sphere"} , - " uniform distribution on sphere of radius given by -radius R - default value for R is 2.0"}, - LocalOptionProlate ={ {"-prolate"} ," ellipsoid with aspect ratio a:a:cs given by -size a:a:c with c > a > 0"}, - LocalOptionPlummer ={ {"-plummer"} ," (Highly non uniform) plummer distribution (astrophysics) -radius R - default value 10.0"}, - LocalOptionRadius ={ {"-radius"} , - " used to specified the radius of the sphere an dthe plummer distribution or R - default value for R is 2.0"}, - LocalOptionCharge ={{"-charge"} ," generate physical values between -1 and 1 otherwise generate between 0 and 1"}, - LocalOptionZM ={{"-zeromean"} , " the average of the physical values is zero"}, - LocalOptionEL ={{"-extraLength"} , - " -extraLength value extra length to add to the boxWidth"}; -; - FHelpDescribeAndExit(argc, argv, - ">> Driver to generate N points (non)uniformly distributed on a given geometry.\n" - "Options \n" - " -help to see the parameters ", - FParameterDefinitions::OutputFile, - FParameterDefinitions::NbParticles,FParameterDefinitions::OutputVisuFile,LocalOptionUnitCube,LocalOptionCube, - LocalOptionUnitSphere,LocalOptionSphere,LocalOptionRadius,LocalOptionEllipsoid,LocalOptionProlate,LocalOptionSize, - LocalOptionPlummer,LocalOptionCharge,LocalOptionZM,LocalOptionEL); +namespace Param { + const FParameterNames Ellipsoid + = {{"-ellipsoid"}, "non uniform distribution on an ellipsoid of aspect ratio given by -size a:b:c with a, b and c > 0"}; + const FParameterNames UnitCube + = {{"-unitCube"}, "uniform 
distribution on unit cube"}; + const FParameterNames Cube + = {{"-cuboid"}, "uniform distribution on rectangular cuboid of size -size a:b:c - default values are 1.0:1.0:2.0 "}; + const FParameterNames UnitSphere + = {{"-unitSphere"}, "uniform distribution on unit sphere"}; + const FParameterNames Ball + = {{"-ball"}, "uniform distribution in a ball of radius given by -radius R - default value for R is 2.0"}; + const FParameterNames Sphere + = {{"-sphere"}, "uniform distribution on sphere of radius given by -radius R - default value for R is 2.0"}; + const FParameterNames Prolate + = {{"-prolate"}, "ellipsoid with aspect ratio a:a:c given by -size a:a:c with c > a > 0"}; + const FParameterNames Plummer + = {{"-plummer"}, "(Highly non uniform) plummer distribution (astrophysics) -radius R - default value 10.0"}; + const FParameterNames Size + = {{"-size"}, "Size of the geometry a:b:c - default values are 1.0:1.0:2.0"}; + const FParameterNames Radius + = {{"-radius"}, "used to specified the radius of the sphere and the plummer distribution or R - default value for R is 2.0"}; + const FParameterNames Charge + = {{"-charge"}, "generate physical values between -1 and 1 otherwise generate between 0 and 1"}; + const FParameterNames ZM + = {{"-zeromean"}, "the average of the physical values is zero"}; + const FParameterNames EL + = {{"-extraLength"}, "-extraLength value extra length to add to the boxWidth"}; +} + +#define getParamV(name, default) \ + FParameters::getValue(argc,argv,(name).options,(default)) + +#define getParamS(name, default) \ + FParameters::getStr(argc,argv,(name).options,(default)) + +int main(int argc, char ** argv){ + FHelpDescribeAndExit( + argc, argv, + ">> Driver to generate N points (non)uniformly distributed on a given geometry.\n" + "Options \n" + " -help to see the parameters ", + FParameterDefinitions::OutputFile, FParameterDefinitions::NbParticles, + FParameterDefinitions::OutputVisuFile, + Param::UnitCube, Param::Cube, Param::UnitSphere, 
Param::Sphere, + Param::Radius, Param::Ellipsoid, Param::Prolate, Param::Plummer, + Param::Ball, + Param::Charge, Param::ZM, Param::EL, Param::Size + ); - - typedef double FReal; - FReal extraRadius = 0.000 ; + using FReal = double; - const FSize NbPoints = FParameters::getValue(argc,argv,FParameterDefinitions::NbParticles.options, FSize(20000)); - const std::string genericFileName(FParameters::getStr(argc,argv,FParameterDefinitions::OutputFile.options, "unifPointDist")); + const FSize NbPoints = getParamV(FParameterDefinitions::NbParticles, FSize(20000)); + FReal extraRadius = 0.000; FReal BoxWith = 0.0; FPoint<FReal> Centre(0.0, 0.0,0.0); - // - // Allocation - // - FReal * particles ; - particles = new FReal[4*NbPoints] ; - memset(particles,0,4*NbPoints*sizeof(FReal)); - FmaRWParticle<FReal, 4,4> *ppart = (FmaRWParticle<FReal, 4,4>*)(&particles[0]); - // - // Generate physical values - // + // Allocate particle array + FReal * particles; + particles = new FReal[4*NbPoints] ; + memset(particles, 0, 4*NbPoints*sizeof(FReal)); + FmaRWParticle<FReal, 4, 4>* ppart = (FmaRWParticle<FReal, 4, 4>*)(&particles[0]); + + // Generate physical values + FReal sum = 0; + FReal a = 1.0; + FReal b = 0.0; + if(FParameters::existParameter(argc, argv, "-charge")){ + a = 2.0; b = -1.0; + } - FReal phyVal, sum,a,b ; - if(FParameters::existParameter(argc, argv, "-charge")){ - a= 2.0 ; b = -1.0 ; - } - else { - a= 1.0 ; b = 0.0 ; - } - sum = 0.0 ; - int j = 3 ; - for(int i = 0 ; i< NbPoints; ++i, j+=4){ - phyVal = a*getRandom<FReal>() +b ; - sum += phyVal ; - particles[j] = phyVal ; - } - if(FParameters::existParameter(argc, argv, "-zeromean")){ - FReal rm = FReal(sum)/FReal(NbPoints) ; sum = 0.0 ; - j = 3 ; - for(int i = 0 ; i< NbPoints; ++i, j+=4){ - particles[j] -= rm ; - sum += particles[j] ; - } - } - std::cout << "Sum physical value "<< sum << " Mean Value " << sum/FReal(NbPoints)<<std::endl ; - // - // Point generation - // - if(FParameters::existParameter(argc, argv, 
"-unitCube")){ - unifRandonPointsOnUnitCube(NbPoints, particles) ; - Centre.setPosition(0.5,0.5,0.5); - BoxWith = 1.0 ; - std::cout << "Unit cube "<<std::endl; - } - else if(FParameters::existParameter(argc, argv, "-cuboid")){ - std::string dd(":"),aspectRatio = FParameters::getStr(argc,argv,"-size", "1:1:2"); - FReal A,B,C ; - size_t pos = aspectRatio.find(":"); aspectRatio.replace(pos,1," "); - pos = aspectRatio.find(":"); aspectRatio.replace(pos,1," "); - std::stringstream ss(aspectRatio); ss >>A >> B >> C ; - unifRandonPointsOnCube(NbPoints, A,B,C,particles) ; - BoxWith = FMath::Max(A,FMath::Max(B,C) ); - FReal halfBW = BoxWith*0.5; - Centre.setPosition(halfBW,halfBW,halfBW); - std::cout << "Cuboid "<< A << ":"<< B<<":"<<C<<std::endl; - } - else if(FParameters::existParameter(argc, argv, "-unitSphere")){ - unifRandonPointsOnUnitSphere(NbPoints, particles) ; - BoxWith = 2.0 ; - } - else if(FParameters::existParameter(argc, argv, "-sphere")){ - const FReal Radius = FParameters::getValue(argc,argv,"-radius", 2.0); - unifRandonPointsOnSphere(NbPoints, Radius,particles) ; - BoxWith = 2.0*Radius ; - std::cout << "Sphere radius: "<<Radius<<std::endl; - } - else if(FParameters::existParameter(argc, argv, "-prolate")){ - std::string dd(":"),aspectRatio = FParameters::getStr(argc,argv,"-size", "1:1:2"); - FReal A,B,C ; - size_t pos = aspectRatio.find(":"); aspectRatio.replace(pos,1," "); - pos = aspectRatio.find(":"); aspectRatio.replace(pos,1," "); - std::stringstream ss(aspectRatio); ss >>A >> B >> C ; - if(A != B){ - std::cerr << " A /= B in prolate ellipsoide A =B. 
Your aspect ratio: "<< aspectRatio<<std::endl; - } - std::cout << "Prolate A: "<<A<<" B: "<< B << " C: " << C<<std::endl; - unifRandonPointsOnProlate(NbPoints,A,C,particles); - BoxWith = 2.0*C; - } //const FSize NbPoints = FParameters::getValue(argc,argv,FParameterDefinitions::NbParticles.options, FSize(20000)); - else if(FParameters::existParameter(argc, argv, "-hyperpara")){ - std::string dd(":"),aspectRatio = FParameters::getStr(argc,argv,"-size", "1:1:2"); - FReal A,B,C ; - size_t pos = aspectRatio.find(":"); aspectRatio.replace(pos,1," "); - pos = aspectRatio.find(":"); aspectRatio.replace(pos,1," "); - std::stringstream ss(aspectRatio); ss >>A >> B >> C ; - unifRandonPointsOnHyperPara(NbPoints,A,B,C,particles); - BoxWith = 2.0*FMath::Max( A,FMath::Max( B,C)) ; - std::cout << "Hyperpara "<< A << ":"<< B<<":"<<C<<std::endl; - std::cout << "BoxWith: " << BoxWith<<std::endl; + for(int i = 0, j = 3 ; i< NbPoints; ++i, j+=4){ + particles[j] = a * getRandom<FReal>() + b; + sum += particles[j] ; + } + if(FParameters::existParameter(argc, argv, "-zeromean")){ + FReal rm = FReal(sum) / FReal(NbPoints) ; + sum -= static_cast<FReal>(NbPoints) * rm; + for(int i = 0, j = 3 ; i< NbPoints; ++i, j+=4){ + particles[j] -= rm ; + } + } + + std::cout << "Physical value sum: " << sum + << " mean: " << sum / FReal(NbPoints) + << std::endl; + + // Read arguments + // Radius + const FReal Radius = getParamV(Param::Radius, 2.0); + // Aspect ratio + std::string aspectRatio = getParamS(Param::Size, "1:1:2"); + std::replace(aspectRatio.begin(), aspectRatio.end(), ':', ' '); + FReal A, B, C; + std::stringstream(aspectRatio) >> A >> B >> C; + + // Point generation + if(FParameters::existParameter(argc, argv, "-unitCube")) { + unifRandomPointsInCube<FReal>(NbPoints, 1, 1, 1, particles); + Centre.setPosition(0.5,0.5,0.5); + BoxWith = 1.0; + std::cout << "Unit cube "<< std::endl; + } + else if(FParameters::existParameter(argc, argv, "-ball")) { + unifRandomPointsInBall<FReal>(NbPoints, 
Radius, particles); + BoxWith = 2.0 * Radius; + std::cout << "Ball radius: " << Radius << std::endl; + } + else if(FParameters::existParameter(argc, argv, "-cuboid")) { + unifRandomPointsInCube(NbPoints, A, B, C, particles); + BoxWith = FMath::Max(A, FMath::Max(B,C)); + FReal halfBW = BoxWith * 0.5; + Centre.setPosition(halfBW, halfBW, halfBW); + std::cout << "Cuboid: "<< A << ":" << B << ":" << C << std::endl; + } + else if(FParameters::existParameter(argc, argv, "-unitSphere")) { + unifRandomPointsOnSphere<FReal>(NbPoints, 1.0, particles); + BoxWith = 2.0; + } + else if(FParameters::existParameter(argc, argv, "-sphere")) { + unifRandomPointsOnSphere(NbPoints, Radius, particles); + BoxWith = 2.0 * Radius; + std::cout << "Sphere radius: " << Radius << std::endl; + } + else if(FParameters::existParameter(argc, argv, "-prolate")) { + if(A != B){ + std::cerr << " A != B in prolate ellipsoid. Your aspect ratio: " + << aspectRatio << std::endl; + } + std::cout << "Prolate A: " << A << " B: " << B << " C: " << C << std::endl; + unifRandomPointsOnProlate(NbPoints, A, C, particles); + BoxWith = 2.0 * C; + } + else if(FParameters::existParameter(argc, argv, "-hyperpara")) { + unifRandomPointsOnHyperPara(NbPoints, A, B, C, particles); + BoxWith = 2.0 * FMath::Max(A, FMath::Max(B, C)); + std::cout << "Hyperpara "<< A << ":"<< B<<":"<<C<<std::endl; + std::cout << "BoxWith: " << BoxWith << std::endl; + + } + else if(FParameters::existParameter(argc, argv, "-ellipsoid")){ + nonunifRandomPointsOnElipsoid(NbPoints, A, B, C, particles); + BoxWith = 2.0 * FMath::Max(A, FMath::Max(B, C)); + std::cout << "Ellipsoid " << A << ":" << B << ":" << C << std::endl; + } + else if(FParameters::existParameter(argc, argv, "-plummer")){ + unifRandomPlummer(NbPoints, Radius, particles); + BoxWith = 2.0 * Radius; + std::cout << "Plummer radius: " << Radius << std::endl; + } + else { + std::cout << "Bad geometry option"<< std::endl; + exit(-1); } - else if(FParameters::existParameter(argc, argv, 
"-ellipsoid")){ -// else if(FParameters::existParameter(argc, argv, "-ellipsoid")){ - std::string dd(":"),aspectRatio = FParameters::getStr(argc,argv,"-size", "1:1:2"); -// std::string dd(":"),aspectRatio = FParameters::getStr(argc,argv,"-ar", "1:1:2"); - FReal A,B,C ; - size_t pos = aspectRatio.find(":"); aspectRatio.replace(pos,1," "); - pos = aspectRatio.find(":"); aspectRatio.replace(pos,1," "); - std::stringstream ss(aspectRatio); ss >>A >> B >> C ; - nonunifRandonPointsOnElipsoid(NbPoints,A,B,C,particles); - BoxWith = 2.0*FMath::Max( A,FMath::Max( B,C)) ; - std::cout << "Ellipsoid "<< A << ":"<< B<<":"<<C<<std::endl; - } - else if(FParameters::existParameter(argc, argv, "-plummer")){ - const FReal Radius = FParameters::getValue(argc,argv,"-radius", 10.0); - unifRandonPlummer(NbPoints, Radius, sum, particles) ; - BoxWith = 2.0*Radius ; - std::cout << "Plummer radius: "<<Radius<<std::endl; - } - else { - std::cout << "Bad geometry option"<< std::endl; - exit(-1) ; - } ///////////////////////////////////////////////////////////////////////// - // Save data + // Save data ///////////////////////////////////////////////////////////////////////// - // // Generate FMA file for FFmaGenericLoader<FReal> Loader - // - if(FParameters::existParameter(argc, argv, "-extraLength")){ - extraRadius = FParameters::getValue(argc,argv,"-extraLength", 0.0); - BoxWith += 2*extraRadius ; - } - std::string name(genericFileName); - std::cout << "Write "<< NbPoints <<" Particles in file " << name << std::endl; - FFmaGenericWriter<FReal> writer(name) ; - writer.writeHeader(Centre,BoxWith, NbPoints, *ppart) ; - writer.writeArrayOfParticles(ppart, NbPoints); - std::cout << " End of writing "<<std::endl; - - // - // Generate file for visualization - // - if(FParameters::existParameter(argc, argv, FParameterDefinitions::OutputVisuFile.options)){ - std::string visufile(FParameters::getStr(argc,argv,FParameterDefinitions::OutputVisuFile.options, "output.vtp")); - driverExportData(visufile, 
particles , NbPoints); - } - // - delete [] particles ; + if(FParameters::existParameter(argc, argv, "-extraLength")){ + extraRadius = FParameters::getValue(argc, argv, "-extraLength", 0.0); + BoxWith += 2 * extraRadius; + } + const std::string name(getParamS(FParameterDefinitions::OutputFile, "unifPointDist")); + std::cout << "Write "<< NbPoints <<" particles to '" << name << "'" << std::endl; + FFmaGenericWriter<FReal> writer(name); + writer.writeHeader(Centre, BoxWith, NbPoints, *ppart); + writer.writeArrayOfParticles(ppart, NbPoints); + std::cout << "End of writing" <<std::endl; - // - return 1; + // Generate file for visualization +// if(FParameters::existParameter(argc, argv, FParameterDefinitions::OutputVisuFile.options)){ +// std::string outfilename(FParameters::getStr(argc,argv,FParameterDefinitions::OutputFile.options, "output.vtp")); +// driverExportData(outfilename, particles , NbPoints,loader.getNbRecordPerline() ); +// } + if(FParameters::existParameter(argc, argv, FParameterDefinitions::OutputVisuFile.options)) { + std::string visufile(FParameters::getStr(argc, argv, FParameterDefinitions::OutputVisuFile.options, "output.vtp")); + driverExportData(visufile, particles , NbPoints); + } + // + delete [] particles; } diff --git a/Src/Arranger/FOctreeArranger.hpp b/Src/Arranger/FOctreeArranger.hpp index d5fdec3151824adcdd23a8306aee5dde35d199ea..30b453ecad89cac33e28f28ea29c8591a0880ead 100644 --- a/Src/Arranger/FOctreeArranger.hpp +++ b/Src/Arranger/FOctreeArranger.hpp @@ -85,7 +85,7 @@ public: const MortonIndex particuleIndex = tree->getMortonFromPosition(currentPart); if(particuleIndex != currentMortonIndex){ //Need to move this one - interface->removeFromLeafAndKeep(particles,currentPart,idxPart,FParticleTypeSource); + interface->removeFromLeafAndKeep(particles,currentPart,idxPart,FParticleType::FParticleTypeSource); } else{ //Need to increment idx; @@ -102,7 +102,7 @@ public: const MortonIndex particuleIndex = tree->getMortonFromPosition(currentPart); 
if(particuleIndex != currentMortonIndex){ //Need to move this one - interface->removeFromLeafAndKeep(particleTargets,currentPart,idxPart, FParticleTypeTarget); + interface->removeFromLeafAndKeep(particleTargets,currentPart,idxPart, FParticleType::FParticleTypeTarget); } else{ //Need to increment idx; diff --git a/Src/Arranger/FParticleTypedIndexedMover.hpp b/Src/Arranger/FParticleTypedIndexedMover.hpp index e4e1b8a5591ab3006b35bc595e133f72f1fa2e01..a8286323d464b1cfb822cfdc5fe2651cddf3ba02 100644 --- a/Src/Arranger/FParticleTypedIndexedMover.hpp +++ b/Src/Arranger/FParticleTypedIndexedMover.hpp @@ -33,11 +33,11 @@ public: for(int idxAttr = 0 ; idxAttr < ContainerClass::NbAttributes ; ++idxAttr){ particleValues[idxAttr] = lf->getAttribute(idxAttr)[idxPart]; } - if(type == FParticleTypeTarget){ - toStoreRemovedTargetParts.push(particlePos,FParticleTypeTarget,lf->getIndexes()[idxPart],particleValues); + if(type == FParticleType::FParticleTypeTarget){ + toStoreRemovedTargetParts.push(particlePos,FParticleType::FParticleTypeTarget,lf->getIndexes()[idxPart],particleValues); } else{ - toStoreRemovedSourceParts.push(particlePos,FParticleTypeSource,lf->getIndexes()[idxPart],particleValues); + toStoreRemovedSourceParts.push(particlePos,FParticleType::FParticleTypeSource,lf->getIndexes()[idxPart],particleValues); } lf->removeParticles(&idxPart,1); } @@ -53,7 +53,7 @@ public: const FPoint<FReal> particlePos(toStoreRemovedSourceParts.getPositions()[0][idxToInsert], toStoreRemovedSourceParts.getPositions()[1][idxToInsert], toStoreRemovedSourceParts.getPositions()[2][idxToInsert]); - tree->insert(particlePos, FParticleTypeSource, toStoreRemovedSourceParts.getIndexes()[idxToInsert], particleValues); + tree->insert(particlePos, FParticleType::FParticleTypeSource, toStoreRemovedSourceParts.getIndexes()[idxToInsert], particleValues); } for(FSize idxToInsert = 0; idxToInsert<toStoreRemovedTargetParts.getNbParticles() ; ++idxToInsert){ @@ -64,7 +64,7 @@ public: 
toStoreRemovedTargetParts.getPositions()[1][idxToInsert], toStoreRemovedTargetParts.getPositions()[2][idxToInsert]); - tree->insert(particlePos, FParticleTypeTarget, toStoreRemovedTargetParts.getIndexes()[idxToInsert], particleValues); + tree->insert(particlePos, FParticleType::FParticleTypeTarget, toStoreRemovedTargetParts.getIndexes()[idxToInsert], particleValues); } toStoreRemovedSourceParts.clear(); diff --git a/Src/Components/FParticleType.hpp b/Src/Components/FParticleType.hpp index 89f799ccce20d1221d23fd5d37e0a1910df3623b..98d5a7bc6858d0b428568295ed809a8f5f700ab1 100644 --- a/Src/Components/FParticleType.hpp +++ b/Src/Components/FParticleType.hpp @@ -19,7 +19,7 @@ /** * @brief The FParticleType enum is to make a difference between Target and Source (Tsm) */ -enum FParticleType { +enum class FParticleType { FParticleTypeSource = 0, FParticleTypeTarget = 1 }; diff --git a/Src/Components/FTypedLeaf.hpp b/Src/Components/FTypedLeaf.hpp index f29332e5f5092c7a993e5cd2f692e046d6a9f0a3..b4510a24bed0f47ddb33739418974c762ac0de3d 100644 --- a/Src/Components/FTypedLeaf.hpp +++ b/Src/Components/FTypedLeaf.hpp @@ -51,8 +51,8 @@ public: */ template<typename... Args> void push(const FPoint<FReal>& inParticlePosition, const FParticleType type, Args ... 
args){ - if(type == FParticleTypeTarget) targets.push(inParticlePosition, FParticleTypeTarget, args...); - else sources.push(inParticlePosition, FParticleTypeSource, args...); + if(type == FParticleType::FParticleTypeTarget) targets.push(inParticlePosition, FParticleType::FParticleTypeTarget, args...); + else sources.push(inParticlePosition, FParticleType::FParticleTypeSource, args...); } /** diff --git a/Src/Core/FFmmAlgorithmOmp4.hpp b/Src/Core/FFmmAlgorithmOmp4.hpp index 54d288a0118b3cb9dbd48d0e25a1eb609ebacb40..b8bbd2790c270ed744516640c64006e5f2a3b542 100644 --- a/Src/Core/FFmmAlgorithmOmp4.hpp +++ b/Src/Core/FFmmAlgorithmOmp4.hpp @@ -1,6 +1,8 @@ #ifndef FFMMALGORITHMOMP4_HPP #define FFMMALGORITHMOMP4_HPP +#include <omp.h> + #include "../Utils/FGlobal.hpp" #include "../Utils/FAssert.hpp" #include "../Utils/FLog.hpp" @@ -1007,4 +1009,3 @@ protected: #endif // FFMMALGORITHMOMP4_HPP - diff --git a/Src/Files/FFmaGenericLoader.hpp b/Src/Files/FFmaGenericLoader.hpp index f08e3fc729a43623155c0a4d8eb5c9943799d36c..8f5a8d6fe2e79d646abaa355c6d7f8b191b75fb6 100644 --- a/Src/Files/FFmaGenericLoader.hpp +++ b/Src/Files/FFmaGenericLoader.hpp @@ -4,13 +4,13 @@ // This software is a computer program whose purpose is to compute the FMM. // // This software is governed by the CeCILL-C and LGPL licenses and -// abiding by the rules of distribution of free software. -// +// abiding by the rules of distribution of free software. +// // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public and CeCILL-C Licenses for more details. -// "http://www.cecill.info". +// "http://www.cecill.info". // "http://www.gnu.org/licenses". 
// =================================================================================== // author Berenger Bramas and Olivier Coulaud @@ -36,7 +36,7 @@ /** \brief Particle class used in FMA loader and writer. * * - * The pieces of data are : PosX, PosY, PosZ, physicalValue, + * The pieces of data are : PosX, PosY, PosZ, physicalValue, * Potential, forceX, forceY, forceZ. The first 4 are mandatory. * Data is stored as FReal. * @@ -165,10 +165,10 @@ public: * example below shows how to use the loader to read from a file. * * - * \code - * // Instanciate the loader with the particle file. - * FFmaGenericLoader<FReal> loader("../Data/unitCubeXYZQ20k.fma"); // extension fma -> ascii format - * // Retrieve the number of particles + * \code + * // Instanciate the loader with the particle file. + * FFmaGenericLoader<FReal> loader("../Data/unitCubeXYZQ20k.fma"); // extension fma -> ascii format + * // Retrieve the number of particles * FSize nbParticles = loader.getNumberOfParticles(); * * // Create an array of particles, initialize to 0. @@ -190,7 +190,7 @@ public: * \endcode * * `DatatypeSize` can have one of two values: - * - 4, float ; + * - 4, float ; * - 8, double. * * `Number_of_records_per_line` gives the data count for each line of @@ -241,7 +241,7 @@ public: * @param binary true if the file to open is in binary mode */ FFmaGenericLoader(const std::string & filename,const bool binary ): - file(nullptr), binaryFile(binary), centerOfBox(0.0,0.0,0.0), boxWidth(0.0), + file(nullptr), binaryFile(binary), centerOfBox(0.0,0.0,0.0), boxWidth(0.0), nbParticles(0), tmpVal(nullptr), otherDataToRead(0) { this->open_file(filename, binary); @@ -254,7 +254,7 @@ public: * * - The opening mode is guessed from the file extension : `.fma` will open * in ASCII mode, `.bfma` will open in binary mode. - * - All information accessible in the header can be retreived after this call. + * - All information accessible in the header can be retreived after this call. 
* - To test if the file has successfully been opened, call hasNotFinished(). * * @param filename the name of the file to open. Must end with `.fma` or `.bfma`. @@ -268,7 +268,7 @@ public: binaryFile = false; } else { std::cout << "FFmaGenericLoader: " - << "Only .fma or .bfma input file are allowed. Got " + << "Only .fma or .bfma input file are allowed. Got " << filename << "." << std::endl; std::exit ( EXIT_FAILURE) ; @@ -297,10 +297,9 @@ public: /** * To get the number of particles from this loader - * @param the number of particles the loader can fill */ FSize getNumberOfParticles() const{ - return this->nbParticles; + return this->getParticleCount(); } /** @@ -308,8 +307,25 @@ public: * @return box center */ FPoint<FReal> getCenterOfBox() const{ + return this->getBoxCenter(); + } + + /** + * \brief Get the distribution particle count + * \return The distribution particle count + */ + FSize getParticleCount() const { + return this->nbParticles; + } + + /** + * \brief Get distribution center + * \return A point representing the box center + */ + FPoint<FReal> getBoxCenter() const{ return this->centerOfBox; } + /** * The box width from the simulation file opened by the loader * @return box width @@ -502,7 +518,7 @@ private: }; -/**\class FFmaGenericWriter +/** * \warning This class only works in shared memory (doesn't work with MPI). * * \brief Writes a set of particles to an FMA formated file. @@ -529,14 +545,13 @@ private: * \endcode * * `DatatypeSize` can have one of two values: - * - 4, float ; + * - 4, float; * - 8, double. * * `Number_of_records_per_line` gives the data count for each line of * the `Particle_values`. For example : - * - 4, the particle values are X Y Z Q; - * - 8, the particle values are X Y Z Q P FX FY FZ<br> - + * - 4, the particle values are `X Y Z Q`; + * - 8, the particle values are `X Y Z Q P FX FY FZ`. */ template <class FReal> class FFmaGenericWriter { @@ -642,9 +657,9 @@ public: /** * Writes the header of FMA file. 
- * + * * Should be used if we write the particles with writeArrayOfReal method - * + * * @param centerOfBox The center of the Box (FPoint<FReal> class) * @param boxWidth The width of the box * @param nbParticles Number of particles in the box (or to save) @@ -672,7 +687,7 @@ public: * @tparam dataPart The class of the particle array. * @param dataToWrite Array of particles of type dataPart * @param N Number of element in the array - * + * * Example 1 * \code * FmaRParticle * particles = new FmaRParticle[nbParticles]; @@ -682,7 +697,7 @@ public: * Fwriter.writeHeader(Centre,BoxWith, nbParticles,*particles) ; * Fwriter.writeArrayOfParticles(particles, nbParticles); * \endcode - * + * * Example2 * \code * FReal * particles = new FReal[4*NbPoints] ; // store 4 data per particle @@ -736,13 +751,13 @@ public: /** * Write an array of data in a file Fill - * + * * @param dataToWrite array of particles of type FReal * @param nbData number of data per particle * @param N number of particles - * + * * The size of the array is N*nbData - * + * * example * \code * FmaRParticle * const particles = new FmaRParticle[nbParticles]; @@ -853,5 +868,3 @@ private: #endif //FFmaGenericLoader_HPP - - diff --git a/Src/Files/FFmaTsmLoader.hpp b/Src/Files/FFmaTsmLoader.hpp index eadf1137fa1db31fcc6329a7a3b37e072f42168d..d45a64d0f71dcbeba6e3c43c69e27cf2b9505890 100644 --- a/Src/Files/FFmaTsmLoader.hpp +++ b/Src/Files/FFmaTsmLoader.hpp @@ -131,8 +131,8 @@ public: inParticlePositions->setPosition(x,y,z); *inPhysicalValue = data; - if(isTarget) (*particleType) = FParticleTypeTarget; - else (*particleType) = FParticleTypeSource; + if(isTarget) (*particleType) = FParticleType::FParticleTypeTarget; + else (*particleType) = FParticleType::FParticleTypeSource; } }; diff --git a/Src/Files/FGenerateDistribution.hpp b/Src/Files/FGenerateDistribution.hpp index 64d27f0d13173a045ce428a9064e11600dd02a20..7d31f63c16bbc81ff044b08deb23246954e29ac5 100644 --- a/Src/Files/FGenerateDistribution.hpp +++ 
b/Src/Files/FGenerateDistribution.hpp @@ -16,7 +16,12 @@ #ifndef FGENERATEDISTRIBUTION_HPP #define FGENERATEDISTRIBUTION_HPP -// @author O. Coulaud +/** + * \file + * \brief Distribution generation implementations + * \author O. Coulaud + */ + #include <cstdlib> #include <ctime> @@ -27,250 +32,273 @@ #include "Utils/FMath.hpp" #include "Utils/FParameters.hpp" -/** return a random number between 0 and 1 */ - +/** + * \brief Seed the random number generator using current time + */ void initRandom() { - srand48( static_cast<long int>(time(nullptr))) ; -} ; -template <class FReal> -FReal getRandom() { - return static_cast<FReal>(drand48()); - //return static_cast<FReal>(rand()/FReal(RAND_MAX)); -} ; -//! \fn unifRandonPointsOnUnitCube(const int N , FReal * points) - -//! \brief Generate N points uniformly distributed on the unit cube + srand48(static_cast<long int>(time(nullptr))); +} -//! -//! \param N the number of points uniformly randomly sample on the unit cube -//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0.... -//! \example generateDistributions.cpp +/** + * \brief Generate a random number + * \tparam FReal Floating point type + * \return A random number in [0,1] + */ template <class FReal> -void unifRandonPointsOnUnitCube(const FSize N , FReal * points) { - // - initRandom() ; - int j = 0; - for (FSize i = 0 ; i< N ; ++i, j+=4) { - // - points[j] = getRandom<FReal>() ; - points[j+1] = getRandom<FReal>() ; - points[j+2] = getRandom<FReal>() ; - // - } -}; -//! \fn unifRandonPointsOnCube(const int N , FReal * points) - -//! \brief Generate N points uniformly distributed on the cube of length R +FReal getRandom() { + return static_cast<FReal>(drand48()); +} -//! -//! \param N the number of points uniformly randomly sample on the unit cube -//! \param Lx the the X-length of the cube -//! \param Ly the the Y-length of the cube -//! \param Lz the the Z-length of the cube -//! 
\param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0.... -//! \example generateDistributions.cpp +/** + * \brief Generate points uniformly inside a cuboid + * + * \tparam FReal Floating point type + * + * \param N the number of points uniformly randomly sample on the unit cube + * \param Lx the the X-length of the cuboid + * \param Ly the the Y-length of the cuboid + * \param Lz the the Z-length of the cuboid + * \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0... + */ template <class FReal> -void unifRandonPointsOnCube(const FSize N , const FReal& Lx, const FReal &Ly, const FReal& Lz, FReal * points) { - // - unifRandonPointsOnUnitCube(N , points) ; - FSize j =0 ; - for (FSize i = 0 ; i< N ; ++i, j+=4) { - points[j] *= Lx ; - points[j+1] *= Ly ; - points[j+2] *= Lz ; - } -}; -//! \fn unifRandonPointsOnUnitSphere(const int N , FReal * points) +void unifRandomPointsInCube(const FSize N, const FReal& Lx, const FReal& Ly, + const FReal& Lz, FReal* points) +{ + initRandom(); + for(FSize i = 0, j = 0 ; i< N ; ++i, j+=4) { + points[j] = getRandom<FReal>() * Lx; + points[j+1] = getRandom<FReal>() * Ly; + points[j+2] = getRandom<FReal>() * Lz; + } +} -//! \brief Generate N points uniformly distributed on the unit sphere +/** + * \brief Generate points uniformly inside a ball + * + * \tparam FReal Floating point type + * + * \param R the ball radius + * \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0... + */ +template<class FReal> +void unifRandomPointsInBall(const FSize N, const FReal R, FReal* points) { + initRandom(); -//! -//! \param N the number of points uniformly randomly sample on the unit sphere -//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0.... -//! 
\example generateDistributions.cpp -template <class FReal> -void unifRandonPointsOnUnitSphere(const FSize N , FReal * points) { - FReal u, v, theta, phi, sinPhi ; - // - initRandom() ; - FSize j = 0 ; - for (FSize i = 0 ; i< N ; ++i, j+=4) { - // - u = getRandom<FReal>() ; v = getRandom<FReal>() ; - theta = FMath::FTwoPi<FReal>()*u ; - phi = FMath::ACos(2*v-1); - sinPhi = FMath::Sin(phi); - // - points[j] = FMath::Cos(theta)*sinPhi ; - points[j+1] = FMath::Sin(theta)*sinPhi ; - points[j+2] = 2*v-1 ; - // - } -}; -//! \fn nonunifRandonPointsOnElipsoid(const int N , const FReal &a, const FReal &b, const FReal &c, FReal * points) + auto is_in_sphere = [&R](FReal* p) { + return p[0]*p[0] + p[1]*p[1] + p[2]*p[2] < R*R; + }; -//! \brief Generate N points non uniformly distributed on the ellipsoid of aspect ratio a:b:c + for(FSize i = 0, j = 0 ; i< N ; ++i, j+=4) { + do { + points[j] = (getRandom<FReal>() - 0.5) * 2 * R; + points[j+1] = (getRandom<FReal>() - 0.5) * 2 * R; + points[j+2] = (getRandom<FReal>() - 0.5) * 2 * R; + } while(! is_in_sphere(points + j)); + } +} -//! -//! \param N the number of points -//! \param a the x semi-axe length -//! \param b the y semi-axe length -//! \param c the z semi-axe length -//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0.... -//! +/** + * \brief Generate N points non uniformly distributed on the ellipsoid of aspect ratio a:b:c + * + * \tparam FReal Floating point type + * + * \param N the number of points + * \param a the x semi-axe length + * \param b the y semi-axe length + * \param c the z semi-axe length + * \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0.... 
+ */ template <class FReal> -void nonunifRandonPointsOnElipsoid(const FSize N , const FReal &a, const FReal &b, const FReal &c, FReal * points) { - // - FReal u, v , cosu ; - FSize j =0 ; - for (FSize i = 0 ; i< N ; ++i, j+=4) { - u = getRandom<FReal>() ; v = getRandom<FReal>() ; - u = FMath::FPi<FReal>()*u - FMath::FPiDiv2<FReal>(); v = FMath::FTwoPi<FReal>()*v - FMath::FPi<FReal>(); - cosu = FMath::Cos(u) ; - points[j] = a*cosu*FMath::Cos(v) ; - points[j+1] = b*cosu*FMath::Sin(v) ; - points[j+2] = c*FMath::Sin(u) ; - } -}; -//! \fn nonunifRandonPointsOnElipsoid(const int N , const FReal &a, const FReal &c, FReal * points) +void nonunifRandomPointsOnElipsoid(const FSize N, const FReal& a, const FReal& b, + const FReal& c, FReal* points) +{ + FReal u, v, cosu; + for (FSize i = 0, j = 0 ; i< N ; ++i, j+=4) { + u = FMath::FPi<FReal>() * getRandom<FReal>() - FMath::FPiDiv2<FReal>(); + v = FMath::FTwoPi<FReal>() * getRandom<FReal>() - FMath::FPi<FReal>(); + cosu = FMath::Cos(u); + points[j] = a * cosu * FMath::Cos(v); + points[j+1] = b * cosu * FMath::Sin(v); + points[j+2] = c * FMath::Sin(u); + } +} -//! \brief Generate N points uniformly distributed on the ellipsoid of aspect ratio a:a:c -//! -//! \param N the number of points -//! \param a the x semi-axe length -//! \param c the z semi-axe length -//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0.... -//! +/** + * \brief Generate N points uniformly distributed on the ellipsoid of aspect ratio a:a:c + * + * \tparam FReal Floating point type + * + * \param N the number of points + * \param a the x semi-axe length + * \param c the z semi-axe length + * \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0.... 
+*/ template <class FReal> -void unifRandonPointsOnProlate(const FSize N , const FReal &a, const FReal &c, FReal * points){ - // - FReal u, w,v ,ksi ; - FReal e = (a*a*a*a)/(c*c*c*c) ; - bool isgood = false; - FSize j =0 , cpt =0 ; - // - for (FSize i = 0 ; i< N ; ++i, j+=4) { - // Select a random point on the prolate - do { - cpt++ ; - u = getRandom<FReal>() ; v = getRandom<FReal>() ; - u = 2.0*u - 1.0; v = FMath::FTwoPi<FReal>()*v; - w =FMath::Sqrt(1-u*u) ; - points[j] = a*w*FMath::Cos(v) ; - points[j+1] = a*w*FMath::Sin(v) ; - points[j+2] = c*u ; - // Accept the position ? - ksi = a*getRandom<FReal>() ; - // std::cout << "Gradf "<< points[j]*points[j] + points[j+1] *points[j+1] +e*points[j+2] *points[j+2] << std::endl; - isgood = (points[j]*points[j] + points[j+1] *points[j+1] +e*points[j+2] *points[j+2] < ksi*ksi ); - } while (isgood); - } - std::cout.precision(4); - std::cout << "Total tested points: "<< cpt << " % of rejected points: "<<100*static_cast<FReal>(cpt-N)/static_cast<FReal>(cpt) << " %" <<std::endl; - -} ; +void unifRandomPointsOnProlate(const FSize N, const FReal& a, const FReal& c, + FReal* points) +{ + FReal u, w, v, ksi; + FReal e = (a*a*a*a)/(c*c*c*c); + bool isgood = false; + FSize cpt = 0; -//! \fn unifRandonPointsOnHyperPara(const int N , const FReal &a, const FReal &b, const FReal &c, FReal * points) + for (FSize i = 0, j = 0 ; i< N ; ++i, j+=4) { + // Select a random point on the prolate + do { + ++cpt; + u = 2.0 * getRandom<FReal>() - 1.0; + v = FMath::FTwoPi<FReal>() * getRandom<FReal>(); + w = FMath::Sqrt(1 - u*u); + points[j] = a * w * FMath::Cos(v); + points[j+1] = a * w * FMath::Sin(v); + points[j+2] = c * u; + // Accept the position ? 
+ ksi = a * getRandom<FReal>(); + isgood = (points[j]*points[j] + + points[j+1]*points[j+1] + + e*points[j+2]*points[j+2]) < ksi*ksi; + } while(isgood); + } + std::cout.precision(4); + std::cout << "Total tested points: " << cpt + << " % of rejected points: " + << 100 * static_cast<FReal>(cpt-N) / static_cast<FReal>(cpt) << " %" + << std::endl; +} -//! \brief Generate N points uniformly distributed on the hyperbolic paraboloid of aspect ratio a:b:c -//! -//! \param N the number of points -//! \param a the x semi-axe length -//! \param b the y semi-axe length -//! \param c the z semi-axe length -//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0.... -//! +/** + * \brief Generate N points uniformly distributed on the hyperbolic paraboloid of aspect ratio a:b:c + * + * \tparam FReal Floating point type + * + * \param N the number of points + * \param a the x semi-axe length + * \param b the y semi-axe length + * \param c the z semi-axe length + * \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0... + */ template <class FReal> -void unifRandonPointsOnHyperPara(const FSize N , const FReal &a, const FReal &b, const FReal &c, FReal * points) { - // - FReal u, v ; - FSize j =0 ; - for (FSize i = 0 ; i< N ; ++i, j+=4) { - u = 2.0*getRandom<FReal>() - 1.0 ; v = 2.0*getRandom<FReal>() - 1.0 ; - points[j] = a*u ; - points[j+1] = b*v ; - points[j+2] = c*(u*u - v*v) ; +void unifRandomPointsOnHyperPara(const FSize N, const FReal &a, const FReal &b, + const FReal &c, FReal * points) +{ + FReal u, v; + for (FSize i = 0, j = 0 ; i< N ; ++i, j+=4) { + u = 2.0 * getRandom<FReal>() - 1.0; + v = 2.0 * getRandom<FReal>() - 1.0; + points[j] = a * u; + points[j+1] = b * v; + points[j+2] = c * (u*u - v*v); } }; -//! \fn unifRandonPointsOnSphere(const int N , const FReal R, FReal * points) - -//! \brief Generate N points uniformly distributed on the sphere of radius R - -//! -//! 
\param N the number of points uniformly randomly sample on the sphere -//! \param R the radius of the sphere -//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0.... -//! +/** + * \brief Generate N points uniformly distributed on the sphere of radius R + * + * \tparam FReal Floating point type + * + * \param N the number of points uniformly randomly sample on the sphere + * \param R the radius of the sphere + * \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0... + */ template <class FReal> -void unifRandonPointsOnSphere(const FSize N , const FReal R, FReal * points) { - // - unifRandonPointsOnUnitSphere(N , points) ; - FSize j =0 ; - for (FSize i = 0 ; i< N ; ++i, j+=4) { - points[j] *= R ; - points[j+1] *= R ; - points[j+2] *= R ; - } +void unifRandomPointsOnSphere(const FSize N, const FReal R, FReal* points) { + initRandom(); + FReal u, v, theta, phi, sinPhi; + for (FSize i = 0, j = 0 ; i< N ; ++i, j+=4) { + u = getRandom<FReal>(); + v = getRandom<FReal>(); + theta = FMath::FTwoPi<FReal>() * u; + phi = FMath::ACos(2*v - 1); + sinPhi = FMath::Sin(phi); + + points[j] = FMath::Cos(theta) * sinPhi * R; + points[j+1] = FMath::Sin(theta) * sinPhi * R; + points[j+2] = (2*v - 1) * R; + } }; -//! \fn void plummerDist(int & cpt, const FReal &R) -//! \brief Radial Plummer distribution -//! -//! \param cpt : counter to know how many random selections we need to obtain a radius less than R -//! \param R : Radius of the sphere that contains the particles -//! @return Return the radius according to the Plummer distribution either double type or float type -//! 
+/** + * \brief Radial Plummer distribution + * + * \tparam FReal Floating point type + * + * \param cpt counter to know how many random selections we need to obtain a radius less than R + * \param R radius of the sphere that contains the particles + * \return The radius according to the Plummer distribution + */ template <class FReal> -FReal plummerDist(FSize cpt, const FReal &R) { - // - FReal radius ,u ; - do { - // - u = FMath::pow (getRandom<FReal>() , 2.0/3.0) ; - radius = FMath::Sqrt (u/(1.0-u)); - cpt++; - if(radius <=R){ - // std::cout << radius << " " <<std::endl; - return static_cast<FReal>(radius); - } - } while (true); +FReal plummerDist(FSize& cpt, const FReal &R) { + FReal radius, u; + while(true) { + u = FMath::pow(getRandom<FReal>(), 2.0/3.0); + radius = FMath::Sqrt(u/(1.0-u)); + cpt++; + if(radius <= R) { + return static_cast<FReal>(radius); + } + } } -//! \fn void unifRandonPlummer(const int N , const FReal R, const FReal M, FReal * points) - -//! \brief Build N points following the Plummer distribution -//! First we construct N points uniformly distributed on the unit sphere. Then the radius in construct according to the Plummr distribution. -//! -//! \param N the number of points following the Plummer distribution -//! \param R the radius of the sphere that contains all the points -//! \param M the total mass of all the particles inside the Sphere or radius R -//! \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0.... +/** + * \brief Build N points following the Plummer distribution + * + * First we construct N points uniformly distributed on the unit sphere. Then + * the radius in construct according to the Plummer distribution for + * a constant mass of 1/N + * + * \tparam FReal Floating point type + * + * \param N the number of points following the Plummer distribution + * \param R the radius of the sphere that contains all the points + * \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0.... 
+ */ template <class FReal> -void unifRandonPlummer(const FSize N , const FReal R, const FReal M, FReal * points) { - // - unifRandonPointsOnUnitSphere(N , points) ; - // - FReal r , rm= 0.0; - // FReal Coeff = 3.0*M/(4.0*FMath::FPi<FReal>()*R*R*R) ; - //am1 = 0 ;//1/FMath::pow(1+R*R,2.5); - FSize cpt = 0 ; - for (FSize i = 0,j=0 ; i< N ; ++i, j+=4) { - // u \in [] - r = plummerDist(cpt,R) ; - rm = std::max(rm, r); - points[j] *= r ; - points[j+1] *= r ; - points[j+2] *= r ; - } - - std::cout << "Total tested points: "<< cpt << " % of rejected points: " - <<100*static_cast<FReal>(cpt-N)/static_cast<FReal>(cpt) << " %" <<std::endl; +void unifRandomPlummer(const FSize N, const FReal R, FReal * points) { + unifRandomPointsOnSphere<FReal>(N, 1, points); + FReal mc = 1.0/static_cast<FReal>(N); + for (FSize i = 0, j = 0 ; i< N ; ++i, j+=4) { + FReal m = getRandom<FReal>(); + FReal r = FMath::Sqrt( 1.0/(FMath::pow(m, -2.0/3.0) - 1.0)) ; + points[j] *= r; + points[j+1] *= r; + points[j+2] *= r; + points[j+3] = mc; // the mass + } +} +/** + * \brief Build N points following the Plummer like distribution + * + * First we construct N points uniformly distributed on the unit sphere. Then + * the radius in construct according to the Plummer like distribution. + * + * \tparam FReal Floating point type + * + * \param N the number of points following the Plummer distribution + * \param R the radius of the sphere that contains all the points + * \param points array of size 4*N and stores data as follow x,y,z,0,x,y,z,0.... 
+ */ +template <class FReal> +void unifRandomPlummerLike(const FSize N, const FReal R, FReal * points) { + FReal a = 1.0 ; + unifRandomPointsOnSphere<FReal>(N, 1, points); + FReal r, rm = 0.0; + FSize cpt = 0; + for (FSize i = 0, j = 0 ; i< N ; ++i, j+=4) { + r = plummerDist(cpt,R); + rm = std::max(rm, r); + points[j] *= r; + points[j+1] *= r; + points[j+2] *= r; + } -} ; + std::cout << "Total tested points: " << cpt << " % of rejected points: " + << 100 * static_cast<FReal>(cpt-N) / static_cast<FReal>(cpt) + << " %" + << std::endl; +} // #endif diff --git a/Src/Files/FRandomLoader.hpp b/Src/Files/FRandomLoader.hpp index d53417c08e87681afe28e718bc8bd77ae54c64b6..73af9be3ec8622af64b66e13bd51eaf503beb4c0 100644 --- a/Src/Files/FRandomLoader.hpp +++ b/Src/Files/FRandomLoader.hpp @@ -124,8 +124,8 @@ public: void fillParticle(FPoint<FReal>*const inParticlePositions, FParticleType*const isTarget){ FRandomLoader<FReal>::fillParticle(inParticlePositions); - if(FRandomLoader<FReal>::getRandom() > 0.5 ) (*isTarget) = FParticleTypeTarget; - else (*isTarget) = FParticleTypeSource; + if(FRandomLoader<FReal>::getRandom() > 0.5 ) (*isTarget) = FParticleType::FParticleTypeTarget; + else (*isTarget) = FParticleType::FParticleTypeSource; } }; diff --git a/Src/ScalFmmConfig.h.cmake b/Src/ScalFmmConfig.h.cmake index e454565ee48b4eed14b493ca90f8c4f13a7b801e..0c4c2960f17f539e9ae77a78ae489ed6e066c355 100644 --- a/Src/ScalFmmConfig.h.cmake +++ b/Src/ScalFmmConfig.h.cmake @@ -28,7 +28,10 @@ #cmakedefine SCALFMM_USE_BLAS #cmakedefine SCALFMM_USE_MKL_AS_BLAS - +// Fortran Mangling +#cmakedefine SCALFMM_BLAS_ADD_ +#cmakedefine SCALFMM_BLAS_UPCASE +#cmakedefine SCALFMM_BLAS_NOCHANGE //////////////////////////////////////////////////////// // FFT /////////////////////////////////////////////////////// diff --git a/Src/Utils/FAlgorithmTimers.hpp b/Src/Utils/FAlgorithmTimers.hpp index ee41b38f438d0c850e55198a5d8c0074698aae47..f9eb1f2a56a7b70e5cf03258e160a75f8d408eed 100644 --- 
a/Src/Utils/FAlgorithmTimers.hpp +++ b/Src/Utils/FAlgorithmTimers.hpp @@ -17,6 +17,13 @@ #ifndef FALGORITHMTIMERS_HPP #define FALGORITHMTIMERS_HPP +#include <map> +#include <string> + +#include "FTic.hpp" + +using FTimerMap = std::map<std::string, FTic>; + /** * @brief Collection of timers for FMM operators. * @@ -25,56 +32,38 @@ */ class FAlgorithmTimers{ public: - /// The timer names - enum FTimers { - P2MTimer, - M2MTimer, - M2LTimer, - L2LTimer, - L2PTimer, - P2PTimer, - NearTimer, - nbTimers ///< Timer count - }; + static constexpr const char* P2MTimer = "P2M"; + static constexpr const char* M2MTimer = "M2M"; + static constexpr const char* M2LTimer = "M2L"; + static constexpr const char* L2LTimer = "L2L"; + static constexpr const char* L2PTimer = "L2P"; + static constexpr const char* P2PTimer = "P2P"; + static constexpr const char* M2PTimer = "M2P"; + static constexpr const char* P2LTimer = "P2L"; + static constexpr const char* NearTimer = "Near"; + enum {nbTimers = 9}; -protected: - /// Timer array - FTic Timers[nbTimers]; + /// Timers + FTimerMap Timers; -public: /// Constructor: resets all timers - FAlgorithmTimers() - { - for(int i = 0; i < nbTimers ; ++i){ - Timers[i].reset(); - } - } + FAlgorithmTimers() = default; /// Default copy contructor FAlgorithmTimers(const FAlgorithmTimers&) = default; /// Default move contructor FAlgorithmTimers(FAlgorithmTimers&&) = default; - /// Returns the timer array - const FTic * getAllTimers() const { - return Timers; - } - - /// Returns the timer count - int getNbOfTimerRecorded() const { - return nbTimers; - } - /// Elapsed time between last FTic::tic() and FTic::tac() for given timer. - double getTime(FTimers OpeTimer) const{ + double getTime(std::string TimerName) const{ //assert to verify size - return Timers[OpeTimer].elapsed(); + return Timers.at(TimerName).elapsed(); } /// Cumulated time between all FTic::tic() and FTic::tac() for given timer. 
- double getCumulatedTime(FTimers OpeTimer) const{ + double getCumulatedTime(std::string TimerName) const{ //assert to verify size - return Timers[OpeTimer].cumulated(); + return Timers.at(TimerName).cumulated(); } }; diff --git a/Src/Utils/FBlas.hpp b/Src/Utils/FBlas.hpp index ed3bb4576c319dc82344717b6423cc8e5f928a51..f722b0f5f372980fcc9197dc0b9dbd6396578fac 100644 --- a/Src/Utils/FBlas.hpp +++ b/Src/Utils/FBlas.hpp @@ -17,6 +17,7 @@ #define FBLAS_HPP #include "FGlobal.hpp" +#include "FFortranMangling.hpp" #ifndef SCALFMM_USE_BLAS #error The BLAS header is included while SCALFMM_USE_BLAS is turned OFF @@ -30,133 +31,133 @@ // for real namespace scalfmm { -const double D_ZERO = 0.0; -const double D_ONE = 1.0; -const double D_MONE = -1.0; -const float S_ZERO = 0.0; -const float S_ONE = 1.0; -const float S_MONE = -1.0; -// for complex -const double Z_ZERO[2] = {0.0,0.0}; -const double Z_ONE[2] = {1.0,0.0}; -const double Z_MONE[2] = {-1.0,0.0}; -const float C_ZERO[2] = {0.0,0.0}; -const float C_ONE[2] = {1.0,0.0}; -const float C_MONE[2] = {-1.0,0.0}; - -//const double D_PREC = 1e-16; - -const unsigned N_ONE = 1; -const int N_MONE = -1; -const char JOB_STR[] = "NTOSVULCR"; + const double D_ZERO = 0.0; + const double D_ONE = 1.0; + const double D_MONE = -1.0; + const float S_ZERO = 0.0; + const float S_ONE = 1.0; + const float S_MONE = -1.0; + // for complex + const double Z_ZERO[2] = {0.0,0.0}; + const double Z_ONE[2] = {1.0,0.0}; + const double Z_MONE[2] = {-1.0,0.0}; + const float C_ZERO[2] = {0.0,0.0}; + const float C_ONE[2] = {1.0,0.0}; + const float C_MONE[2] = {-1.0,0.0}; + + //const double D_PREC = 1e-16; + + const unsigned N_ONE = 1; + const int N_MONE = -1; + const char JOB_STR[] = "NTOSVULCR"; } extern "C" { - // double ////////////////////////////////////////////////////////// - // blas 1 - double ddot_(const unsigned*, const double*, const unsigned*, const double*, const unsigned*); - void dscal_(const unsigned*, const double*, const double*, const 
unsigned*); - void dcopy_(const unsigned*, const double*, const unsigned*, double*, const unsigned*); - void daxpy_(const unsigned*, const double*, const double*, const unsigned*, double*, const unsigned*); - // blas 2 - void dgemv_(const char*, const unsigned*, const unsigned*, const double*, - const double*, const unsigned*, const double*, const unsigned*, - const double*, double*, const unsigned*); - // blas 3 - void dgemm_(const char*, const char*, const unsigned*, const unsigned*, - const unsigned*, const double*, double*, const unsigned*, - double*, const unsigned*, const double*, double*, const unsigned*); - // lapack - void dgesvd_(const char*, const char*, const unsigned*, const unsigned*, - double*, const unsigned*, double*, double*, const unsigned*, - double*, const unsigned*, double*, const unsigned*, int*); - void dgeqrf_(const unsigned*, const unsigned*, double*, const unsigned*, - double*, double*, const unsigned*, int*); - void dgeqp3_(const unsigned*, const unsigned*, double*, const unsigned*, /*TYPE OF JPIV*/ unsigned*, - double*, double*, const unsigned*, int*); - void dorgqr_(const unsigned*, const unsigned*, const unsigned*, - double*, const unsigned*, double*, double*, const unsigned*, int*); - void dormqr_(const char*, const char*, + // double ////////////////////////////////////////////////////////// + // blas 1 + double Fddot(const unsigned*, const double*, const unsigned*, const double*, const unsigned*); + void Fdscal(const unsigned*, const double*, const double*, const unsigned*); + void Fdcopy(const unsigned*, const double*, const unsigned*, double*, const unsigned*); + void Fdaxpy(const unsigned*, const double*, const double*, const unsigned*, double*, const unsigned*); + // blas 2 + void Fdgemv(const char*, const unsigned*, const unsigned*, const double*, + const double*, const unsigned*, const double*, const unsigned*, + const double*, double*, const unsigned*); + // blas 3 + void Fdgemm(const char*, const char*, const unsigned*, 
const unsigned*, + const unsigned*, const double*, double*, const unsigned*, + double*, const unsigned*, const double*, double*, const unsigned*); + // lapack + void Fdgesvd(const char*, const char*, const unsigned*, const unsigned*, + double*, const unsigned*, double*, double*, const unsigned*, + double*, const unsigned*, double*, const unsigned*, int*); + void Fdgeqrf(const unsigned*, const unsigned*, double*, const unsigned*, + double*, double*, const unsigned*, int*); + void Fdgeqp3(const unsigned*, const unsigned*, double*, const unsigned*, /*TYPE OF JPIV*/ unsigned*, + double*, double*, const unsigned*, int*); + void Fdorgqr(const unsigned*, const unsigned*, const unsigned*, + double*, const unsigned*, double*, double*, const unsigned*, int*); + void Fdormqr(const char*, const char*, const unsigned*, const unsigned*, const unsigned*, - const double*, const unsigned*, + const double*, const unsigned*, double*, double*, const unsigned*, double*, const unsigned*, int*); - void dpotrf_(const char*, const unsigned*, double*, const unsigned*, int*); - - // single ////////////////////////////////////////////////////////// - // blas 1 - float sdot_(const unsigned*, const float*, const unsigned*, const float*, const unsigned*); - void sscal_(const unsigned*, const float*, const float*, const unsigned*); - void scopy_(const unsigned*, const float*, const unsigned*, float*, const unsigned*); - void saxpy_(const unsigned*, const float*, const float*, const unsigned*, float*, const unsigned*); - // blas 2 - void sgemv_(const char*, const unsigned*, const unsigned*, const float*, - const float*, const unsigned*, const float*, const unsigned*, - const float*, float*, const unsigned*); - // blas 3 - void sgemm_(const char*, const char*, const unsigned*, const unsigned*, - const unsigned*, const float*, float*, const unsigned*, - float*, const unsigned*, const float*, float*, const unsigned*); - // lapack - void sgesvd_(const char*, const char*, const unsigned*, const 
unsigned*, - float*, const unsigned*, float*, float*, const unsigned*, - float*, const unsigned*, float*, const unsigned*, int*); - void sgeqrf_(const unsigned*, const unsigned*, float*, const unsigned*, - float*, float*, const unsigned*, int*); - void sgeqp3_(const unsigned*, const unsigned*, float*, const unsigned*, /*TYPE OF JPIV*/ unsigned*, - float*, float*, const unsigned*, int*); - void sorgqr_(const unsigned*, const unsigned*, const unsigned*, - float*, const unsigned*, float*, float*, const unsigned*, int*); - void sormqr_(const char*, const char*, + void Fdpotrf(const char*, const unsigned*, double*, const unsigned*, int*); + + // single ////////////////////////////////////////////////////////// + // blas 1 + float Fsdot(const unsigned*, const float*, const unsigned*, const float*, const unsigned*); + void Fsscal(const unsigned*, const float*, const float*, const unsigned*); + void Fscopy(const unsigned*, const float*, const unsigned*, float*, const unsigned*); + void Fsaxpy(const unsigned*, const float*, const float*, const unsigned*, float*, const unsigned*); + // blas 2 + void Fsgemv(const char*, const unsigned*, const unsigned*, const float*, + const float*, const unsigned*, const float*, const unsigned*, + const float*, float*, const unsigned*); + // blas 3 + void Fsgemm(const char*, const char*, const unsigned*, const unsigned*, + const unsigned*, const float*, float*, const unsigned*, + float*, const unsigned*, const float*, float*, const unsigned*); + // lapack + void Fsgesvd(const char*, const char*, const unsigned*, const unsigned*, + float*, const unsigned*, float*, float*, const unsigned*, + float*, const unsigned*, float*, const unsigned*, int*); + void Fsgeqrf(const unsigned*, const unsigned*, float*, const unsigned*, + float*, float*, const unsigned*, int*); + void Fsgeqp3(const unsigned*, const unsigned*, float*, const unsigned*, /*TYPE OF JPIV*/ unsigned*, + float*, float*, const unsigned*, int*); + void Fsorgqr(const unsigned*, const 
unsigned*, const unsigned*, + float*, const unsigned*, float*, float*, const unsigned*, int*); + void Fsormqr(const char*, const char*, const unsigned*, const unsigned*, const unsigned*, - const float*, const unsigned*, + const float*, const unsigned*, float*, float*, const unsigned*, float*, const unsigned*, int*); - void spotrf_(const char*, const unsigned*, float*, const unsigned*, int*); - - // double complex ////////////////////////////////////////////////// - // blas 1 - void zscal_(const unsigned*, const double*, const double*, const unsigned*); - void zcopy_(const unsigned*, const double*, const unsigned*, double*, const unsigned*); - void zaxpy_(const unsigned*, const double*, const double*, const unsigned*, double*, const unsigned*); - // blas 2 - void zgemv_(const char*, const unsigned*, const unsigned*, const double*, - const double*, const unsigned*, const double*, const unsigned*, - const double*, double*, const unsigned*); - // blas 3 - void zgemm_(const char*, const char*, const unsigned*, const unsigned*, - const unsigned*, const double*, double*, const unsigned*, - double*, const unsigned*, const double*, double*, const unsigned*); - void zgesvd_(const char*, const char*, const unsigned*, const unsigned*, - double*, const unsigned*, double*, double*, const unsigned*, - double*, const unsigned*, double*, int*, double*, int*); - - void zgeqrf_(const unsigned*, const unsigned*, double*, const unsigned*, - double*, double*, const unsigned*, int*); - void zgeqp3_(const unsigned*, const unsigned*, double*, const unsigned*,/*TYPE OF JPIV*/ unsigned*, - double*, double*, const unsigned*, int*); - - void zpotrf_(const char*, const unsigned*, double*, const unsigned*, int*); - - // single complex ////////////////////////////////////////////////// - // blas 1 - void cscal_(const unsigned*, const float*, const float*, const unsigned*); - void ccopy_(const unsigned*, const float*, const unsigned*, float*, const unsigned*); - void caxpy_(const unsigned*, const 
float*, const float*, const unsigned*, float*, const unsigned*); - // blas 2 - void cgemv_(const char*, const unsigned*, const unsigned*, const float*, - const float*, const unsigned*, const float*, const unsigned*, - const float*, float*, const unsigned*); - // blas 3 - void cgemm_(const char*, const char*, const unsigned*, const unsigned*, - const unsigned*, const float*, float*, const unsigned*, - float*, const unsigned*, const float*, float*, const unsigned*); - void cgeqrf_(const unsigned*, const unsigned*, float*, const unsigned*, - float*, float*, const unsigned*, int*); - void cgeqp3_(const unsigned*, const unsigned*, float*, const unsigned*, /*TYPE OF JPIV*/ unsigned*, - float*, float*, const unsigned*, int*); - void cpotrf_(const char*, const unsigned*, float*, const unsigned*, int*); + void Fspotrf(const char*, const unsigned*, float*, const unsigned*, int*); + + // double complex ////////////////////////////////////////////////// + // blas 1 + void Fzscal(const unsigned*, const double*, const double*, const unsigned*); + void Fzcopy(const unsigned*, const double*, const unsigned*, double*, const unsigned*); + void Fzaxpy(const unsigned*, const double*, const double*, const unsigned*, double*, const unsigned*); + // blas 2 + void Fzgemv(const char*, const unsigned*, const unsigned*, const double*, + const double*, const unsigned*, const double*, const unsigned*, + const double*, double*, const unsigned*); + // blas 3 + void Fzgemm(const char*, const char*, const unsigned*, const unsigned*, + const unsigned*, const double*, double*, const unsigned*, + double*, const unsigned*, const double*, double*, const unsigned*); + void Fzgesvd(const char*, const char*, const unsigned*, const unsigned*, + double*, const unsigned*, double*, double*, const unsigned*, + double*, const unsigned*, double*, int*, double*, int*); + + void Fzgeqrf(const unsigned*, const unsigned*, double*, const unsigned*, + double*, double*, const unsigned*, int*); + void Fzgeqp3(const 
unsigned*, const unsigned*, double*, const unsigned*,/*TYPE OF JPIV*/ unsigned*, + double*, double*, const unsigned*, int*); + + void Fzpotrf(const char*, const unsigned*, double*, const unsigned*, int*); + + // single complex ////////////////////////////////////////////////// + // blas 1 + void Fcscal(const unsigned*, const float*, const float*, const unsigned*); + void Fccopy(const unsigned*, const float*, const unsigned*, float*, const unsigned*); + void Fcaxpy(const unsigned*, const float*, const float*, const unsigned*, float*, const unsigned*); + // blas 2 + void Fcgemv(const char*, const unsigned*, const unsigned*, const float*, + const float*, const unsigned*, const float*, const unsigned*, + const float*, float*, const unsigned*); + // blas 3 + void Fcgemm(const char*, const char*, const unsigned*, const unsigned*, + const unsigned*, const float*, float*, const unsigned*, + float*, const unsigned*, const float*, float*, const unsigned*); + void Fcgeqrf(const unsigned*, const unsigned*, float*, const unsigned*, + float*, float*, const unsigned*, int*); + void Fcgeqp3(const unsigned*, const unsigned*, float*, const unsigned*, /*TYPE OF JPIV*/ unsigned*, + float*, float*, const unsigned*, int*); + void Fcpotrf(const char*, const unsigned*, float*, const unsigned*, int*); } @@ -164,460 +165,460 @@ extern "C" namespace FBlas { - // copy - inline void copy(const unsigned n, double* orig, double* dest) - { dcopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); } - inline void copy(const unsigned n, const double* orig, double* dest) - { dcopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); } - inline void copy(const unsigned n, float* orig, float* dest) - { scopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); } - inline void copy(const unsigned n, const float* orig, float* dest) - { scopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); } - inline void c_copy(const unsigned n, double* orig, double* dest) - { zcopy_(&n, orig, &scalfmm::N_ONE, dest, 
&scalfmm::N_ONE); } - inline void c_copy(const unsigned n, const double* orig, double* dest) - { zcopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); } - inline void c_copy(const unsigned n, float* orig, float* dest) - { ccopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); } - inline void c_copy(const unsigned n, const float* orig, float* dest) - { ccopy_(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); } - - // copy (variable increment) - inline void copy(const unsigned n, double* orig, const unsigned inco, double* dest, const unsigned incd) - { dcopy_(&n, orig, &inco, dest, &incd); } - inline void copy(const unsigned n, float* orig, const unsigned inco, float* dest, const unsigned incd) - { scopy_(&n, orig, &inco, dest, &incd); } - inline void c_copy(const unsigned n, double* orig, const unsigned inco, double* dest, const unsigned incd) - { zcopy_(&n, orig, &inco, dest, &incd); } - inline void c_copy(const unsigned n, float* orig, const unsigned inco, float* dest, const unsigned incd) - { ccopy_(&n, orig, &inco, dest, &incd); } - - // scale - inline void scal(const unsigned n, const double d, double* const x) - { dscal_(&n, &d, x, &scalfmm::N_ONE); } - inline void scal(const unsigned n, const float d, float* const x) - { sscal_(&n, &d, x, &scalfmm::N_ONE); } - inline void c_scal(const unsigned n, const double d, double* const x) - { zscal_(&n, &d, x, &scalfmm::N_ONE); } - inline void c_scal(const unsigned n, const float d, float* const x) - { cscal_(&n, &d, x, &scalfmm::N_ONE); } - - // scale (variable increment) - inline void scal(const unsigned n, const double d, double* const x, const unsigned incd) - { dscal_(&n, &d, x, &incd); } - inline void scal(const unsigned n, const float d, float* const x, const unsigned incd) - { sscal_(&n, &d, x, &incd); } - inline void c_scal(const unsigned n, const double d, double* const x, const unsigned incd) - { zscal_(&n, &d, x, &incd); } - inline void c_scal(const unsigned n, const float d, float* const x, const 
unsigned incd) - { cscal_(&n, &d, x, &incd); } - - // set zero - inline void setzero(const unsigned n, double* const x) - { for (unsigned i=0; i<n; ++i) x[i] = 0.0; } - inline void setzero(const unsigned n, float* const x) - { for (unsigned i=0; i<n; ++i) x[i] = 0.0f; } - inline void c_setzero(const unsigned n, double* const x) - { for (unsigned i=0; i<n; ++i) x[i*2] = x[i*2+1] = 0.0; } - inline void c_setzero(const unsigned n, float* const x) - { for (unsigned i=0; i<n; ++i) x[i*2] = x[i*2+1] = 0.0f; } - - // y += x - inline void add(const unsigned n, double* const x, double* const y) - { daxpy_(&n, &scalfmm::D_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } - inline void add(const unsigned n, float* const x, float* const y) - { saxpy_(&n, &scalfmm::S_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } - inline void c_add(const unsigned n, float* const x, float* const y) - { caxpy_(&n, scalfmm::C_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } - inline void c_add(const unsigned n, double* const x,double* const y) - { zaxpy_(&n, scalfmm::Z_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } - - // y += d x - inline void axpy(const unsigned n, const double d, const double* const x, double* const y) - { daxpy_(&n, &d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } - inline void axpy(const unsigned n, const float d, const float* const x, float* const y) - { saxpy_(&n, &d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } - inline void c_axpy(const unsigned n, const float* d, const float* const x, float* const y) - { caxpy_(&n, d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } - inline void c_axpy(const unsigned n, const double* d, const double* const x, double* const y) - { zaxpy_(&n, d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } - - - - // // y = d Ax - // inline void gemv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) - // { cblas_dgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ZERO, y, scalfmm::N_ONE); } - // inline void 
gemv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) - // { cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ZERO, y, scalfmm::N_ONE); } - // y = d Ax - inline void gemv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) - { dgemv_(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ZERO, y, &scalfmm::N_ONE); } - inline void gemv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) - { sgemv_(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ZERO, y, &scalfmm::N_ONE); } - inline void c_gemv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) - { cgemv_(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ZERO, y, &scalfmm::N_ONE); } - inline void c_gemv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y) - { zgemv_(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ZERO, y, &scalfmm::N_ONE); } - - // // y += d Ax - // inline void gemva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) - // { cblas_dgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ONE, y, scalfmm::N_ONE); } - // inline void gemva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) - // { cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ONE, y, scalfmm::N_ONE); } - // y += d Ax - inline void gemva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) - { dgemv_(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ONE, y, &scalfmm::N_ONE); } - inline void gemva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) - { sgemv_(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ONE, y, &scalfmm::N_ONE); } - inline void c_gemva(const unsigned m, const unsigned n, const float* d, 
const float* A, const float *x, float *y) - { cgemv_(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ONE, y, &scalfmm::N_ONE); } - inline void c_gemva(const unsigned m, const unsigned n, const double* d, const double* A, const double *x, double *y) - { zgemv_(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ONE, y, &scalfmm::N_ONE); } - - // // y = d A^T x - // inline void gemtv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) - // { cblas_dgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ZERO, y, scalfmm::N_ONE); } - // inline void gemtv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) - // { cblas_sgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ZERO, y, scalfmm::N_ONE); } - // y = d A^T x - inline void gemtv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) - { dgemv_(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ZERO, y, &scalfmm::N_ONE); } - inline void gemtv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) - { sgemv_(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ZERO, y, &scalfmm::N_ONE); } - inline void c_gemtv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) - { cgemv_(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ZERO, y, &scalfmm::N_ONE); } - inline void c_gemtv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y) - { zgemv_(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ZERO, y, &scalfmm::N_ONE); } - inline void c_gemhv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) - { cgemv_(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ZERO, y, &scalfmm::N_ONE); } // hermitian transposed - inline void c_gemhv(const unsigned m, const unsigned n, double* d, double* A, 
double *x, double *y) - { zgemv_(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ZERO, y, &scalfmm::N_ONE); } // hermitian transposed - - // // y += d A^T x - // inline void gemtva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) - // { cblas_dgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ONE, y, scalfmm::N_ONE); } - // inline void gemtva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) - // { cblas_sgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ONE, y, scalfmm::N_ONE); } - // y += d A^T x - inline void gemtva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) - { dgemv_(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ONE, y, &scalfmm::N_ONE); } - inline void gemtva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) - { sgemv_(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ONE, y, &scalfmm::N_ONE); } - inline void c_gemtva(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) - { cgemv_(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ONE, y, &scalfmm::N_ONE); } - inline void c_gemtva(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y) - { zgemv_(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ONE, y, &scalfmm::N_ONE); } - inline void c_gemhva(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) - { cgemv_(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ONE, y, &scalfmm::N_ONE); } // hermitian transposed - inline void c_gemhva(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y) - { zgemv_(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ONE, y, &scalfmm::N_ONE); } // hermitian transposed - - - - - // C = d A B, A is m x p, B is p x n - inline void 
gemm(unsigned m, unsigned p, unsigned n, double d, - double* A, unsigned ldA, double* B, unsigned ldB, double* C, unsigned ldC) - { dgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ZERO, C, &ldC); } - inline void gemm(unsigned m, unsigned p, unsigned n, float d, - float* A, unsigned ldA, float* B, unsigned ldB, float* C, unsigned ldC) - { sgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ZERO, C, &ldC); } - inline void c_gemm(const unsigned m, const unsigned p, const unsigned n, const float* d, - float* A, const unsigned ldA, float* B, const unsigned ldB, float* C, const unsigned ldC) - { - cgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC); } - inline void c_gemm(const unsigned m, const unsigned p, const unsigned n, const double* d, - double* A, const unsigned ldA, double* B, const unsigned ldB, double* C, const unsigned ldC) - { - zgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC); } - - // C += d A B, A is m x p, B is p x n - inline void gemma(unsigned m, unsigned p, unsigned n, double d, - double* A, unsigned ldA, double* B, unsigned ldB, double* C, unsigned ldC) - { dgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ONE, C, &ldC); } - inline void gemma(unsigned m, unsigned p, unsigned n, float d, - float* A, unsigned ldA, float* B, unsigned ldB, float* C, unsigned ldC) - { sgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ONE, C, &ldC); } - inline void c_gemma(unsigned m, unsigned p, unsigned n, float* d, - float* A, unsigned ldA, float* B, unsigned ldB, float* C, unsigned ldC) - { cgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); } - inline void c_gemma(unsigned m, unsigned p, unsigned n, double* d, - double* A, unsigned ldA, double* B, unsigned ldB, 
double* C, unsigned ldC) - { zgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); } - - // C = d A^T B, A is m x p, B is m x n - inline void gemtm(unsigned m, unsigned p, unsigned n, double d, - double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) - { dgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::D_ZERO, C, &ldC); } - inline void gemtm(unsigned m, unsigned p, unsigned n, float d, - float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) - { sgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::S_ZERO, C, &ldC); } - inline void c_gemtm(unsigned m, unsigned p, unsigned n, float* d, - float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) - { cgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC); } - inline void c_gemtm(unsigned m, unsigned p, unsigned n, double* d, - double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) - { zgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC); } - inline void c_gemhm(unsigned m, unsigned p, unsigned n, float* d, // hermitialn transposed - float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) - { cgemm_(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC); } - inline void c_gemhm(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed - double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) - { zgemm_(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC); } - - // C += d A^T B, A is m x p, B is m x n - inline void gemtma(unsigned m, unsigned p, unsigned n, double d, - double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) - { dgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, 
&m, &d, A, &ldA, B, &ldB, &scalfmm::D_ONE, C, &ldC); } - inline void gemtma(unsigned m, unsigned p, unsigned n, float d, - float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) - { sgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::S_ONE, C, &ldC); } - inline void c_gemtma(unsigned m, unsigned p, unsigned n, float* d, - float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) - { cgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); } - inline void c_gemtma(unsigned m, unsigned p, unsigned n, double* d, - double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) - { zgemm_(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); } - inline void c_gemhma(unsigned m, unsigned p, unsigned n, float* d, // hermitian transposed - float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) - { cgemm_(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); } - inline void c_gemhma(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed - double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) - { zgemm_(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); } + // copy + inline void copy(const unsigned n, double* orig, double* dest) + { Fdcopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); } + inline void copy(const unsigned n, const double* orig, double* dest) + { Fdcopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); } + inline void copy(const unsigned n, float* orig, float* dest) + { Fscopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); } + inline void copy(const unsigned n, const float* orig, float* dest) + { Fscopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); } + inline void c_copy(const unsigned n, double* orig, double* dest) + { Fzcopy(&n, orig, 
&scalfmm::N_ONE, dest, &scalfmm::N_ONE); } + inline void c_copy(const unsigned n, const double* orig, double* dest) + { Fzcopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); } + inline void c_copy(const unsigned n, float* orig, float* dest) + { Fccopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); } + inline void c_copy(const unsigned n, const float* orig, float* dest) + { Fccopy(&n, orig, &scalfmm::N_ONE, dest, &scalfmm::N_ONE); } + + // copy (variable increment) + inline void copy(const unsigned n, double* orig, const unsigned inco, double* dest, const unsigned incd) + { Fdcopy(&n, orig, &inco, dest, &incd); } + inline void copy(const unsigned n, float* orig, const unsigned inco, float* dest, const unsigned incd) + { Fscopy(&n, orig, &inco, dest, &incd); } + inline void c_copy(const unsigned n, double* orig, const unsigned inco, double* dest, const unsigned incd) + { Fzcopy(&n, orig, &inco, dest, &incd); } + inline void c_copy(const unsigned n, float* orig, const unsigned inco, float* dest, const unsigned incd) + { Fccopy(&n, orig, &inco, dest, &incd); } + + // scale + inline void scal(const unsigned n, const double d, double* const x) + { Fdscal(&n, &d, x, &scalfmm::N_ONE); } + inline void scal(const unsigned n, const float d, float* const x) + { Fsscal(&n, &d, x, &scalfmm::N_ONE); } + inline void c_scal(const unsigned n, const double d, double* const x) + { Fzscal(&n, &d, x, &scalfmm::N_ONE); } + inline void c_scal(const unsigned n, const float d, float* const x) + { Fcscal(&n, &d, x, &scalfmm::N_ONE); } + + // scale (variable increment) + inline void scal(const unsigned n, const double d, double* const x, const unsigned incd) + { Fdscal(&n, &d, x, &incd); } + inline void scal(const unsigned n, const float d, float* const x, const unsigned incd) + { Fsscal(&n, &d, x, &incd); } + inline void c_scal(const unsigned n, const double d, double* const x, const unsigned incd) + { Fzscal(&n, &d, x, &incd); } + inline void c_scal(const unsigned n, const float d, 
float* const x, const unsigned incd) + { Fcscal(&n, &d, x, &incd); } + + // set zero + inline void setzero(const unsigned n, double* const x) + { for (unsigned i=0; i<n; ++i) x[i] = 0.0; } + inline void setzero(const unsigned n, float* const x) + { for (unsigned i=0; i<n; ++i) x[i] = 0.0f; } + inline void c_setzero(const unsigned n, double* const x) + { for (unsigned i=0; i<n; ++i) x[i*2] = x[i*2+1] = 0.0; } + inline void c_setzero(const unsigned n, float* const x) + { for (unsigned i=0; i<n; ++i) x[i*2] = x[i*2+1] = 0.0f; } + + // y += x + inline void add(const unsigned n, double* const x, double* const y) + { Fdaxpy(&n, &scalfmm::D_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } + inline void add(const unsigned n, float* const x, float* const y) + { Fsaxpy(&n, &scalfmm::S_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } + inline void c_add(const unsigned n, float* const x, float* const y) + { Fcaxpy(&n, scalfmm::C_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } + inline void c_add(const unsigned n, double* const x,double* const y) + { Fzaxpy(&n, scalfmm::Z_ONE, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } + + // y += d x + inline void axpy(const unsigned n, const double d, const double* const x, double* const y) + { Fdaxpy(&n, &d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } + inline void axpy(const unsigned n, const float d, const float* const x, float* const y) + { Fsaxpy(&n, &d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } + inline void c_axpy(const unsigned n, const float* d, const float* const x, float* const y) + { Fcaxpy(&n, d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } + inline void c_axpy(const unsigned n, const double* d, const double* const x, double* const y) + { Fzaxpy(&n, d, x, &scalfmm::N_ONE, y, &scalfmm::N_ONE); } + + + + // // y = d Ax + // inline void gemv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) + // { cblas_dgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ZERO, y, scalfmm::N_ONE); 
} + // inline void gemv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) + // { cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ZERO, y, scalfmm::N_ONE); } + // y = d Ax + inline void gemv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) + { Fdgemv(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ZERO, y, &scalfmm::N_ONE); } + inline void gemv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) + { Fsgemv(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ZERO, y, &scalfmm::N_ONE); } + inline void c_gemv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) + { Fcgemv(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ZERO, y, &scalfmm::N_ONE); } + inline void c_gemv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y) + { Fzgemv(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ZERO, y, &scalfmm::N_ONE); } + + // // y += d Ax + // inline void gemva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) + // { cblas_dgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ONE, y, scalfmm::N_ONE); } + // inline void gemva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) + // { cblas_sgemv(CblasColMajor, CblasNoTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ONE, y, scalfmm::N_ONE); } + // y += d Ax + inline void gemva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) + { Fdgemv(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ONE, y, &scalfmm::N_ONE); } + inline void gemva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) + { Fsgemv(scalfmm::JOB_STR, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ONE, y, &scalfmm::N_ONE); } + inline void c_gemva(const unsigned m, const unsigned n, 
const float* d, const float* A, const float *x, float *y) + { Fcgemv(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ONE, y, &scalfmm::N_ONE); } + inline void c_gemva(const unsigned m, const unsigned n, const double* d, const double* A, const double *x, double *y) + { Fzgemv(scalfmm::JOB_STR, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ONE, y, &scalfmm::N_ONE); } + + // // y = d A^T x + // inline void gemtv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) + // { cblas_dgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ZERO, y, scalfmm::N_ONE); } + // inline void gemtv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) + // { cblas_sgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ZERO, y, scalfmm::N_ONE); } + // y = d A^T x + inline void gemtv(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) + { Fdgemv(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ZERO, y, &scalfmm::N_ONE); } + inline void gemtv(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) + { Fsgemv(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ZERO, y, &scalfmm::N_ONE); } + inline void c_gemtv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) + { Fcgemv(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ZERO, y, &scalfmm::N_ONE); } + inline void c_gemtv(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y) + { Fzgemv(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ZERO, y, &scalfmm::N_ONE); } + inline void c_gemhv(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) + { Fcgemv(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ZERO, y, &scalfmm::N_ONE); } // hermitian transposed + inline void c_gemhv(const unsigned m, const unsigned n, double* 
d, double* A, double *x, double *y) + { Fzgemv(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ZERO, y, &scalfmm::N_ONE); } // hermitian transposed + + // // y += d A^T x + // inline void gemtva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) + // { cblas_dgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::D_ONE, y, scalfmm::N_ONE); } + // inline void gemtva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) + // { cblas_sgemv(CblasColMajor, CblasTrans, m, n, d, A, m, x, scalfmm::N_ONE, scalfmm::S_ONE, y, scalfmm::N_ONE); } + // y += d A^T x + inline void gemtva(const unsigned m, const unsigned n, double d, double* A, double *x, double *y) + { Fdgemv(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::D_ONE, y, &scalfmm::N_ONE); } + inline void gemtva(const unsigned m, const unsigned n, float d, float* A, float *x, float *y) + { Fsgemv(scalfmm::JOB_STR+1, &m, &n, &d, A, &m, x, &scalfmm::N_ONE, &scalfmm::S_ONE, y, &scalfmm::N_ONE); } + inline void c_gemtva(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) + { Fcgemv(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ONE, y, &scalfmm::N_ONE); } + inline void c_gemtva(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y) + { Fzgemv(scalfmm::JOB_STR+1, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ONE, y, &scalfmm::N_ONE); } + inline void c_gemhva(const unsigned m, const unsigned n, float* d, float* A, float *x, float *y) + { Fcgemv(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::C_ONE, y, &scalfmm::N_ONE); } // hermitian transposed + inline void c_gemhva(const unsigned m, const unsigned n, double* d, double* A, double *x, double *y) + { Fzgemv(scalfmm::JOB_STR+7, &m, &n, d, A, &m, x, &scalfmm::N_ONE, scalfmm::Z_ONE, y, &scalfmm::N_ONE); } // hermitian transposed + + + + + // C = d A B, A is m x p, B is p x n + 
inline void gemm(unsigned m, unsigned p, unsigned n, double d, + double* A, unsigned ldA, double* B, unsigned ldB, double* C, unsigned ldC) + { Fdgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ZERO, C, &ldC); } + inline void gemm(unsigned m, unsigned p, unsigned n, float d, + float* A, unsigned ldA, float* B, unsigned ldB, float* C, unsigned ldC) + { Fsgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ZERO, C, &ldC); } + inline void c_gemm(const unsigned m, const unsigned p, const unsigned n, const float* d, + float* A, const unsigned ldA, float* B, const unsigned ldB, float* C, const unsigned ldC) + { + Fcgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC); } + inline void c_gemm(const unsigned m, const unsigned p, const unsigned n, const double* d, + double* A, const unsigned ldA, double* B, const unsigned ldB, double* C, const unsigned ldC) + { + Fzgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC); } + + // C += d A B, A is m x p, B is p x n + inline void gemma(unsigned m, unsigned p, unsigned n, double d, + double* A, unsigned ldA, double* B, unsigned ldB, double* C, unsigned ldC) + { Fdgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ONE, C, &ldC); } + inline void gemma(unsigned m, unsigned p, unsigned n, float d, + float* A, unsigned ldA, float* B, unsigned ldB, float* C, unsigned ldC) + { Fsgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ONE, C, &ldC); } + inline void c_gemma(unsigned m, unsigned p, unsigned n, float* d, + float* A, unsigned ldA, float* B, unsigned ldB, float* C, unsigned ldC) + { Fcgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); } + inline void c_gemma(unsigned m, unsigned p, unsigned n, double* d, + double* A, unsigned ldA, double* B, 
unsigned ldB, double* C, unsigned ldC) + { Fzgemm(scalfmm::JOB_STR, scalfmm::JOB_STR, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); } + + // C = d A^T B, A is m x p, B is m x n + inline void gemtm(unsigned m, unsigned p, unsigned n, double d, + double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) + { Fdgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::D_ZERO, C, &ldC); } + inline void gemtm(unsigned m, unsigned p, unsigned n, float d, + float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) + { Fsgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::S_ZERO, C, &ldC); } + inline void c_gemtm(unsigned m, unsigned p, unsigned n, float* d, + float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) + { Fcgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC); } + inline void c_gemtm(unsigned m, unsigned p, unsigned n, double* d, + double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) + { Fzgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC); } + inline void c_gemhm(unsigned m, unsigned p, unsigned n, float* d, // hermitialn transposed + float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) + { Fcgemm(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC); } + inline void c_gemhm(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed + double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) + { Fzgemm(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC); } + + // C += d A^T B, A is m x p, B is m x n + inline void gemtma(unsigned m, unsigned p, unsigned n, double d, + double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) + { Fdgemm(scalfmm::JOB_STR+1, 
scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::D_ONE, C, &ldC); } + inline void gemtma(unsigned m, unsigned p, unsigned n, float d, + float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) + { Fsgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, &d, A, &ldA, B, &ldB, &scalfmm::S_ONE, C, &ldC); } + inline void c_gemtma(unsigned m, unsigned p, unsigned n, float* d, + float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) + { Fcgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); } + inline void c_gemtma(unsigned m, unsigned p, unsigned n, double* d, + double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) + { Fzgemm(scalfmm::JOB_STR+1, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); } + inline void c_gemhma(unsigned m, unsigned p, unsigned n, float* d, // hermitian transposed + float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) + { Fcgemm(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); } + inline void c_gemhma(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed + double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) + { Fzgemm(scalfmm::JOB_STR+7, scalfmm::JOB_STR, &p, &n, &m, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); } - // C = d A B^T, A is m x p, B is n x p - inline void gemmt(unsigned m, unsigned p, unsigned n, double d, - double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) - { dgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ZERO, C, &ldC); } - inline void gemmt(unsigned m, unsigned p, unsigned n, float d, - float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) - { sgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ZERO, C, &ldC); } - inline void c_gemmt(unsigned m, unsigned p, 
unsigned n, float d, - float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) - { cgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC); } - inline void c_gemmt(unsigned m, unsigned p, unsigned n, double d, - double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) - { zgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC); } - inline void c_gemmh(unsigned m, unsigned p, unsigned n, float d, // hermitian transposed - float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) - { cgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, &d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC); } - inline void c_gemmh(unsigned m, unsigned p, unsigned n, double d, // hermitian transposed - double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) - { zgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, &d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC); } - - // C += d A B^T, A is m x p, B is n x p - inline void gemmta(unsigned m, unsigned p, unsigned n, double d, - double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) - { dgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ONE, C, &ldC); } - inline void gemmta(unsigned m, unsigned p, unsigned n, float d, - float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) - { sgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ONE, C, &ldC); } - inline void c_gemmta(unsigned m, unsigned p, unsigned n, float* d, - float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) - { cgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); } - inline void c_gemmta(unsigned m, unsigned p, unsigned n, double* d, - double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) - { 
zgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); } - inline void c_gemmha(unsigned m, unsigned p, unsigned n, float* d, // hermitian transposed - float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) - { cgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); } - inline void c_gemmha(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed - double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) - { zgemm_(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); } - - - // singular value decomposition - // - inline int gesvd(unsigned m, unsigned n, double* A, double* S, double* VT, unsigned ldVT, - unsigned nwk, double* wk) - { - int INF; - dgesvd_(scalfmm::JOB_STR+2, scalfmm::JOB_STR+3, &m, &n, A, &m, S, A, &m, VT, &ldVT, wk, &nwk, &INF); - return INF; - } - // - // A = U * SIGMA * conjugate-transpose(V) - // scalfmm::JOB_STR+2 = 'O': the first min(m,n) columns of U (the left singular vectors) are overwritten on the array A; - inline int c_gesvd(unsigned m, unsigned n, double* A, double* S, double* VT, unsigned ldVT, - int& nwk, double* wk,double* rwk) - { - int INF; - zgesvd_(scalfmm::JOB_STR+2, scalfmm::JOB_STR+3, &m, &n, A, &m, S, A, &m, VT, &ldVT, wk, &nwk, rwk,&INF); - return INF; - } - - inline int gesvd(unsigned m, unsigned n, float* A, float* S, float* VT, unsigned ldVT, - unsigned nwk, float* wk) - { - int INF; - sgesvd_(scalfmm::JOB_STR+2, scalfmm::JOB_STR+3, &m, &n, A, &m, S, A, &m, VT, &ldVT, wk, &nwk, &INF); - return INF; - } - - // singular value decomposition (SO) - inline int gesvdSO(unsigned m, unsigned n, double* A, double* S, double* U, unsigned ldU, - unsigned nwk, double* wk) - { - int INF; - dgesvd_(scalfmm::JOB_STR+3, scalfmm::JOB_STR+2, &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF); - return INF; - } - inline int gesvdSO(unsigned m, unsigned n, 
float* A, float* S, float* U, unsigned ldU, - unsigned nwk, float* wk) - { - int INF; - sgesvd_(scalfmm::JOB_STR+3, scalfmm::JOB_STR+2, &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF); - return INF; - } - - // singular value decomposition (AA) - inline int gesvdAA(unsigned m, unsigned n, double* A, double* S, double* U, unsigned ldU, - unsigned nwk, double* wk) - { - int INF; - dgesvd_("A", "A", &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF); - return INF; - } - inline int gesvdAA(unsigned m, unsigned n, float* A, float* S, float* U, unsigned ldU, - unsigned nwk, float* wk) - { - int INF; - sgesvd_("A", "A", &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF); - return INF; - } - - // Scalar product v1'*v2 - inline double scpr(const unsigned n, const double* const v1, const double* const v2) - { return ddot_(&n, v1, &scalfmm::N_ONE, v2, &scalfmm::N_ONE); } - inline float scpr(const unsigned n, const float* const v1, const float* const v2) - { return sdot_(&n, v1, &scalfmm::N_ONE, v2, &scalfmm::N_ONE); } - - - - // QR factorisation - inline int geqrf(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk) - { - int INF; - dgeqrf_(&m, &n, A, &m, tau, wk, &nwk, &INF); - return INF; - } - inline int geqrf(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk) - { - int INF; - sgeqrf_(&m, &n, A, &m, tau, wk, &nwk, &INF); - return INF; - } - // QR factorisation with column pivoting - inline int geqp3(const unsigned m, const unsigned n, double* A, unsigned* jpiv, double* tau, unsigned nwk, double* wk) - { - int INF; - dgeqp3_(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF); - return INF; - } - inline int geqp3(const unsigned m, const unsigned n, float* A, unsigned* jpiv, float* tau, unsigned nwk, float* wk) - { - int INF; - sgeqp3_(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF); - return INF; - } - inline int c_geqrf(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk) - { - int INF; - 
cgeqrf_(&m, &n, A, &m, tau, wk, &nwk, &INF); - return INF; - } + // C = d A B^T, A is m x p, B is n x p + inline void gemmt(unsigned m, unsigned p, unsigned n, double d, + double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) + { Fdgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ZERO, C, &ldC); } + inline void gemmt(unsigned m, unsigned p, unsigned n, float d, + float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) + { Fsgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ZERO, C, &ldC); } + inline void c_gemmt(unsigned m, unsigned p, unsigned n, float d, + float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) + { Fcgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC); } + inline void c_gemmt(unsigned m, unsigned p, unsigned n, double d, + double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) + { Fzgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC); } + inline void c_gemmh(unsigned m, unsigned p, unsigned n, float d, // hermitian transposed + float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) + { Fcgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, &d, A, &ldA, B, &ldB, scalfmm::C_ZERO, C, &ldC); } + inline void c_gemmh(unsigned m, unsigned p, unsigned n, double d, // hermitian transposed + double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) + { Fzgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, &d, A, &ldA, B, &ldB, scalfmm::Z_ZERO, C, &ldC); } + + // C += d A B^T, A is m x p, B is n x p + inline void gemmta(unsigned m, unsigned p, unsigned n, double d, + double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) + { Fdgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::D_ONE, C, &ldC); } + inline void 
gemmta(unsigned m, unsigned p, unsigned n, float d, + float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) + { Fsgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, &d, A, &ldA, B, &ldB, &scalfmm::S_ONE, C, &ldC); } + inline void c_gemmta(unsigned m, unsigned p, unsigned n, float* d, + float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) + { Fcgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); } + inline void c_gemmta(unsigned m, unsigned p, unsigned n, double* d, + double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) + { Fzgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+1, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); } + inline void c_gemmha(unsigned m, unsigned p, unsigned n, float* d, // hermitian transposed + float* A, unsigned ldA, float *B, unsigned ldB, float* C, unsigned ldC) + { Fcgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::C_ONE, C, &ldC); } + inline void c_gemmha(unsigned m, unsigned p, unsigned n, double* d, // hermitian transposed + double* A, unsigned ldA, double *B, unsigned ldB, double* C, unsigned ldC) + { Fzgemm(scalfmm::JOB_STR, scalfmm::JOB_STR+7, &m, &n, &p, d, A, &ldA, B, &ldB, scalfmm::Z_ONE, C, &ldC); } + + + // singular value decomposition + // + inline int gesvd(unsigned m, unsigned n, double* A, double* S, double* VT, unsigned ldVT, + unsigned nwk, double* wk) + { + int INF; + Fdgesvd(scalfmm::JOB_STR+2, scalfmm::JOB_STR+3, &m, &n, A, &m, S, A, &m, VT, &ldVT, wk, &nwk, &INF); + return INF; + } + // + // A = U * SIGMA * conjugate-transpose(V) + // scalfmm::JOB_STR+2 = 'O': the first min(m,n) columns of U (the left singular vectors) are overwritten on the array A; + inline int c_gesvd(unsigned m, unsigned n, double* A, double* S, double* VT, unsigned ldVT, + int& nwk, double* wk,double* rwk) + { + int INF; + Fzgesvd(scalfmm::JOB_STR+2, scalfmm::JOB_STR+3, &m, &n, A, &m, S, A, &m, 
VT, &ldVT, wk, &nwk, rwk,&INF); + return INF; + } + + inline int gesvd(unsigned m, unsigned n, float* A, float* S, float* VT, unsigned ldVT, + unsigned nwk, float* wk) + { + int INF; + Fsgesvd(scalfmm::JOB_STR+2, scalfmm::JOB_STR+3, &m, &n, A, &m, S, A, &m, VT, &ldVT, wk, &nwk, &INF); + return INF; + } + + // singular value decomposition (SO) + inline int gesvdSO(unsigned m, unsigned n, double* A, double* S, double* U, unsigned ldU, + unsigned nwk, double* wk) + { + int INF; + Fdgesvd(scalfmm::JOB_STR+3, scalfmm::JOB_STR+2, &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF); + return INF; + } + inline int gesvdSO(unsigned m, unsigned n, float* A, float* S, float* U, unsigned ldU, + unsigned nwk, float* wk) + { + int INF; + Fsgesvd(scalfmm::JOB_STR+3, scalfmm::JOB_STR+2, &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF); + return INF; + } + + // singular value decomposition (AA) + inline int gesvdAA(unsigned m, unsigned n, double* A, double* S, double* U, unsigned ldU, + unsigned nwk, double* wk) + { + int INF; + Fdgesvd("A", "A", &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF); + return INF; + } + inline int gesvdAA(unsigned m, unsigned n, float* A, float* S, float* U, unsigned ldU, + unsigned nwk, float* wk) + { + int INF; + Fsgesvd("A", "A", &m, &n, A, &m, S, U, &m, A, &ldU, wk, &nwk, &INF); + return INF; + } + + // Scalar product v1'*v2 + inline double scpr(const unsigned n, const double* const v1, const double* const v2) + { return Fddot(&n, v1, &scalfmm::N_ONE, v2, &scalfmm::N_ONE); } + inline float scpr(const unsigned n, const float* const v1, const float* const v2) + { return Fsdot(&n, v1, &scalfmm::N_ONE, v2, &scalfmm::N_ONE); } + + + + // QR factorisation + inline int geqrf(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk) + { + int INF; + Fdgeqrf(&m, &n, A, &m, tau, wk, &nwk, &INF); + return INF; + } + inline int geqrf(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk) + { + int INF; + 
Fsgeqrf(&m, &n, A, &m, tau, wk, &nwk, &INF); + return INF; + } + // QR factorisation with column pivoting + inline int geqp3(const unsigned m, const unsigned n, double* A, unsigned* jpiv, double* tau, unsigned nwk, double* wk) + { + int INF; + Fdgeqp3(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF); + return INF; + } + inline int geqp3(const unsigned m, const unsigned n, float* A, unsigned* jpiv, float* tau, unsigned nwk, float* wk) + { + int INF; + Fsgeqp3(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF); + return INF; + } + inline int c_geqrf(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk) + { + int INF; + Fcgeqrf(&m, &n, A, &m, tau, wk, &nwk, &INF); + return INF; + } - inline int c_geqrf(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk) - { - int INF; - zgeqrf_(&m, &n, A, &m, tau, wk, &nwk, &INF); - return INF; - } - inline int c_geqp3(const unsigned m, const unsigned n, float* A, unsigned* jpiv, float* tau, unsigned nwk, float* wk) - { - int INF; - cgeqp3_(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF); - return INF; - } + inline int c_geqrf(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk) + { + int INF; + Fzgeqrf(&m, &n, A, &m, tau, wk, &nwk, &INF); + return INF; + } + inline int c_geqp3(const unsigned m, const unsigned n, float* A, unsigned* jpiv, float* tau, unsigned nwk, float* wk) + { + int INF; + Fcgeqp3(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF); + return INF; + } - inline int c_geqp3(const unsigned m, const unsigned n, double* A, unsigned* jpiv, double* tau, unsigned nwk, double* wk) - { - int INF; - zgeqp3_(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF); - return INF; - } - - // return full of Q-Matrix (QR factorization) in A - inline int orgqr_full(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk) - { - int INF; - dorgqr_(&m, &m, &n, A, &m, tau, wk, &nwk, &INF); - return INF; - } - inline int orgqr_full(const unsigned m, const 
unsigned n, float* A, float* tau, unsigned nwk, float* wk) - { - int INF; - sorgqr_(&m, &m, &n, A, &m, tau, wk, &nwk, &INF); - return INF; - } - // return the leading n columns of Q-Matrix (QR factorization) in A - inline int orgqr(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk) - { - int INF; - dorgqr_(&m, &n, &n, A, &m, tau, wk, &nwk, &INF); - return INF; - } - inline int orgqr(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk) - { - int INF; - sorgqr_(&m, &n, &n, A, &m, tau, wk, &nwk, &INF); - return INF; - } - - - - // apply Q-Matrix (from QR factorization) to C - // LEFT: Q(^T)C - inline int left_ormqr(const char* TRANS, const unsigned m, const unsigned n, const double* A, double* tau, double* C, unsigned nwk, double* wk) - { - int INF; - dormqr_("L", TRANS, &m, &n, &m, A, &m, tau, C, &m, wk, &nwk, &INF); - return INF; - } - inline int left_ormqr(const char* TRANS, const unsigned m, const unsigned n, const float* A, float* tau, float* C, unsigned nwk, float* wk) - { - int INF; - sormqr_("L", TRANS, &m, &n, &m, A, &m, tau, C, &m, wk, &nwk, &INF); - return INF; - } - // RIGHT: CQ(^T) - inline int right_ormqr(const char* TRANS, const unsigned m, const unsigned n, const double* A, double* tau, double* C, unsigned nwk, double* wk) - { - int INF; - dormqr_("R", TRANS, &m, &n, &n, A, &n, tau, C, &m, wk, &nwk, &INF); - return INF; - } - inline int right_ormqr(const char* TRANS, const unsigned m, const unsigned n, const float* A, float* tau, float* C, unsigned nwk, float* wk) - { - int INF; - sormqr_("R", TRANS, &m, &n, &n, A, &n, tau, C, &m, wk, &nwk, &INF); - return INF; - } - - // Cholesky decomposition: A=LL^T (if A is symmetric definite positive) - inline int potrf(const unsigned m, double* A, const unsigned n) - { - int INF; - dpotrf_("L", &m, A, &n, &INF); - return INF; - } - inline int potrf(const unsigned m, float* A, const unsigned n) - { - int INF; - spotrf_("L", &m, A, &n, &INF); - return 
INF; - } + inline int c_geqp3(const unsigned m, const unsigned n, double* A, unsigned* jpiv, double* tau, unsigned nwk, double* wk) + { + int INF; + Fzgeqp3(&m, &n, A, &m, jpiv, tau, wk, &nwk, &INF); + return INF; + } + + // return full of Q-Matrix (QR factorization) in A + inline int orgqr_full(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk) + { + int INF; + Fdorgqr(&m, &m, &n, A, &m, tau, wk, &nwk, &INF); + return INF; + } + inline int orgqr_full(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk) + { + int INF; + Fsorgqr(&m, &m, &n, A, &m, tau, wk, &nwk, &INF); + return INF; + } + // return the leading n columns of Q-Matrix (QR factorization) in A + inline int orgqr(const unsigned m, const unsigned n, double* A, double* tau, unsigned nwk, double* wk) + { + int INF; + Fdorgqr(&m, &n, &n, A, &m, tau, wk, &nwk, &INF); + return INF; + } + inline int orgqr(const unsigned m, const unsigned n, float* A, float* tau, unsigned nwk, float* wk) + { + int INF; + Fsorgqr(&m, &n, &n, A, &m, tau, wk, &nwk, &INF); + return INF; + } + + + + // apply Q-Matrix (from QR factorization) to C + // LEFT: Q(^T)C + inline int left_ormqr(const char* TRANS, const unsigned m, const unsigned n, const double* A, double* tau, double* C, unsigned nwk, double* wk) + { + int INF; + Fdormqr("L", TRANS, &m, &n, &m, A, &m, tau, C, &m, wk, &nwk, &INF); + return INF; + } + inline int left_ormqr(const char* TRANS, const unsigned m, const unsigned n, const float* A, float* tau, float* C, unsigned nwk, float* wk) + { + int INF; + Fsormqr("L", TRANS, &m, &n, &m, A, &m, tau, C, &m, wk, &nwk, &INF); + return INF; + } + // RIGHT: CQ(^T) + inline int right_ormqr(const char* TRANS, const unsigned m, const unsigned n, const double* A, double* tau, double* C, unsigned nwk, double* wk) + { + int INF; + Fdormqr("R", TRANS, &m, &n, &n, A, &n, tau, C, &m, wk, &nwk, &INF); + return INF; + } + inline int right_ormqr(const char* TRANS, const unsigned 
m, const unsigned n, const float* A, float* tau, float* C, unsigned nwk, float* wk) + { + int INF; + Fsormqr("R", TRANS, &m, &n, &n, A, &n, tau, C, &m, wk, &nwk, &INF); + return INF; + } + + // Cholesky decomposition: A=LL^T (if A is symmetric definite positive) + inline int potrf(const unsigned m, double* A, const unsigned n) + { + int INF; + Fdpotrf("L", &m, A, &n, &INF); + return INF; + } + inline int potrf(const unsigned m, float* A, const unsigned n) + { + int INF; + Fspotrf("L", &m, A, &n, &INF); + return INF; + } } // end namespace FCBlas diff --git a/Src/Utils/FFortranMangling.hpp b/Src/Utils/FFortranMangling.hpp new file mode 100644 index 0000000000000000000000000000000000000000..92fc3f42e37674709aabf8c8b27c2e4022b9ea5c --- /dev/null +++ b/Src/Utils/FFortranMangling.hpp @@ -0,0 +1,91 @@ +/* + * FFortranMangling.hpp + * + * Created on: 6 juin 2016 + * Author: coulaud + */ + +#ifndef SRC_UTILS_FFORTRANMANGLING_HPP_ +#define SRC_UTILS_FFORTRANMANGLING_HPP_ + + +#include "ScalFmmConfig.h" + +#ifdef SCALFMM_BLAS_ADD_ +/* Mangling for Fortran subroutine symbols with underscores. */ + +#define FortranName(name,NAME) name##_ + +#elif defined(SCALFMM_BLAS_UPCASE) + +/* Mangling for Fortran subroutine symbols in uppercase and without underscores */ + +#define FortranName(name,NAME) NAME + +#elif defined(SCALFMM_BLAS_NOCHANGE) +/* Mangling for Fortran subroutine symbols without no change. 
*/ + +#define FortranName(name,NAME) name + +#else + +#error("Fortran MANGLING NOT DEFINED") + +#endif + + // blas 1 +#define Fddot FortranName(ddot,DDOT) +#define Fdscal FortranName(dscal,DSCAL) +#define Fdcopy FortranName(dcopy,DCOPY) +#define Fdaxpy FortranName(daxpy,DAXPY) +#define Fsdot FortranName(sdot,SDOT) +#define Fsscal FortranName(sscal,SSCAL) +#define Fscopy FortranName(scopy,SCOPY) +#define Fsaxpy FortranName(saxpy,SAXPY) +#define Fcscal FortranName(cscal,CSCAL) +#define Fccopy FortranName(ccopy,CCOPY) +#define Fcaxpy FortranName(caxpy,CAXPY) +#define Fzscal FortranName(zscal,ZSCAL) +#define Fzcopy FortranName(zcopy,ZCOPY) +#define Fzaxpy FortranName(zaxpy,ZAXPY) +// blas 2 +#define Fdgemv FortranName(dgemv,DGEMV) +#define Fsgemv FortranName(sgemv,SGEMV) +#define Fcgemv FortranName(cgemv,CGEMV) +#define Fzgemv FortranName(zgemv,ZGEMV) + // blas 3 +#define Fdgemm FortranName(dgemm,DGEMM) +#define Fsgemm FortranName(sgemm,SGEMM) +#define Fcgemm FortranName(cgemm,CGEMM) +#define Fzgemm FortranName(zgemm,ZGEMM) + // lapack +#define Fdgesvd FortranName(dgesvd,DGESVD) +#define Fdgeqrf FortranName(dgeqrf,DGEQRF) +#define Fdgeqp3 FortranName(dgeqp3,DGEQP3) +#define Fdorgqr FortranName(dorgqr,DORGQR) +#define Fdormqr FortranName(dormqr,DORMQR) +#define Fdpotrf FortranName(dpotrf,DPOTRF) +#define Fsgesvd FortranName(sgesvd,SGESVD) +#define Fsgeqrf FortranName(sgeqrf,SGEQRF) +#define Fsgeqp3 FortranName(sgeqp3,SGEQP3) +#define Fsorgqr FortranName(sorgqr,SORGQR) +#define Fsormqr FortranName(sormqr,SORMQR) +#define Fspotrf FortranName(spotrf,SPOTRF) +#define Fcgesvd FortranName(cgesvd,CGESVD) +#define Fcgeqrf FortranName(cgeqrf,CGEQRF) +#define Fcgeqp3 FortranName(cgeqp3,CGEQP3) +#define Fcorgqr FortranName(corgqr,CORGQR) +#define Fcormqr FortranName(cormqr,CORMQR) +#define Fcpotrf FortranName(cpotrf,CPOTRF) +#define Fzgesvd FortranName(zgesvd,ZGESVD) +#define Fzgeqrf FortranName(zgeqrf,ZGEQRF) +#define Fzgeqp3 FortranName(zgeqp3,ZGEQP3) +#define Fzorgqr 
FortranName(zorgqr,ZORGQR) +#define Fzormqr FortranName(zormqr,ZORMQR) +#define Fzpotrf FortranName(zpotrf,ZPOTRF) + + +#endif /* SRC_UTILS_FFORTRANMANGLING_HPP_ */ + + + diff --git a/Src/Utils/FTic.hpp b/Src/Utils/FTic.hpp index 911c88043091d1985fb8324bd377bd52d013c5e4..72527b6e454f50ab0a2db5d67ab54ce8c00dae2e 100644 --- a/Src/Utils/FTic.hpp +++ b/Src/Utils/FTic.hpp @@ -4,13 +4,13 @@ // This software is a computer program whose purpose is to compute the FMM. // // This software is governed by the CeCILL-C and LGPL licenses and -// abiding by the rules of distribution of free software. -// +// abiding by the rules of distribution of free software. +// // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public and CeCILL-C Licenses for more details. -// "http://www.cecill.info". +// "http://www.cecill.info". // "http://www.gnu.org/licenses". // =================================================================================== #ifndef FTIC_HPP @@ -18,7 +18,11 @@ #include "FGlobal.hpp" -#ifdef _OPENMP +#define USE_STD_CHRONO + +#if defined(USE_STD_CHRONO) + #include <chrono> +#elif defined(_OPENMP) #include <omp.h> #elif defined(WINDOWS) // We need an os specific function #include <time.h> @@ -42,7 +46,7 @@ * * - use elapsed() to get the last time interval; * - use cumulated() to get the total running time; - * - use reset() to stop and reset the counter. + * - use reset() to stop and reset the counter. 
* * \code * FTic timer; @@ -66,12 +70,12 @@ private: double start = 0; ///< start time (tic) double end = 0; ///< stop time (tac) - double cumulate = 0; ///< the cumulate time + double cumulate = 0; ///< cumulated duration public: /// Constructor FTic() { - tic(); + this->reset(); } /// Copy constructor @@ -100,12 +104,12 @@ public: res.cumulate += other.cumulate; return res; } - + /// Resets the timer /**\warning Use tic() to restart the timer. */ void reset() { - start = 0; - end = 0; + start = FTic::GetTime(); + end = start; cumulate = 0; } @@ -114,26 +118,35 @@ public: this->start = FTic::GetTime(); } + /// Peek at current elapsed time without stopping timer + double peek() const { + return FTic::GetTime() - this->start;; + } + /// Stop measuring time and add to cumulated time. - void tac(){ + double tac(){ this->end = FTic::GetTime(); - cumulate += elapsed(); + auto lapse = this->elapsed(); + cumulate += lapse; + return lapse; } /// Elapsed time between the last tic() and tac() (in seconds). /** \return the time elapsed between tic() & tac() in second. */ - double elapsed() const{ + double elapsed() const { return this->end - this->start; } /// Cumulated tic() - tac() time spans /** \return the time elapsed between ALL tic() & tac() in second. */ - double cumulated() const{ + double cumulated() const { return cumulate; } /// Combination of tic() and elapsed(). - /** \return the time elapsed between tic() & tac() in second. */ + /** + * \todo deprecate + * \return the time elapsed between tic() & tac() in second. */ double tacAndElapsed() { tac(); return elapsed(); @@ -145,7 +158,11 @@ public: * \return A system dependent time point. 
*/ static double GetTime(){ -#ifdef _OPENMP +#if defined(USE_STD_CHRONO) + using clock = std::chrono::high_resolution_clock; + using duration = std::chrono::duration<double>; + return duration(clock::now().time_since_epoch()).count(); +#elif defined(_OPENMP) return omp_get_wtime(); #elif defined(WINDOWS) return static_cast<double>(GetTickCount())/1000.0; @@ -159,4 +176,3 @@ public: #endif - diff --git a/Tests/GroupTree/testBlockedUniformBench.cpp b/Tests/GroupTree/testBlockedUniformBench.cpp new file mode 100644 index 0000000000000000000000000000000000000000..56f3be19bacf564a455337c01a614f9ef6bab9df --- /dev/null +++ b/Tests/GroupTree/testBlockedUniformBench.cpp @@ -0,0 +1,194 @@ + +// ==== CMAKE ===== +// @FUSE_BLAS +// @FUSE_FFT +// @FUSE_STARPU +// ================ +// Keep in private GIT + + +#include "../../Src/Utils/FGlobal.hpp" + +#include "../../Src/GroupTree/Core/FGroupTree.hpp" + +#include "../../Src/Components/FSimpleLeaf.hpp" +#include "../../Src/Containers/FVector.hpp" + +#include "../../Src/Kernels/P2P/FP2PParticleContainer.hpp" + +#include "Kernels/Interpolation/FInterpMatrixKernel.hpp" +#include "../../Src/Kernels/Uniform/FUnifKernel.hpp" + +#include "../../Src/GroupTree/Uniform/FUnifCellPOD.hpp" + +#include "../../Src/Utils/FMath.hpp" +#include "../../Src/Utils/FMemUtils.hpp" +#include "../../Src/Utils/FParameters.hpp" + +#include "../../Src/Files/FRandomLoader.hpp" +#include "../../Src/Files/FFmaGenericLoader.hpp" + +#include "../../Src/GroupTree/Core/FGroupSeqAlgorithm.hpp" +#include "../../Src/GroupTree/Core/FGroupTaskAlgorithm.hpp" +#ifdef SCALFMM_USE_OMP4 +#include "../../Src/GroupTree/Core/FGroupTaskDepAlgorithm.hpp" +#endif +#ifdef SCALFMM_USE_STARPU +#include "../../Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp" +#include "../../Src/GroupTree/StarPUUtils/FStarPUKernelCapacities.hpp" +#endif +#include "../../Src/GroupTree/Core/FP2PGroupParticleContainer.hpp" + +#include "../../Src/Utils/FParameterNames.hpp" + +#include <memory> + + 
+#define RANDOM_PARTICLES + +int main(int argc, char* argv[]){ + const FParameterNames LocalOptionBlocSize { {"-bs"}, "The size of the block of the blocked tree"}; + const FParameterNames LocalOptionValidate { {"-validation"}, "To compare with direct computation"}; + FHelpDescribeAndExit(argc, argv, "Perform Lagrange Kernel based simulation with StarPU", + FParameterDefinitions::OctreeHeight, +#ifdef RANDOM_PARTICLES + FParameterDefinitions::NbParticles, +#else + FParameterDefinitions::InputFile, +#endif + FParameterDefinitions::NbThreads, + LocalOptionBlocSize, LocalOptionValidate); + + // Initialize the types + typedef double FReal; + static const int ORDER = 5; + typedef FInterpMatrixKernelR<FReal> MatrixKernelClass; + + typedef FUnifCellPODCore GroupCellSymbClass; + typedef FUnifCellPODPole<FReal,ORDER> GroupCellUpClass; + typedef FUnifCellPODLocal<FReal,ORDER> GroupCellDownClass; + typedef FUnifCellPOD<FReal,ORDER> GroupCellClass; + + typedef FP2PGroupParticleContainer<FReal> GroupContainerClass; + typedef FGroupTree< FReal, GroupCellClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass, GroupContainerClass, 1, 4, FReal> GroupOctreeClass; + + typedef FStarPUAllCpuCapacities<FUnifKernel<FReal,GroupCellClass,GroupContainerClass,MatrixKernelClass,ORDER>> GroupKernelClass; + typedef FStarPUCpuWrapper<typename GroupOctreeClass::CellGroupClass, GroupCellClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass> GroupCpuWrapper; + typedef FGroupTaskStarPUAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupCpuWrapper > GroupAlgorithm; + + // Get params + const int NbLevels = FParameters::getValue(argc,argv,FParameterDefinitions::OctreeHeight.options, 5); + const int groupSize = FParameters::getValue(argc,argv,LocalOptionBlocSize.options, 250); + + // Load the particles +#ifdef RANDOM_PARTICLES + FRandomLoader<FReal> 
loader(FParameters::getValue(argc,argv,FParameterDefinitions::NbParticles.options, 2000), 1.0, FPoint<FReal>(0,0,0), 0); +#else + const char* const filename = FParameters::getStr(argc,argv,FParameterDefinitions::InputFile.options, "../Data/test20k.fma"); + FFmaGenericLoader<FReal> loader(filename); +#endif + FAssertLF(loader.isOpen()); + FTic timer; + + FP2PParticleContainer<FReal> allParticles; + for(FSize idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){ + FPoint<FReal> particlePosition; + FReal physicalValue; +#ifdef RANDOM_PARTICLES + physicalValue = 0.10; + loader.fillParticle(&particlePosition); +#else + loader.fillParticle(&particlePosition, &physicalValue); +#endif + allParticles.push(particlePosition, physicalValue); + } + std::cout << "Particles loaded in " << timer.tacAndElapsed() << "s\n"; + + // Put the data into the tree + timer.tic(); + GroupOctreeClass groupedTree(NbLevels, loader.getBoxWidth(), loader.getCenterOfBox(), groupSize, &allParticles); + groupedTree.printInfoBlocks(); + std::cout << "Tree created in " << timer.tacAndElapsed() << "s\n"; + + // Run the algorithm + const MatrixKernelClass MatrixKernel; + GroupKernelClass groupkernel(NbLevels, loader.getBoxWidth(), loader.getCenterOfBox(), &MatrixKernel); + GroupAlgorithm groupalgo(&groupedTree,&groupkernel); + + timer.tic(); + groupalgo.execute(); + timer.tac(); + std::cout << "@EXEC TIME = " << timer.elapsed() << "s\n"; + + // Validate the result + if(FParameters::existParameter(argc, argv, LocalOptionValidate.options) == true){ + FSize offsetParticles = 0; + FReal*const allPhysicalValues = allParticles.getPhysicalValues(); + FReal*const allPosX = const_cast<FReal*>( allParticles.getPositions()[0]); + FReal*const allPosY = const_cast<FReal*>( allParticles.getPositions()[1]); + FReal*const allPosZ = const_cast<FReal*>( allParticles.getPositions()[2]); + + groupedTree.forEachCellLeaf<FP2PGroupParticleContainer<FReal> >([&](GroupCellClass cellTarget, 
FP2PGroupParticleContainer<FReal> * leafTarget){ + const FReal*const physicalValues = leafTarget->getPhysicalValues(); + const FReal*const posX = leafTarget->getPositions()[0]; + const FReal*const posY = leafTarget->getPositions()[1]; + const FReal*const posZ = leafTarget->getPositions()[2]; + const FSize nbPartsInLeafTarget = leafTarget->getNbParticles(); + + for(FSize idxPart = 0 ; idxPart < nbPartsInLeafTarget ; ++idxPart){ + allPhysicalValues[offsetParticles + idxPart] = physicalValues[idxPart]; + allPosX[offsetParticles + idxPart] = posX[idxPart]; + allPosY[offsetParticles + idxPart] = posY[idxPart]; + allPosZ[offsetParticles + idxPart] = posZ[idxPart]; + } + + offsetParticles += nbPartsInLeafTarget; + }); + + FAssertLF(offsetParticles == loader.getNumberOfParticles()); + + FReal*const allDirectPotentials = allParticles.getPotentials(); + FReal*const allDirectforcesX = allParticles.getForcesX(); + FReal*const allDirectforcesY = allParticles.getForcesY(); + FReal*const allDirectforcesZ = allParticles.getForcesZ(); + + for(int idxTgt = 0 ; idxTgt < offsetParticles ; ++idxTgt){ + for(int idxMutual = idxTgt + 1 ; idxMutual < offsetParticles ; ++idxMutual){ + FP2PR::MutualParticles( + allPosX[idxTgt],allPosY[idxTgt],allPosZ[idxTgt], allPhysicalValues[idxTgt], + &allDirectforcesX[idxTgt], &allDirectforcesY[idxTgt], &allDirectforcesZ[idxTgt], &allDirectPotentials[idxTgt], + allPosX[idxMutual],allPosY[idxMutual],allPosZ[idxMutual], allPhysicalValues[idxMutual], + &allDirectforcesX[idxMutual], &allDirectforcesY[idxMutual], &allDirectforcesZ[idxMutual], &allDirectPotentials[idxMutual] + ); + } + } + + FMath::FAccurater<FReal> potentialDiff; + FMath::FAccurater<FReal> fx, fy, fz; + offsetParticles = 0; + groupedTree.forEachCellLeaf<FP2PGroupParticleContainer<FReal> >([&](GroupCellClass cellTarget, FP2PGroupParticleContainer<FReal> * leafTarget){ + const FReal*const potentials = leafTarget->getPotentials(); + const FReal*const forcesX = leafTarget->getForcesX(); + const 
FReal*const forcesY = leafTarget->getForcesY(); + const FReal*const forcesZ = leafTarget->getForcesZ(); + const FSize nbPartsInLeafTarget = leafTarget->getNbParticles(); + + for(int idxTgt = 0 ; idxTgt < nbPartsInLeafTarget ; ++idxTgt){ + potentialDiff.add(allDirectPotentials[idxTgt + offsetParticles], potentials[idxTgt]); + fx.add(allDirectforcesX[idxTgt + offsetParticles], forcesX[idxTgt]); + fy.add(allDirectforcesY[idxTgt + offsetParticles], forcesY[idxTgt]); + fz.add(allDirectforcesZ[idxTgt + offsetParticles], forcesZ[idxTgt]); + } + + offsetParticles += nbPartsInLeafTarget; + }); + + std::cout << "Error : Potential " << potentialDiff << "\n"; + std::cout << "Error : fx " << fx << "\n"; + std::cout << "Error : fy " << fy << "\n"; + std::cout << "Error : fz " << fz << "\n"; + } + + return 0; +} + diff --git a/Tests/GroupTree/testBlockedUniformCompare.cpp b/Tests/GroupTree/testBlockedUniformCompare.cpp index 7571699259f5321c1bdfec73655a36584161ac50..a1de9692eb99e1430d4a09f84872946f06c97078 100644 --- a/Tests/GroupTree/testBlockedUniformCompare.cpp +++ b/Tests/GroupTree/testBlockedUniformCompare.cpp @@ -496,27 +496,51 @@ struct RunContainer{ typedef FFmmAlgorithmThreadBalance<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass; std::cout << "Using FFmmAlgorithmThreadBalance " << std::endl; FmmClass algorithm(&tree, &kernels); +#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME) + // The taskname() clause is only supported by KSTAR. Make sure + // to set it from CMake to enable tracing. + starpu_fxt_start_profiling(); +#endif time.tic(); algorithm.execute(); time.tac(); +#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME) + starpu_fxt_stop_profiling(); +#endif std::cout << "Done " << "(@Algorithm = " << time.elapsed() << "s)." 
<< std::endl; } else if(FParameters::existParameter(argc, argv, LocalOptionOmpTask.options)){ typedef FFmmAlgorithmTask<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass; std::cout << "Using FFmmAlgorithmTask " << std::endl; FmmClass algorithm(&tree, &kernels); +#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME) + // The taskname() clause is only supported by KSTAR. Make sure + // to set it from CMake to enable tracing. + starpu_fxt_start_profiling(); +#endif time.tic(); algorithm.execute(); time.tac(); +#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME) + starpu_fxt_stop_profiling(); +#endif std::cout << "Done " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl; } else if(FParameters::existParameter(argc, argv, LocalOptionOmpSection.options)){ typedef FFmmAlgorithmSectionTask<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass; std::cout << "Using FFmmAlgorithmSectionTask " << std::endl; FmmClass algorithm(&tree, &kernels); +#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME) + // The taskname() clause is only supported by KSTAR. Make sure + // to set it from CMake to enable tracing. + starpu_fxt_start_profiling(); +#endif time.tic(); algorithm.execute(); time.tac(); +#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME) + starpu_fxt_stop_profiling(); +#endif std::cout << "Done " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl; } #ifdef SCALFMM_USE_OMP4 @@ -524,9 +548,17 @@ struct RunContainer{ typedef FFmmAlgorithmOmp4<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass; std::cout << "Using FFmmAlgorithmOmp4 " << std::endl; FmmClass algorithm(&tree, &kernels); +#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME) + // The taskname() clause is only supported by KSTAR. Make sure + // to set it from CMake to enable tracing. 
+ starpu_fxt_start_profiling(); +#endif time.tic(); algorithm.execute(); time.tac(); +#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME) + starpu_fxt_stop_profiling(); +#endif std::cout << "Done " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl; } #endif @@ -534,9 +566,17 @@ struct RunContainer{ typedef FFmmAlgorithmThread<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass; std::cout << "Using FFmmAlgorithmThread " << std::endl; FmmClass algorithm(&tree, &kernels); +#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME) + // The taskname() clause is only supported by KSTAR. Make sure + // to set it from CMake to enable tracing. + starpu_fxt_start_profiling(); +#endif time.tic(); algorithm.execute(); time.tac(); +#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME) + starpu_fxt_stop_profiling(); +#endif std::cout << "Done " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl; } } // ----------------------------------------------------- @@ -848,9 +888,17 @@ struct RunContainer{ typedef FFmmAlgorithmThread<OctreeClass,CellClass,ContainerClass,KernelClass,LeafClass> FmmClass; std::cout << "Using FFmmAlgorithmThread " << std::endl; FmmClass algorithm(&tree, &kernels); +#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME) + // The taskname() clause is only supported by KSTAR. Make sure + // to set it from CMake to enable tracing. + starpu_fxt_start_profiling(); +#endif time.tic(); algorithm.execute(); time.tac(); +#if defined(SCALFMM_USE_STARPU) || defined(OPENMP_SUPPORT_TASK_NAME) + starpu_fxt_stop_profiling(); +#endif std::cout << "Done " << "(@Algorithm = " << time.elapsed() << "s)." 
<< std::endl; } diff --git a/Tests/Utils/testOctreeRearrangeTsm.cpp b/Tests/Utils/testOctreeRearrangeTsm.cpp index b6b6f4db93b2d9a55dac6ebfad3f7870991fc1f0..e859f5f6f1d7e6351023ea5ac8202c0484d76b9a 100644 --- a/Tests/Utils/testOctreeRearrangeTsm.cpp +++ b/Tests/Utils/testOctreeRearrangeTsm.cpp @@ -98,7 +98,7 @@ int main(int argc, char ** argv){ (BoxWidth*FReal(drand48())) + (BoxCenter-(BoxWidth/2)), (BoxWidth*FReal(drand48())) + (BoxCenter-(BoxWidth/2)), (BoxWidth*FReal(drand48())) + (BoxCenter-(BoxWidth/2))); - tree.insert(particleToFill,FParticleTypeSource,idxPart); + tree.insert(particleToFill,FParticleType::FParticleTypeSource,idxPart); } for(FSize idxPart = 0 ; idxPart < NbPart_Target; ++idxPart){ @@ -106,7 +106,7 @@ int main(int argc, char ** argv){ (BoxWidth*FReal(drand48())) + (BoxCenter-(BoxWidth/2)), (BoxWidth*FReal(drand48())) + (BoxCenter-(BoxWidth/2)), (BoxWidth*FReal(drand48())) + (BoxCenter-(BoxWidth/2))); - tree.insert(particleToFill,FParticleTypeTarget,idxPart); + tree.insert(particleToFill,FParticleType::FParticleTypeTarget,idxPart); } } diff --git a/Tests/noDist/ChebyshevPeriodic.cpp b/Tests/noDist/ChebyshevPeriodic.cpp index 58425bb91554795e28c847fd014f2832deff99c9..d1babc715d7b083940d0e7c9c5b4b3ed714cbcf6 100644 --- a/Tests/noDist/ChebyshevPeriodic.cpp +++ b/Tests/noDist/ChebyshevPeriodic.cpp @@ -83,12 +83,6 @@ int main(int argc, char* argv[]) const unsigned int NbThreads = FParameters::getValue(argc, argv, FParameterDefinitions::NbThreads.options, 1); const int PeriodicDeep = FParameters::getValue(argc,argv,FParameterDefinitions::PeriodicityNbLevels.options, 3); -#ifdef _OPENMP - omp_set_num_threads(NbThreads); - std::cout << "\n>> Using " << omp_get_max_threads() << " threads.\n" << std::endl; -#else - std::cout << "\n>> Sequential version.\n" << std::endl; -#endif // std::cout << "Parameters "<< std::endl << "\t Octree Depth \t"<< TreeHeight <<std::endl diff --git a/Tests/noDist/FMMnonUnitCube.cpp b/Tests/noDist/FMMnonUnitCube.cpp index 
08bcb11857fb046fc9984d8f9adaa36526a11d04..dc9d79bc6d6a06264a41ba1a2272b797cfd84b52 100644 --- a/Tests/noDist/FMMnonUnitCube.cpp +++ b/Tests/noDist/FMMnonUnitCube.cpp @@ -98,20 +98,11 @@ int main(int argc, char* argv[]) const std::string filename(FParameters::getStr(argc,argv,FParameterDefinitions::InputFile.options, "../Data/UTest/unitCubeRef20kDouble.bfma")); const unsigned int TreeHeight = FParameters::getValue(argc, argv, FParameterDefinitions::OctreeHeight.options, 5); const unsigned int SubTreeHeight = FParameters::getValue(argc, argv, FParameterDefinitions::OctreeSubHeight.options, 2); - const unsigned int NbThreads = FParameters::getValue(argc, argv, FParameterDefinitions::NbThreads.options, omp_get_max_threads()); - - // -#ifdef _OPENMP - omp_set_num_threads(NbThreads); -#else - std::cout << "\n>> Sequential version.\n" << std:: -#endif std::cout << "Parameters "<< std::endl << " Octree Depth \t"<< TreeHeight <<std::endl << " SubOctree depth \t"<< SubTreeHeight <<std::endl << " Input file name: \t" <<filename <<std::endl - << " Thread number: \t" << NbThreads <<std::endl <<std::endl; // init timer diff --git a/Tests/noDist/PerfTest.cpp b/Tests/noDist/PerfTest.cpp deleted file mode 100644 index e036f686329e6e9f2ecd7beb0776a81106c83f4d..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest.cpp +++ /dev/null @@ -1,204 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - - -/** - * \file - * \author Quentin Khan - * - * This program is used to run different performance tests for the various - * algorithms that have been implemented for ScalFMM. - * - * See the PerfUtils.hpp file classes for some more in depth information. Run - * with argument --help for usage information. 
- */ - - -#include <iostream> -#include <string> - -#include "Utils/FParameters.hpp" -#include "Utils/FParameterNames.hpp" - -#include "PerfTest/PerfTestUtils.hpp" - -#include "PerfTest/TreeLoaderBasic.hpp" -#include "PerfTest/TreeLoaderFCheb.hpp" - -#ifdef SCALFMM_USE_MPI -#include "PerfTest/TreeLoaderMpiSplitFCheb.hpp" -#include "PerfTest/TreeLoaderMpiGenericFCheb.hpp" -#endif - -#include "PerfTest/KernelLoaderFChebSym.hpp" - -#include "PerfTest/AlgoLoaderThread.hpp" -#include "PerfTest/AlgoLoaderTask.hpp" -#include "PerfTest/AlgoLoaderSectionTask.hpp" -#include "PerfTest/AlgoLoaderCostZones.hpp" -#include "PerfTest/AlgoLoaderThreadBalance.hpp" - -#ifdef SCALFMM_USE_MPI -#include "PerfTest/AlgoLoaderThreadProc.hpp" -#endif - -#define HOST_NAME_MAX 64 - -/** - * \brief Runs a generic sequence of actions to use an algorithm. - * - * This function runs the basic steps that are needed to run an FMM algorithm - * over a set of particles. It does the following steps : - * - * - Load a tree using the class defined as a TreeLoader - * - Prepare the needed kernels using the KernelLoader - * - Prepare and run the algorithm using the AlgorithmLoader - * - * See documentation of FTreeLoader, FKernelLoader, FAlgoLoader. 
- */ -template <class TreeLoader, - template <typename TL_1> class KernelLoader, - template <typename TL_2, template <typename TL_3> class KL> class AlgoLoader> -void runperf(FPerfTestParams& params) -{ - TreeLoader treeLoader(params); - KernelLoader<TreeLoader> kernelLoader(params, treeLoader); - AlgoLoader<TreeLoader, KernelLoader> algoLoader(params, treeLoader, kernelLoader); - algoLoader.run(); - - char hostname[HOST_NAME_MAX]; - memset(hostname,'\0',HOST_NAME_MAX); - if ( -1 == gethostname(hostname, HOST_NAME_MAX-1) ) { - perror("Could not get hostname"); - strncpy(hostname, "unknown", HOST_NAME_MAX); - } - - std::cout << "@@ " - << "host:" << hostname << " " - << "algo:" << params.algo << " " - << "file:" << params.filename.substr( - params.filename.find_last_of('/')+1 ) << " " - << "particles:" << treeLoader._loader.getNumberOfParticles() << " " - << "procs:" << params.nbProcs << " " - << "threads:" << params.nbThreads << " " - << "height:" << params.treeHeight << " " - << "subheight:" << params.subTreeHeight << " " - << algoLoader.getRunInfoString() - << "P2M:" << algoLoader.getCumulatedTime(FAlgorithmTimers::P2MTimer) << " " - << "M2M:" << algoLoader.getCumulatedTime(FAlgorithmTimers::M2MTimer) << " " - << "M2L:" << algoLoader.getCumulatedTime(FAlgorithmTimers::M2LTimer) << " " - << "L2L:" << algoLoader.getCumulatedTime(FAlgorithmTimers::L2LTimer) << " " - << "P2PL2P:" << algoLoader.getCumulatedTime(FAlgorithmTimers::NearTimer) << " " - << std::endl; -} - -namespace ParName { - const FParameterNames Algo = {{"--algo"},"Algorithm to run (basic, task, costzones, sectiontask, autobalance" -#ifdef SCALFMM_USE_MPI - ", mpi-split, mpi-generic" -#endif - ")."}; - const FParameterNames Schedule = {{"--schedule"},"OpenMP scheduling policy (static, dynamic)."}; - const FParameterNames ChunkSize = {{"--chunk-size"},"OpenMP chunk size for basic dynamic algorithm."}; -} - -int main (int argc, char** argv) -{ - // Parameter handling ////////////// - 
FHelpDescribeAndExit(argc, argv, - "Performance test program for FMM balancing techniques. " -#ifdef SCALFMM_USE_MPI - "This program has been compiled with MPI superpowers !" -#endif - , - FParameterDefinitions::InputFile, - FParameterDefinitions::OctreeHeight, - FParameterDefinitions::OctreeSubHeight, - FParameterDefinitions::NbThreads, - ParName::Algo, - ParName::Schedule, - ParName::ChunkSize); - FPerfTestParams params; - { - using namespace FParameterDefinitions; - using namespace FParameters; - params.filename = getStr(argc,argv,InputFile.options, - "../Data/unitCubeXYZQ100.bfma"); - params.treeHeight = getValue(argc, argv, OctreeHeight.options, 5); - params.subTreeHeight = getValue(argc, argv, OctreeSubHeight.options, 2); - params.nbThreads = getValue(argc, argv, NbThreads.options, 1); - params.algo = getStr(argc,argv,ParName::Algo.options,"task"); - params.omp_chunk_size = getValue(argc, argv, ParName::ChunkSize.options, 0); - -#ifdef SCALFMM_USE_MPI - std::string prefix("mpi-"); - if( params.algo.substr(0, prefix.size()) == prefix ) { - params.mpiContext = new FMpi(argc,argv); - params.nbProcs = params.mpiContext->global().processCount(); - } -#endif - } - // End of Parameter handling /////// - - char hostname[HOST_NAME_MAX]; - memset(hostname,'\0',HOST_NAME_MAX); - if ( -1 == gethostname(hostname, HOST_NAME_MAX-1) ) { - perror("Could not get hostname"); - strncpy(hostname, "unknown", HOST_NAME_MAX); - } - std::cout << "Hostname: " << hostname << std::endl; - - omp_set_num_threads(params.nbThreads); - - using FReal = double; - constexpr const int ORDER = 7; - - if( "basic" == params.algo ) { - runperf<TreeLoaderFCheb<FReal,ORDER>, - KernelLoaderFChebSym, - AlgoLoaderThread> - (params); - } else if( "task" == params.algo ) { - runperf<TreeLoaderFCheb<FReal,ORDER>, - KernelLoaderFChebSym, - AlgoLoaderTask> - (params); - } else if ( "costzones" == params.algo ) { - runperf<TreeLoaderFCheb<FReal,ORDER>, - KernelLoaderFChebSym, - AlgoLoaderCostZones> - (params); 
- } else if ( "sectiontask" == params.algo ) { - runperf<TreeLoaderFCheb<FReal,ORDER>, - KernelLoaderFChebSym, - AlgoLoaderSectionTask> - (params); - } else if ( "autobalance" == params.algo ) { - runperf<TreeLoaderFCheb<FReal,ORDER>, - KernelLoaderFChebSym, - AlgoLoaderThreadBalance> - (params); -#ifdef SCALFMM_USE_MPI - } else if ( "mpi-split" == params.algo ) { - runperf<TreeLoaderMpiSplitFCheb<FReal,ORDER>, - KernelLoaderFChebSym, - AlgoLoaderThreadProc> - (params); - } else if ( "mpi-generic" == params.algo ) { - runperf<TreeLoaderMpiGenericFCheb<FReal,ORDER>, - KernelLoaderFChebSym, - AlgoLoaderThreadProc> - (params); -#endif - } else { - std::cout << "Unknown algorithm: " << params.algo << std::endl; - } - -#ifdef SCALFMM_USE_MPI - if( nullptr != params.mpiContext ) { - delete params.mpiContext; - } -#endif - -} diff --git a/Tests/noDist/PerfTest/AlgoLoaderCostZones.hpp b/Tests/noDist/PerfTest/AlgoLoaderCostZones.hpp deleted file mode 100644 index 017c553815d7001b208fbcf2c2b8177a01a5b258..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest/AlgoLoaderCostZones.hpp +++ /dev/null @@ -1,113 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - -#ifndef _ALGOLOADERCOSTZONES_HPP_ -#define _ALGOLOADERCOSTZONES_HPP_ - -#include <memory> -#include <sstream> - -#include "PerfTestUtils.hpp" - -#include "Core/FFmmAlgorithmThread.hpp" - -#include "BalanceTree/FFmmAlgorithmThreadBalanced.hpp" -#include "BalanceTree/FCostCell.hpp" -#include "BalanceTree/FCostZones.hpp" - -/** - * \brief Algorithm loader for FFmmAlgorithmThreadBalanced. - * - * See FAlgoLoader documentation. - * - * \warning : This loader requires that the KernelLoader supply a type definition - * for a `CostKernelClass` - */ -template <class _TreeLoader, template<typename> class _KernelLoader> -class AlgoLoaderCostZones : public FAlgoLoader<_TreeLoader, _KernelLoader> { -public: - // Types definitions - - /// The TreeLoader type that is used. 
- using TreeLoader = _TreeLoader; - using KernelLoader = _KernelLoader<TreeLoader>; - - using FReal = typename TreeLoader::FReal; - using CellClass = typename TreeLoader::CellClass; - using ContainerClass = typename TreeLoader::ContainerClass; - using LeafClass = typename TreeLoader::LeafClass; - using OctreeClass = typename TreeLoader::OctreeClass; - using KernelClass = typename KernelLoader::KernelClass; - using CostKernelClass= typename KernelLoader::CostKernelClass; - - static_assert(std::is_base_of<FCostCellTypeTrait, CellClass>::value, - "The tree cells must derive from FCostCell."); - - using FMMClass = FFmmAlgorithmThreadBalanced - <OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass>; - using CostFmmClass = FFmmAlgorithmThread - <OctreeClass, CellClass, ContainerClass, CostKernelClass, LeafClass>; - - std::stringstream _infostring; - TreeLoader& _treeLoader; - KernelLoader& _kernelLoader; - - std::unique_ptr<FMMClass> _algo; - - - /// Builds the loader - AlgoLoaderCostZones(FPerfTestParams& /*params*/, - TreeLoader& treeLoader, - KernelLoader& kernelLoader) : - _treeLoader(treeLoader), - _kernelLoader(kernelLoader), - _algo(nullptr) { - - } - - /// Computes the tree cells costs then runs the costzones and FMM algorithms. 
- void run() { - // The tree loader holds the tree structure - OctreeClass* p_tree = &(_treeLoader._tree); - - // Compute tree cells costs - CostFmmClass costAlgo(p_tree, &(_kernelLoader._costKernel)); - - this->time.tic(); - costAlgo.execute(); - this->time.tac(); - std::cout << "Generating tree cost: " << this->time.elapsed() << "s.\n"; - _infostring << "costgen:" << this->time.elapsed() << " "; - - // Compute cost zones - FCostZones<OctreeClass, CellClass> costzones(p_tree, omp_get_max_threads()); - - this->time.tic(); - costzones.run(); - this->time.tac(); - std::cout << "Generating cost zones: " << this->time.elapsed() << "s.\n"; - _infostring << "zonegen:" << this->time.elapsed() << " "; - - // Execute FFM algorithm - this->time.tic(); - _algo = std::unique_ptr<FMMClass>( - new FMMClass(p_tree, &(_kernelLoader._kernel), - costzones.getZoneBounds(), costzones.getLeafZoneBounds())); - _algo->execute(); - this->time.tac(); - } - - std::string getRunInfoString() const { - return _infostring.str(); - } - - double getCumulatedTime(FAlgorithmTimers::FTimers timerName) const { - return _algo->getCumulatedTime(timerName); - } - -}; - - - -#endif diff --git a/Tests/noDist/PerfTest/AlgoLoaderSectionTask.hpp b/Tests/noDist/PerfTest/AlgoLoaderSectionTask.hpp deleted file mode 100644 index 245e3c477f3a8f6f848cf08980bf7aaaed6093ef..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest/AlgoLoaderSectionTask.hpp +++ /dev/null @@ -1,59 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - -#ifndef _ALGOLOADERSECTIONTASK_HPP_ -#define _ALGOLOADERSECTIONTASK_HPP_ - -#include <memory> - -#include "PerfTestUtils.hpp" - -#include "Core/FFmmAlgorithmSectionTask.hpp" - - -template <class _TreeLoader, template<typename> class _KernelLoader> -class AlgoLoaderSectionTask : public FAlgoLoader<_TreeLoader, _KernelLoader> { -public: - using TreeLoader = _TreeLoader; - using KernelLoader = _KernelLoader<TreeLoader>; - - - using FReal = typename 
TreeLoader::FReal; - using CellClass = typename TreeLoader::CellClass; - using ContainerClass = typename TreeLoader::ContainerClass; - using LeafClass = typename TreeLoader::LeafClass; - using OctreeClass = typename TreeLoader::OctreeClass; - using KernelClass = typename KernelLoader::KernelClass; - - using FMMClass = FFmmAlgorithmSectionTask<OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass>; - - TreeLoader& _treeLoader; - KernelLoader& _kernelLoader; - - std::unique_ptr<FMMClass> _algo; - - AlgoLoaderSectionTask(FPerfTestParams& /*params*/, - TreeLoader& treeLoader, - KernelLoader& kernelLoader) : - _treeLoader(treeLoader), - _kernelLoader(kernelLoader), - _algo(nullptr) { - - } - - - void run() { - _algo = std::unique_ptr<FMMClass>( - new FMMClass(&(_treeLoader._tree), &(_kernelLoader._kernel))); - _algo->execute(); - } - - double getCumulatedTime(FAlgorithmTimers::FTimers timerName) const { - return _algo->getCumulatedTime(timerName); - } -}; - - - -#endif diff --git a/Tests/noDist/PerfTest/AlgoLoaderTask.hpp b/Tests/noDist/PerfTest/AlgoLoaderTask.hpp deleted file mode 100644 index 2422f7e7c6efb7c7a27fa6b916424276c785a992..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest/AlgoLoaderTask.hpp +++ /dev/null @@ -1,59 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - -#ifndef _ALGOLOADERTASK_HPP_ -#define _ALGOLOADERTASK_HPP_ - -#include <memory> - -#include "PerfTestUtils.hpp" - -#include "Core/FFmmAlgorithmTask.hpp" - - -template <class _TreeLoader, template<typename> class _KernelLoader> -class AlgoLoaderTask : public FAlgoLoader<_TreeLoader, _KernelLoader> { -public: - using TreeLoader = _TreeLoader; - using KernelLoader = _KernelLoader<TreeLoader>; - - - using FReal = typename TreeLoader::FReal; - using CellClass = typename TreeLoader::CellClass; - using ContainerClass = typename TreeLoader::ContainerClass; - using LeafClass = typename TreeLoader::LeafClass; - using OctreeClass = typename 
TreeLoader::OctreeClass; - using KernelClass = typename KernelLoader::KernelClass; - - using FMMClass = FFmmAlgorithmTask<OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass>; - - TreeLoader& _treeLoader; - KernelLoader& _kernelLoader; - - std::unique_ptr<FMMClass> _algo; - - AlgoLoaderTask(FPerfTestParams& /*params*/, - TreeLoader& treeLoader, - KernelLoader& kernelLoader) : - _treeLoader(treeLoader), - _kernelLoader(kernelLoader), - _algo(nullptr) { - - } - - - void run() { - _algo = std::unique_ptr<FMMClass>( - new FMMClass(&(_treeLoader._tree), &(_kernelLoader._kernel))); - _algo->execute(); - } - - double getCumulatedTime(FAlgorithmTimers::FTimers timerName) const { - return _algo->getCumulatedTime(timerName); - } -}; - - - -#endif diff --git a/Tests/noDist/PerfTest/AlgoLoaderThread.hpp b/Tests/noDist/PerfTest/AlgoLoaderThread.hpp deleted file mode 100644 index 322ec6fd205e875a07d67ae26a6bde4dc1160ec3..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest/AlgoLoaderThread.hpp +++ /dev/null @@ -1,81 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - -#ifndef _ALGOLOADERTHREAD_HPP_ -#define _ALGOLOADERTHREAD_HPP_ - -#include <memory> -#include <sstream> - -#include "PerfTestUtils.hpp" - -#include "Core/FFmmAlgorithmThread.hpp" - -/** - * \brief Algorithm loader for FFmmAlgorithmThread - * - * See FAlgoLoader. 
- */ -template <class _TreeLoader, template<typename> class _KernelLoader> -class AlgoLoaderThread : public FAlgoLoader<_TreeLoader, _KernelLoader> { -public: - - // Type definitions, allows them to be reused by other classes - using TreeLoader = _TreeLoader; - using KernelLoader = _KernelLoader<TreeLoader>; - - using FReal = typename TreeLoader::FReal; - using CellClass = typename TreeLoader::CellClass; - using ContainerClass = typename TreeLoader::ContainerClass; - using LeafClass = typename TreeLoader::LeafClass; - using OctreeClass = typename TreeLoader::OctreeClass; - using KernelClass = typename KernelLoader::KernelClass; - - /// FMM algorithm class - using FMMClass = FFmmAlgorithmThread<OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass>; - - /// The tree loader (FTreeLoader) that was used - TreeLoader& _treeLoader; - - /// The kernel loader (FKernelLoader) that was used - KernelLoader& _kernelLoader; - - unsigned int _omp_chunk_size; ///< Chunk size for OpenMP - - /// The #FMMClass algorithm instance - std::unique_ptr<FMMClass> _algo; - - AlgoLoaderThread(FPerfTestParams& params, - TreeLoader& treeLoader, - KernelLoader& kernelLoader) : - _treeLoader(treeLoader), - _kernelLoader(kernelLoader), - _omp_chunk_size(params.omp_chunk_size), - _algo(nullptr) { - - } - - void run() { - _algo = std::unique_ptr<FMMClass>( - new FMMClass(&(_treeLoader._tree), &(_kernelLoader._kernel))); - _algo->setChunkSize(_omp_chunk_size); - - _algo->execute(); - } - - - virtual std::string getRunInfoString() const { - std::stringstream sstr; - sstr << "chunksize:" << _omp_chunk_size << " "; - return sstr.str(); - } - - double getCumulatedTime(FAlgorithmTimers::FTimers timerName) const { - return _algo->getCumulatedTime(timerName); - } - - -}; - -#endif diff --git a/Tests/noDist/PerfTest/AlgoLoaderThreadBalance.hpp b/Tests/noDist/PerfTest/AlgoLoaderThreadBalance.hpp deleted file mode 100644 index 
05cb80de0bccb8d9ae9e26634e5f2a6beecc2eaf..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest/AlgoLoaderThreadBalance.hpp +++ /dev/null @@ -1,68 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - -#ifndef _ALGOLOADERTHREADBALANCE_HPP_ -#define _ALGOLOADERTHREADBALANCE_HPP_ - -#include <memory> -#include <sstream> - -#include "PerfTestUtils.hpp" - -#include "Core/FFmmAlgorithmThreadBalance.hpp" - -/** - * \brief An algorithm loader for FFmmAlgorithmBalance - * - * See FAlgoLoader documentation. - */ -template <class _TreeLoader, template<typename> class _KernelLoader> -class AlgoLoaderThreadBalance : public FAlgoLoader<_TreeLoader, _KernelLoader> { -public: - using TreeLoader = _TreeLoader; - using KernelLoader = _KernelLoader<TreeLoader>; - - using FReal = typename TreeLoader::FReal; - using CellClass = typename TreeLoader::CellClass; - using ContainerClass = typename TreeLoader::ContainerClass; - using LeafClass = typename TreeLoader::LeafClass; - using OctreeClass = typename TreeLoader::OctreeClass; - using KernelClass = typename KernelLoader::KernelClass; - - using FMMClass = FFmmAlgorithmThreadBalance<OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass>; - - TreeLoader& _treeLoader; - KernelLoader& _kernelLoader; - - std::unique_ptr<FMMClass> _algo; - - AlgoLoaderThreadBalance(FPerfTestParams& params, - TreeLoader& treeLoader, - KernelLoader& kernelLoader) : - _treeLoader(treeLoader), - _kernelLoader(kernelLoader), - _algo(nullptr) { - - } - - void run() { - _algo = std::unique_ptr<FMMClass>( - new FMMClass(&(_treeLoader._tree), &(_kernelLoader._kernel))); - - _algo->execute(); - } - - - virtual std::string getRunInfoString() const { - return ""; - } - - double getCumulatedTime(FAlgorithmTimers::FTimers timerName) const { - return _algo->getCumulatedTime(timerName); - } - - -}; - -#endif diff --git a/Tests/noDist/PerfTest/AlgoLoaderThreadProc.hpp b/Tests/noDist/PerfTest/AlgoLoaderThreadProc.hpp deleted file 
mode 100644 index 9798e4733d0e758782cccb57098fc844837ac314..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest/AlgoLoaderThreadProc.hpp +++ /dev/null @@ -1,87 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - -#ifndef _ALGOLOADERTHREADPROC_HPP_ -#define _ALGOLOADERTHREADPROC_HPP_ - -#include <memory> -#include <sstream> - -#include "PerfTestUtils.hpp" - -#include "Core/FFmmAlgorithmThreadProc.hpp" -#include "Utils/FMpi.hpp" - -/** - * \brief Algorithm loader for FFmmAlgorithmThread - * - * See FAlgoLoader. - */ -template <class _TreeLoader, template<typename> class _KernelLoader> -class AlgoLoaderThreadProc : public FAlgoLoader<_TreeLoader, _KernelLoader> { -public: - - // Type definitions, allows them to be reused by other classes - using TreeLoader = _TreeLoader; - using KernelLoader = _KernelLoader<TreeLoader>; - - using FReal = typename TreeLoader::FReal; - using CellClass = typename TreeLoader::CellClass; - using ContainerClass = typename TreeLoader::ContainerClass; - using LeafClass = typename TreeLoader::LeafClass; - using OctreeClass = typename TreeLoader::OctreeClass; - using KernelClass = typename KernelLoader::KernelClass; - - /// FMM algorithm class - using FMMClass = FFmmAlgorithmThreadProc<OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass>; - - FMpi* _mpiContext; - - /// The tree loader (FTreeLoader) that was used - TreeLoader& _treeLoader; - - /// The kernel loader (FKernelLoader) that was used - KernelLoader& _kernelLoader; - - /// The #FMMClass algorithm instance - std::unique_ptr<FMMClass> _algo; - - /// Array of MPI gathered cumulated times - double timers[FAlgorithmTimers::nbTimers] {0}; - - - AlgoLoaderThreadProc(FPerfTestParams& params, - TreeLoader& treeLoader, - KernelLoader& kernelLoader) : - _mpiContext(params.mpiContext), - _treeLoader(treeLoader), - _kernelLoader(kernelLoader), - _algo(nullptr) { - - } - - - void run() { - _algo = std::unique_ptr<FMMClass>( - new 
FMMClass(_mpiContext->global(), &(_treeLoader._tree), &(_kernelLoader._kernel))); - _algo->execute(); - - for( int idxTimer = 0; idxTimer < FAlgorithmTimers::nbTimers; ++idxTimer ) { - timers[idxTimer] = _algo->getCumulatedTime(FAlgorithmTimers::FTimers(idxTimer)); - } - - if( _mpiContext->global().processId() == 0) { - MPI_Reduce(MPI_IN_PLACE, timers, FAlgorithmTimers::nbTimers, MPI_DOUBLE, MPI_MAX, 0, _mpiContext->global().getComm()); - } else { - MPI_Reduce(timers, NULL, FAlgorithmTimers::nbTimers, MPI_DOUBLE, MPI_MAX, 0, _mpiContext->global().getComm()); - } - } - - double getCumulatedTime(FAlgorithmTimers::FTimers timerName) const { - return timers[timerName]; - } - -}; - -#endif diff --git a/Tests/noDist/PerfTest/KernelLoaderFChebSym.hpp b/Tests/noDist/PerfTest/KernelLoaderFChebSym.hpp deleted file mode 100644 index 2945c2714e72dc4a91fcad5c19b5a092bcbf5aaf..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest/KernelLoaderFChebSym.hpp +++ /dev/null @@ -1,79 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - - - -#ifndef _KERNELLOADERFCHEBSYM_HPP_ -#define _KERNELLOADERFCHEBSYM_HPP_ - -#include "PerfTestUtils.hpp" - -#include "Kernels/Interpolation/FInterpMatrixKernel.hpp" -#include "Kernels/Chebyshev/FChebSymKernel.hpp" - -#include "BalanceTree/FChebSymCostKernel.hpp" - -/** - * \brief Kernel loader for the symetric Chebyshev kernel. - * - * \warning This loader requires that TreeLoader::CellClass inherits from - * FChebCell. - * - * \note This loader also provides the typedef CostKernelClass and a member - * _costKernel that cam be used by the AlgoLoaderCostZones. - */ -template <typename _TreeLoader> -class KernelLoaderFChebSym : public FKernelLoader<_TreeLoader> { - // Meaningfull (?) error message. 
- static_assert( - std::is_base_of<FChebCell<typename _TreeLoader::FReal,_TreeLoader::ORDER>, - typename _TreeLoader::CellClass>::value, - "TreeLoader::CellClass must derive from FChebCell"); - - -public: - // Required type definitions - using TreeLoader = _TreeLoader; - using FReal = typename TreeLoader::FReal; - /// Must derive from FChebCell - using CellClass = typename TreeLoader::CellClass; - using ContainerClass = typename TreeLoader::ContainerClass; - using OctreeClass = typename TreeLoader::OctreeClass; - - using MatrixKernelClass = FInterpMatrixKernelR<FReal>; - using KernelClass = FChebSymKernel <FReal, CellClass, ContainerClass, - MatrixKernelClass, TreeLoader::ORDER>; - /// Kernel class used to compute the tree cell costs. - using CostKernelClass = FChebSymCostKernel<FReal, CellClass, ContainerClass, - MatrixKernelClass, TreeLoader::ORDER, - OctreeClass>; - - const FReal epsilon = 1e-4; - - /// Matrix used to compute the tree cells interactions. - const MatrixKernelClass _matrixKernel; - /// Kernel used to compute the tree cells interactions. - KernelClass _kernel; - /// Kernel used to compute the tree cells costs. - CostKernelClass _costKernel; - - /// Builds and loads the kernel. - /** \param params Parameters from the main invocation, UNSUSED - * \param treeLoader Tree loader that was used. 
- */ - KernelLoaderFChebSym(FPerfTestParams& /*params*/, TreeLoader& treeLoader) : - _matrixKernel(), - _kernel(treeLoader._tree.getHeight(), - treeLoader._tree.getBoxWidth(), - treeLoader._tree.getBoxCenter(), - &_matrixKernel), - _costKernel(&(treeLoader._tree), epsilon){ - - } - - -}; - - -#endif diff --git a/Tests/noDist/PerfTest/PerfTestUtils.hpp b/Tests/noDist/PerfTest/PerfTestUtils.hpp deleted file mode 100644 index cc4020467774f17d916461405304e8550da88db6..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest/PerfTestUtils.hpp +++ /dev/null @@ -1,150 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - -#ifndef _PERFTESTUTILS_HPP_ -#define _PERFTESTUTILS_HPP_ - -#include <string> - -#ifdef SCALFMM_USE_MPI -#include "Utils/FMpi.hpp" -#endif - -#include "Utils/FTic.hpp" -#include "Files/FFmaGenericLoader.hpp" - -#include "Containers/FOctree.hpp" - -/** - * \brief Store the PerfTest program parameters. - */ -struct FPerfTestParams { - int subTreeHeight = 2; ///< Subtree height. - int treeHeight = 5; ///< Tree height. - int nbThreads = 1; ///< Maximum number of threads (when used). - std::string filename = ""; ///< Particles file. - std::string algo = "task"; ///< Algorithm to run. - int omp_chunk_size = 0; ///< OpenMP chunk size for basic algorithm (FFmmAlgorithmThread) - int nbProcs = 1; -#ifdef SCALFMM_USE_MPI - FMpi* mpiContext = nullptr; -#endif -}; - - -/** - * \brief Base class for tree loaders. - * - * This class itself does not provide anything but a base on which to build tree - * loaders. A tree loader should satisfy the following rules. - * - * - Define the public typedefs : CellClass, ContainerClass, LeafClass, - * OctreeClass. - * - Provide public acces to a member of type OctreeClass _tree as the tree - * that is loaded. - * - Tree loading must happen at construction. - * - It may provide any other members or typdefs required by a special - * FKernelLoader or FAlgoLoader. 
- * - * For convenience, this class provides a timer and a basic loadTree method that - * should be enough to load a tree from and FMA file. - * - * \note It is not mandatory that a loader inherit from this class. It must - * however follow the aforementioned rules. - */ -class FTreeLoader { -public: - /// A timer used to time the loadTree method. - FTic time; -protected: - - /** - * \brief Load a tree from a file. - * - * \param loader The file loader to read from the file. - * \param tree The tree to be filled. - */ - virtual void loadTree() = 0; -}; - -/** - * \brief Base class for kernel loaders. - * - * This class itself does not provide anything but a base on which to build - * kernel loaders. A kernel loader should satisfy the following rules. - * - * - Define the public typedefs : TreeLoader, KernelClass. - * - Provide public acces to a member of type Kernelclass _kernel as the - * kernel that is loaded. - * - Kernel loading must happen at construction. - * - It may provide any other members or typdefs required by a special - * FAlgoLoader. - * - * For convenience, this class provides a timer. - * - * \tparam _TreeLoader The tree loader that was used. - * - * \note It is not mandatory that a loader inherit from this class. It must - * however follow the aforementioned rules. - */ -template<class _TreeLoader> -class FKernelLoader { - /// The tree loader that was used (see FTreeLoader). - using TreeLoader = _TreeLoader; -public: - /// A timer - FTic time; -}; - -/** - * \brief Base class for algorithm loaders. - * - * This class itself does not provide anything but a base on which to build - * algorithm loaders. A kernel loader should satisfy the following rules. - * - * - Define the public typedefs : TreeLoader, KernelLoader. - * - Provide public acces to a member of type - * \link TreeLoader Treeloader::OctreeClass* \endlink` _algo` - * as the algorithm that is loaded. 
This pointer should be valid from the - * end of the ::run method to the destruction of the loader. - * - It may provide any other members or typdefs. - * - * For convenience, this class provides a timer. - * - * \tparam _TreeLoader The tree loader that was used. - * \tparam _KernelLoader The kernel loader *template* that was used, the - * KernelLoader type will then be _KernelLoader<_TreeLoader>. - * - * \note It is not mandatory that a loader inherit from this class. It must - * however follow the aforementioned rules. - */ -template <class _TreeLoader, template<typename> class _KernelLoader> -class FAlgoLoader { - /// The tree loader that was used (see FTreeLoader). - using TreeLoader = _TreeLoader; - /// The kernel loader that was used (see FKernelLoader). - using KernelLoader = _KernelLoader<TreeLoader>; -public: - /// A timer. - FTic time; - - /// Method that runs the algorithm. - virtual void run() = 0; - - /// Additionnal information for specific algorithm loader. - /** - * The string should be formated as a key:value list separated by spaces. - * For instance : "key1:value1 key2:value2 ". It may be a good idea to add a - * space at the end of the string. - */ - virtual std::string getRunInfoString() const { - return ""; - } -}; - - - - - -#endif diff --git a/Tests/noDist/PerfTest/TreeLoaderBasic.hpp b/Tests/noDist/PerfTest/TreeLoaderBasic.hpp deleted file mode 100644 index c130c5d1968f95ec0b0c95e52996b87369af9a10..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest/TreeLoaderBasic.hpp +++ /dev/null @@ -1,68 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - -#ifndef _TREELOADERBASIC_HPP_ -#define _TREELOADERBASIC_HPP_ - -#include "PerfTestUtils.hpp" - -#include "Containers/FOctree.hpp" -#include "Components/FSimpleLeaf.hpp" -#include "Kernels/P2P/FP2PParticleContainerIndexed.hpp" - -#include "BalanceTree/FCostCell.hpp" - -/** - * \brief Basic tree loader. - * - * See FTreeLoader documentation. 
- */ -template <typename _FReal, typename _BaseClass> -class TreeLoaderBasic : public FTreeLoader { -public: - using FReal = _FReal; - using BaseClass = _BaseClass; - - // Required type definitions. - using CellClass = FCostCell<BaseClass>; - using ContainerClass = FP2PParticleContainerIndexed<FReal>; - using LeafClass = FSimpleLeaf<FReal, ContainerClass >; - using OctreeClass = FOctree<FReal, CellClass, ContainerClass, LeafClass>; - - /// File loader. - FFmaGenericLoader<FReal> _loader; - /// Required tree member. - OctreeClass _tree; - - /// Constructs the loader and loads the tree. - TreeLoaderBasic(FPerfTestParams& params): - _loader(params.filename), - _tree(params.treeHeight, - params.subTreeHeight, - _loader.getBoxWidth(), - _loader.getCenterOfBox()) { - this->loadTree(); - } - - virtual void loadTree() { - std::cout << "Creating & inserting particles" << std::flush; - - time.tic(); - - FPoint<FReal> position; - FReal physicalValue = 0.0; - for(FSize idxPart = 0 ; idxPart < _loader.getNumberOfParticles() ; ++idxPart) { - // Read particle per particle from file - _loader.fillParticle(&position,&physicalValue); - // put particle in octree - _tree.insert(position, idxPart, physicalValue); - } - - time.tac(); - std::cout << " Done (" << time.elapsed() << " s)." << std::endl; - } - -}; - -#endif diff --git a/Tests/noDist/PerfTest/TreeLoaderFCheb.hpp b/Tests/noDist/PerfTest/TreeLoaderFCheb.hpp deleted file mode 100644 index 8821d98767f9a271addc6e2776e6e952a2f68bd0..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest/TreeLoaderFCheb.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - -#ifndef _TREELOADERFCHEB_HPP_ -#define _TREELOADERFCHEB_HPP_ - -#include "Kernels/Chebyshev/FChebCell.hpp" - -#include "TreeLoaderBasic.hpp" - - -/** - * \brief Tree loader for a Chebyshev cell type tree. - * - * See FTreeLoader and TreeLoaderBasic documentation. 
- */ -template <typename _FReal, int _ORDER> -class TreeLoaderFCheb : public TreeLoaderBasic<_FReal, FChebCell<_FReal, _ORDER> > { -public: - - enum {ORDER=_ORDER}; - - /// Constructs the loader and loads the tree. - TreeLoaderFCheb(FPerfTestParams& params): - TreeLoaderBasic<_FReal, FChebCell<_FReal, _ORDER>>(params) - {} - -}; - -#endif diff --git a/Tests/noDist/PerfTest/TreeLoaderMpiGeneric.hpp b/Tests/noDist/PerfTest/TreeLoaderMpiGeneric.hpp deleted file mode 100644 index 4775a9f6d6d0f1be09c6ba2aae71ba8ba4bdcac2..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest/TreeLoaderMpiGeneric.hpp +++ /dev/null @@ -1,132 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - -#ifndef _TREELOADERMPIGENERIC_HPP_ -#define _TREELOADERMPIGENERIC_HPP_ - -#include "PerfTestUtils.hpp" -#include "Utils/FMpi.hpp" - -#include "Kernels/P2P/FP2PParticleContainerIndexed.hpp" -#include "BalanceTree/FCostCell.hpp" -#include "Components/FSimpleLeaf.hpp" -#include "Containers/FOctree.hpp" - -#include "Files/FFmaGenericLoader.hpp" -#include "Files/FMpiFmaGenericLoader.hpp" -#include "Files/FMpiTreeBuilder.hpp" - - -/** - * \brief Genericted FMA file tree loader. - * - * See FTreeLoader documentation. - */ -template <typename _FReal, class _BaseClass> -class TreeLoaderMpiGeneric : public FTreeLoader { -public: - using FReal = _FReal; - - // Required type definitions. - using BaseClass = _BaseClass; - using CellClass = FCostCell<BaseClass>; - using ContainerClass = FP2PParticleContainerIndexed<FReal>; - using LeafClass = FSimpleLeaf<FReal, ContainerClass >; - using OctreeClass = FOctree<FReal, CellClass, ContainerClass, LeafClass>; - - /// MPI applcation context. - FMpi* _mpiContext; - /// File loader. - FMpiFmaGenericLoader<FReal> _loader; - /// Required tree member. - OctreeClass _tree; - - /* Mock particle structure to balance the tree over the processes. */ - struct TestParticle{ - FSize index; // Index of the particle in the original file. 
- FPoint<FReal> position; // Spatial position of the particle. - FReal physicalValue; // Physical value of the particle. - /* Returns the particle position. */ - const FPoint<FReal>& getPosition(){ - return position; - } - }; - - - /// Constructs the loader and loads the tree. - TreeLoaderMpiGeneric(FPerfTestParams& params): - _mpiContext(params.mpiContext), - _loader(params.filename, _mpiContext->global()), - _tree(params.treeHeight, - params.subTreeHeight, - _loader.getBoxWidth(), - _loader.getCenterOfBox()) { - this->loadTree(); - } - - void loadTree() { - if( 0 == _mpiContext->global().processId()) - std::cout << "Creating & inserting particles" << std::flush; - - time.tic(); - - // Temporary array of particles read by this process. - TestParticle* particles = new TestParticle[_loader.getMyNumberOfParticles()]; - memset(particles, 0, (sizeof(TestParticle) * _loader.getMyNumberOfParticles())); - - // Index (in file) of the first particle that will be read by this process. - FSize idxStart = _loader.getStart(); - - // Read particles from parts. - for(FSize idxPart = 0 ; idxPart < _loader.getMyNumberOfParticles() ; ++idxPart){ - // Store the index (in the original file) the particle. - particles[idxPart].index = idxPart + idxStart; - // Read particle from file - _loader.fillParticle(&particles[idxPart].position, - &particles[idxPart].physicalValue); - } - - // Final vector of particles - FVector<TestParticle> finalParticles; - FLeafBalance balancer; - - // Redistribute particules between processes - FMpiTreeBuilder< FReal, TestParticle >:: - DistributeArrayToContainer(_mpiContext->global(), - particles, - _loader.getMyNumberOfParticles(), - _tree.getBoxCenter(), - _tree.getBoxWidth(), - _tree.getHeight(), - &finalParticles, - &balancer); - - // Free temporary array memory. - delete[] particles; - - // Insert final particles into tree. 
- for(FSize idx = 0 ; idx < finalParticles.getSize(); ++idx){ - _tree.insert(finalParticles[idx].position, - finalParticles[idx].index, - finalParticles[idx].physicalValue); - } - - time.tac(); - double elapsedTime = time.elapsed(), minTime, maxTime; - - MPI_Reduce(&elapsedTime,&minTime,1,MPI_DOUBLE,MPI_MIN,0,_mpiContext->global().getComm()); - MPI_Reduce(&elapsedTime,&maxTime,1,MPI_DOUBLE,MPI_MAX,0,_mpiContext->global().getComm()); - - if( 0 == _mpiContext->global().processId()) { - std::cout << " Done ( min-time:" << minTime - << " max-time:" << maxTime - << " )" - << std::endl; - - } - } - -}; - -#endif diff --git a/Tests/noDist/PerfTest/TreeLoaderMpiGenericFCheb.hpp b/Tests/noDist/PerfTest/TreeLoaderMpiGenericFCheb.hpp deleted file mode 100644 index 745126125905d87821f34333998fd59d53fa27ed..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest/TreeLoaderMpiGenericFCheb.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - -#ifndef _TREELOADERMPIGENERICFCHEB_HPP_ -#define _TREELOADERMPIGENERICFCHEB_HPP_ - -#include "Kernels/Chebyshev/FChebCell.hpp" - -#include "TreeLoaderMpiGeneric.hpp" - - -/** - * \brief Tree loader for a Chebyshev cell type tree. - * - * See FTreeLoader and TreeLoaderBasic documentation. - */ -template <typename _FReal, int _ORDER> -class TreeLoaderMpiGenericFCheb : public TreeLoaderMpiGeneric<_FReal, FChebCell<_FReal, _ORDER> > { -public: - - enum {ORDER=_ORDER}; - - /// Constructs the loader and loads the tree. 
- TreeLoaderMpiGenericFCheb(FPerfTestParams& params): - TreeLoaderMpiGeneric<_FReal, FChebCell<_FReal, _ORDER>>(params) - {} - -}; - -#endif diff --git a/Tests/noDist/PerfTest/TreeLoaderMpiSplit.hpp b/Tests/noDist/PerfTest/TreeLoaderMpiSplit.hpp deleted file mode 100644 index d38697fe3289086bb3ef4c8e371142c9c148aa43..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest/TreeLoaderMpiSplit.hpp +++ /dev/null @@ -1,79 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - -#ifndef _TREELOADERMPISPLIT_HPP_ -#define _TREELOADERMPISPLIT_HPP_ - -#include "PerfTestUtils.hpp" -#include "Utils/FMpi.hpp" - -#include "Kernels/P2P/FP2PParticleContainerIndexed.hpp" -#include "BalanceTree/FCostCell.hpp" -#include "Components/FSimpleLeaf.hpp" -#include "Containers/FOctree.hpp" - -#include "Files/FMpiSplitFmaLoader.hpp" - - -/** - * \brief Splitted FMA file tree loader. - * - * See FTreeLoader documentation. - */ -template <typename _FReal, class _BaseClass> -class TreeLoaderMpiSplit : public FTreeLoader { -public: - using FReal = _FReal; - - // Required type definitions. - using BaseClass = _BaseClass; - using CellClass = FCostCell<BaseClass>; - using ContainerClass = FP2PParticleContainerIndexed<FReal>; - using LeafClass = FSimpleLeaf<FReal, ContainerClass >; - using OctreeClass = FOctree<FReal, CellClass, ContainerClass, LeafClass>; - - /// Mpi application context - FMpi* _mpiContext; - /// File loader. - FMpiSplitFmaLoader<FReal> _loader; - /// Required tree member. - OctreeClass _tree; - - /// Constructs the loader and loads the tree. 
- TreeLoaderMpiSplit(FPerfTestParams& params): - _mpiContext(params.mpiContext), - _loader(params.filename,_mpiContext->global().processId()), - _tree(params.treeHeight, - params.subTreeHeight, - _loader.getBoxWidth(), - _loader.getCenterOfBox()) { - if( nullptr == _mpiContext ) { - std::cerr << "No MPI context available" << std::endl; - exit(-1); - } - - this->loadTree(); - } - - void loadTree() { - std::cout << "Creating & inserting particles" << std::flush; - - time.tic(); - - FPoint<FReal> position; - FReal physicalValue = 0.0; - for(FSize idxPart = 0 ; idxPart < _loader.getMyNumberOfParticles() ; ++idxPart) { - // Read particle per particle from file - _loader.fillParticle(&position,&physicalValue); - // put particle in octree - _tree.insert(position, idxPart, physicalValue); - } - - time.tac(); - std::cout << " Done (" << time.elapsed() << " s)." << std::endl; - } - -}; - -#endif diff --git a/Tests/noDist/PerfTest/TreeLoaderMpiSplitFCheb.hpp b/Tests/noDist/PerfTest/TreeLoaderMpiSplitFCheb.hpp deleted file mode 100644 index 2ede231989c0c12714cacfbbc3fa3705787234f0..0000000000000000000000000000000000000000 --- a/Tests/noDist/PerfTest/TreeLoaderMpiSplitFCheb.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// ==== CMAKE ==== -// Keep in private GIT -// @SCALFMM_PRIVATE - -#ifndef _TREELOADERMPISPLITFCHEB_HPP_ -#define _TREELOADERMPISPLITFCHEB_HPP_ - -#include "Kernels/Chebyshev/FChebCell.hpp" - -#include "TreeLoaderMpiSplit.hpp" - - -/** - * \brief Tree loader for a Chebyshev cell type tree. - * - * See FTreeLoader and TreeLoaderBasic documentation. - */ -template <typename _FReal, int _ORDER> -class TreeLoaderMpiSplitFCheb : public TreeLoaderMpiSplit<_FReal, FChebCell<_FReal, _ORDER> > { -public: - - enum {ORDER=_ORDER}; - - /// Constructs the loader and loads the tree. 
- TreeLoaderMpiSplitFCheb(FPerfTestParams& params): - TreeLoaderMpiSplit<_FReal, FChebCell<_FReal, _ORDER>>(params) - {} - -}; - -#endif diff --git a/Tests/noDist/testFmmAlgorithmBalanced.cpp b/Tests/noDist/testFmmAlgorithmBalanced.cpp index 8259ba6ab91fb867b4f1462f52211da2717ccef6..10d84e5cfb67f9729a014343b059333c415d1d77 100644 --- a/Tests/noDist/testFmmAlgorithmBalanced.cpp +++ b/Tests/noDist/testFmmAlgorithmBalanced.cpp @@ -17,6 +17,7 @@ // ==== CMAKE ==== // Keep in private GIT // @SCALFMM_PRIVATE +// @FUSE_BLAS #include <string> diff --git a/Tests/noDist/testSphericalDebug.cpp b/Tests/noDist/testSphericalDebug.cpp index f72ca2a398c3b97901acc8d39e688e5b701a489a..0956b8b69018243e233b3e082f67ed95b0468ff3 100644 --- a/Tests/noDist/testSphericalDebug.cpp +++ b/Tests/noDist/testSphericalDebug.cpp @@ -16,7 +16,7 @@ // Keep in private GIT // @SCALFMM_PRIVATE - +// @FUSE_BLAS #define DEBUG_SPHERICAL_M2L #define BLAS_SPHERICAL_COMPRESS #define BLAS_M2L_P diff --git a/UTests/utestChebyshevDirectTsm.cpp b/UTests/utestChebyshevDirectTsm.cpp index c0c32fa5b883214dc8e09397bc070f6ae4bbb56f..215a437b756a1f549921c4be7b6cbc57a8297795 100644 --- a/UTests/utestChebyshevDirectTsm.cpp +++ b/UTests/utestChebyshevDirectTsm.cpp @@ -69,7 +69,7 @@ class TestChebyshevDirectTsm : public FUTester<TestChebyshevDirectTsm> { FPoint<FReal> position; loader.fillParticle(&position); // put in tree - tree.insert(position, FParticleTypeTarget, idxPart, physicalValue); + tree.insert(position, FParticleType::FParticleTypeTarget, idxPart, physicalValue); // get copy particlesTargets[idxPart].setPosition(position); *(particlesTargets[idxPart].setPhysicalValue()) = physicalValue; @@ -84,7 +84,7 @@ class TestChebyshevDirectTsm : public FUTester<TestChebyshevDirectTsm> { FPoint<FReal> position; loader.fillParticle(&position); // put in tree - tree.insert(position, FParticleTypeSource, idxPart, physicalValue); + tree.insert(position, FParticleType::FParticleTypeSource, idxPart, physicalValue); // get copy 
particlesSources[idxPart].setPosition(position); *(particlesSources[idxPart].setPhysicalValue()) = physicalValue; diff --git a/UTests/utestRotationDirectTsm.cpp b/UTests/utestRotationDirectTsm.cpp index 3ea1f79bedb27a20ce7dd54de70ad497a519ff66..7e1122fdc1cc3bb08d32b13cd6230b2a6fbac143 100644 --- a/UTests/utestRotationDirectTsm.cpp +++ b/UTests/utestRotationDirectTsm.cpp @@ -68,7 +68,7 @@ class TestRotationDirectTsm : public FUTester<TestRotationDirectTsm> { FPoint<FReal> position; loader.fillParticle(&position); // put in tree - tree.insert(position, FParticleTypeTarget, idxPart, physicalValue); + tree.insert(position, FParticleType::FParticleTypeTarget, idxPart, physicalValue); // get copy particlesTargets[idxPart].setPosition(position); *(particlesTargets[idxPart].setPhysicalValue()) = physicalValue; @@ -83,7 +83,7 @@ class TestRotationDirectTsm : public FUTester<TestRotationDirectTsm> { FPoint<FReal> position; loader.fillParticle(&position); // put in tree - tree.insert(position, FParticleTypeSource, idxPart, physicalValue); + tree.insert(position, FParticleType::FParticleTypeSource, idxPart, physicalValue); // get copy particlesSources[idxPart].setPosition(position); *(particlesSources[idxPart].setPhysicalValue()) = physicalValue; diff --git a/Utils/noDist/FmmAlgorithmTsm.cpp b/Utils/noDist/FmmAlgorithmTsm.cpp index 3731ce86aaf0c7c2765a065e9218b27c2c5ee524..3579231f9bf1bfacabd62b105f72cbc304282d4e 100644 --- a/Utils/noDist/FmmAlgorithmTsm.cpp +++ b/Utils/noDist/FmmAlgorithmTsm.cpp @@ -152,7 +152,7 @@ struct TempMainStruct{ { // Insert sources - FParticleType particleType, source = FParticleTypeSource; + FParticleType particleType, source = FParticleType::FParticleTypeSource; for(FSize idxPart = 0 ; idxPart < nbSRC ; ++idxPart){ loader.fillParticle(&particlePosition, &particleType); // std::cout << idxPart << " " << particlePosition << " type " << particleType @@ -175,9 +175,9 @@ struct TempMainStruct{ // int nbTargets = 256; for(FSize idxPart = 0 ; idxPart < 
nbTargets; ++idxPart){ particlePosition2.incX(dx); - std::cout << idxPart << " " <<particlePosition2.getX()/dimLeaf<< " " << particlePosition2 << " type " << FParticleTypeTarget + std::cout << idxPart << " " <<particlePosition2.getX()/dimLeaf<< " " << particlePosition2 << " type " << static_cast<int>(FParticleType::FParticleTypeTarget) << " " <<physicalValue<<std::endl; - tree.insert(particlePosition2, FParticleTypeTarget,idxPart,physicalValue ); + tree.insert(particlePosition2, FParticleType::FParticleTypeTarget,idxPart,physicalValue ); } } diff --git a/Utils/python/readGeod.py b/Utils/python/readGeod.py new file mode 100644 index 0000000000000000000000000000000000000000..496f951b6a2bc5a9fde5ec880105c5a42e001bda --- /dev/null +++ b/Utils/python/readGeod.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Jun 3 10:28:29 2016 + +@author: coulaud +""" + +import random +import numpy +import math + + +meshFile="Vega_Z09_RR1_sans_fils.geod" +fmmFile="Vega_Z09_RR1_sans_fils.fma" + +ptfile = open(fmmFile,'w') +ptfile.write("8 4 \n") + +Fichier = open(meshFile,'r') +line = Fichier.readline() +line = Fichier.readline() +# mear size triangles an points +line = Fichier.readline() +size = line.rstrip('\n\r').split() +print(line) +Npt = int(size[1]) +NT = int(size[0]) +x = numpy.zeros([Npt,3]) +for i in range(Npt): + line = Fichier.readline() + size = line.rstrip('\n\r').split() + x[i,0] = float(size[1]) + x[i,1] = float(size[2]) + x[i,2] = float(size[3]) + +a = numpy.amin(x,axis=0) +b = numpy.amax(x,axis=0) +print(a) +print(b) +length = math.ceil(max(b-a)) +centre= (a+b)/2 +print('Centre: ',centre) +print('length: ',length,max(b-a)) +ptfile.write(str(Npt)+' ' + str((length)/2) + ' '+ str(centre[0]) + + ' '+ str(centre[1])+ ' '+ str(centre[2]) +"\n" ) + +for i in range(Npt): + rho = 2*random.random()-1 + str1 = str(x[i,0])+ ' ' + str(x[i,1])+' ' +str(x[i,2])+ ' ' + str(rho) +"\n" + ptfile.write(str1) +ptfile.close() \ No newline at end of file