Mentions légales du service
Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
ScalFMM
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
solverstack
ScalFMM
Commits
0014e497
Commit
0014e497
authored
11 years ago
by
Olivier COULAUD
Browse files
Options
Downloads
Plain Diff
Merge branch 'master' of
git+ssh://scm.gforge.inria.fr//gitroot/scalfmm/scalfmm
parents
dd8edc44
4e389e08
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Src/Kernels/P2P/FP2PAvx.hpp
+244
-1
244 additions, 1 deletion
Src/Kernels/P2P/FP2PAvx.hpp
with
244 additions
and
1 deletion
Src/Kernels/P2P/FP2PAvx.hpp
+
244
−
1
View file @
0014e497
...
...
@@ -258,8 +258,251 @@ namespace FP2P{
}
#else //Float, ScalFMM_USE_DOUBLE_PRECISION not set
template
<
class
ContainerClass
>
static
void
FullMutual
(
ContainerClass
*
const
FRestrict
inTargets
,
ContainerClass
*
const
inNeighbors
[],
const
int
limiteNeighbors
){
const
int
nbParticlesTargets
=
inTargets
->
getNbParticles
();
const
FReal
*
const
targetsPhysicalValues
=
inTargets
->
getPhysicalValues
();
const
FReal
*
const
targetsX
=
inTargets
->
getPositions
()[
0
];
const
FReal
*
const
targetsY
=
inTargets
->
getPositions
()[
1
];
const
FReal
*
const
targetsZ
=
inTargets
->
getPositions
()[
2
];
FReal
*
const
targetsForcesX
=
inTargets
->
getForcesX
();
FReal
*
const
targetsForcesY
=
inTargets
->
getForcesY
();
FReal
*
const
targetsForcesZ
=
inTargets
->
getForcesZ
();
FReal
*
const
targetsPotentials
=
inTargets
->
getPotentials
();
const
__m256
mOne
=
_mm256_set1_ps
(
1.0
);
for
(
int
idxNeighbors
=
0
;
idxNeighbors
<
limiteNeighbors
;
++
idxNeighbors
){
if
(
inNeighbors
[
idxNeighbors
]
){
const
int
nbParticlesSources
=
(
inNeighbors
[
idxNeighbors
]
->
getNbParticles
()
+
7
)
/
8
;
const
__m256
*
const
sourcesPhysicalValues
=
(
const
__m256
*
)
inNeighbors
[
idxNeighbors
]
->
getPhysicalValues
();
const
__m256
*
const
sourcesX
=
(
const
__m256
*
)
inNeighbors
[
idxNeighbors
]
->
getPositions
()[
0
];
const
__m256
*
const
sourcesY
=
(
const
__m256
*
)
inNeighbors
[
idxNeighbors
]
->
getPositions
()[
1
];
const
__m256
*
const
sourcesZ
=
(
const
__m256
*
)
inNeighbors
[
idxNeighbors
]
->
getPositions
()[
2
];
__m256
*
const
sourcesForcesX
=
(
__m256
*
)
inNeighbors
[
idxNeighbors
]
->
getForcesX
();
__m256
*
const
sourcesForcesY
=
(
__m256
*
)
inNeighbors
[
idxNeighbors
]
->
getForcesY
();
__m256
*
const
sourcesForcesZ
=
(
__m256
*
)
inNeighbors
[
idxNeighbors
]
->
getForcesZ
();
__m256
*
const
sourcesPotentials
=
(
__m256
*
)
inNeighbors
[
idxNeighbors
]
->
getPotentials
();
for
(
int
idxTarget
=
0
;
idxTarget
<
nbParticlesTargets
;
++
idxTarget
){
const
__m256
tx
=
_mm256_broadcast_ss
(
&
targetsX
[
idxTarget
]);
const
__m256
ty
=
_mm256_broadcast_ss
(
&
targetsY
[
idxTarget
]);
const
__m256
tz
=
_mm256_broadcast_ss
(
&
targetsZ
[
idxTarget
]);
const
__m256
tv
=
_mm256_broadcast_ss
(
&
targetsPhysicalValues
[
idxTarget
]);
__m256
tfx
=
_mm256_setzero_ps
();
__m256
tfy
=
_mm256_setzero_ps
();
__m256
tfz
=
_mm256_setzero_ps
();
__m256
tpo
=
_mm256_setzero_ps
();
for
(
int
idxSource
=
0
;
idxSource
<
nbParticlesSources
;
++
idxSource
){
__m256
dx
=
sourcesX
[
idxSource
]
-
tx
;
__m256
dy
=
sourcesY
[
idxSource
]
-
ty
;
__m256
dz
=
sourcesZ
[
idxSource
]
-
tz
;
__m256
inv_square_distance
=
mOne
/
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
const
__m256
inv_distance
=
_mm256_rsqrt_ps
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
inv_square_distance
*=
inv_distance
;
inv_square_distance
*=
tv
*
sourcesPhysicalValues
[
idxSource
];
dx
*=
inv_square_distance
;
dy
*=
inv_square_distance
;
dz
*=
inv_square_distance
;
tfx
+=
dx
;
tfy
+=
dy
;
tfz
+=
dz
;
tpo
+=
inv_distance
*
sourcesPhysicalValues
[
idxSource
];
sourcesForcesX
[
idxSource
]
-=
dx
;
sourcesForcesY
[
idxSource
]
-=
dy
;
sourcesForcesZ
[
idxSource
]
-=
dz
;
sourcesPotentials
[
idxSource
]
+=
inv_distance
*
tv
;
}
__attribute__
((
aligned
(
32
)))
float
buffer
[
8
];
_mm256_store_ps
(
buffer
,
tfx
);
targetsForcesX
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
]
+
buffer
[
4
]
+
buffer
[
5
]
+
buffer
[
6
]
+
buffer
[
7
];
_mm256_store_ps
(
buffer
,
tfy
);
targetsForcesY
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
]
+
buffer
[
4
]
+
buffer
[
5
]
+
buffer
[
6
]
+
buffer
[
7
];
_mm256_store_ps
(
buffer
,
tfz
);
targetsForcesZ
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
]
+
buffer
[
4
]
+
buffer
[
5
]
+
buffer
[
6
]
+
buffer
[
7
];
_mm256_store_ps
(
buffer
,
tpo
);
targetsPotentials
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
]
+
buffer
[
4
]
+
buffer
[
5
]
+
buffer
[
6
]
+
buffer
[
7
];
}
}
}
{
const
int
nbParticlesSources
=
(
nbParticlesTargets
+
7
)
/
8
;
const
__m256
*
const
sourcesPhysicalValues
=
(
const
__m256
*
)
targetsPhysicalValues
;
const
__m256
*
const
sourcesX
=
(
const
__m256
*
)
targetsX
;
const
__m256
*
const
sourcesY
=
(
const
__m256
*
)
targetsY
;
const
__m256
*
const
sourcesZ
=
(
const
__m256
*
)
targetsZ
;
__m256
*
const
sourcesForcesX
=
(
__m256
*
)
targetsForcesX
;
__m256
*
const
sourcesForcesY
=
(
__m256
*
)
targetsForcesY
;
__m256
*
const
sourcesForcesZ
=
(
__m256
*
)
targetsForcesZ
;
__m256
*
const
sourcesPotentials
=
(
__m256
*
)
targetsPotentials
;
for
(
int
idxTarget
=
0
;
idxTarget
<
nbParticlesTargets
;
++
idxTarget
){
const
__m256
tx
=
_mm256_broadcast_ss
(
&
targetsX
[
idxTarget
]);
const
__m256
ty
=
_mm256_broadcast_ss
(
&
targetsY
[
idxTarget
]);
const
__m256
tz
=
_mm256_broadcast_ss
(
&
targetsZ
[
idxTarget
]);
const
__m256
tv
=
_mm256_broadcast_ss
(
&
targetsPhysicalValues
[
idxTarget
]);
__m256
tfx
=
_mm256_setzero_ps
();
__m256
tfy
=
_mm256_setzero_ps
();
__m256
tfz
=
_mm256_setzero_ps
();
__m256
tpo
=
_mm256_setzero_ps
();
for
(
int
idxSource
=
(
idxTarget
+
2
)
/
2
;
idxSource
<
nbParticlesSources
;
++
idxSource
){
__m256
dx
=
sourcesX
[
idxSource
]
-
tx
;
__m256
dy
=
sourcesY
[
idxSource
]
-
ty
;
__m256
dz
=
sourcesZ
[
idxSource
]
-
tz
;
__m256
inv_square_distance
=
mOne
/
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
const
__m256
inv_distance
=
_mm256_rsqrt_ps
(
inv_square_distance
);
inv_square_distance
*=
inv_distance
;
inv_square_distance
*=
tv
*
sourcesPhysicalValues
[
idxSource
];
dx
*=
inv_square_distance
;
dy
*=
inv_square_distance
;
dz
*=
inv_square_distance
;
tfx
+=
dx
;
tfy
+=
dy
;
tfz
+=
dz
;
tpo
+=
inv_distance
*
sourcesPhysicalValues
[
idxSource
];
sourcesForcesX
[
idxSource
]
-=
dx
;
sourcesForcesY
[
idxSource
]
-=
dy
;
sourcesForcesZ
[
idxSource
]
-=
dz
;
sourcesPotentials
[
idxSource
]
+=
inv_distance
*
tv
;
}
__attribute__
((
aligned
(
32
)))
float
buffer
[
8
];
_mm256_store_ps
(
buffer
,
tfx
);
targetsForcesX
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
]
+
buffer
[
4
]
+
buffer
[
5
]
+
buffer
[
6
]
+
buffer
[
7
];
_mm256_store_ps
(
buffer
,
tfy
);
targetsForcesY
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
]
+
buffer
[
4
]
+
buffer
[
5
]
+
buffer
[
6
]
+
buffer
[
7
];
#error("NOT IMPLMEENTED")
_mm256_store_ps
(
buffer
,
tfz
);
targetsForcesZ
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
]
+
buffer
[
4
]
+
buffer
[
5
]
+
buffer
[
6
]
+
buffer
[
7
];
_mm256_store_ps
(
buffer
,
tpo
);
targetsPotentials
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
]
+
buffer
[
4
]
+
buffer
[
5
]
+
buffer
[
6
]
+
buffer
[
7
];
}
}
for
(
int
idxTarget
=
0
;
idxTarget
<
nbParticlesTargets
;
idxTarget
+=
4
){
for
(
int
idxClose
=
1
;
idxClose
<
4
;
++
idxClose
){
const
int
idxSource
=
idxTarget
+
idxClose
;
FReal
dx
=
targetsX
[
idxSource
]
-
targetsX
[
idxTarget
];
FReal
dy
=
targetsY
[
idxSource
]
-
targetsY
[
idxTarget
];
FReal
dz
=
targetsZ
[
idxSource
]
-
targetsZ
[
idxTarget
];
FReal
inv_square_distance
=
FReal
(
1.0
)
/
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
const
FReal
inv_distance
=
FMath
::
Sqrt
(
inv_square_distance
);
inv_square_distance
*=
inv_distance
;
inv_square_distance
*=
targetsPhysicalValues
[
idxTarget
]
*
targetsPhysicalValues
[
idxSource
];
dx
*=
inv_square_distance
;
dy
*=
inv_square_distance
;
dz
*=
inv_square_distance
;
targetsForcesX
[
idxTarget
]
+=
dx
;
targetsForcesY
[
idxTarget
]
+=
dy
;
targetsForcesZ
[
idxTarget
]
+=
dz
;
targetsPotentials
[
idxTarget
]
+=
inv_distance
*
targetsPhysicalValues
[
idxSource
];
targetsForcesX
[
idxSource
]
-=
dx
;
targetsForcesY
[
idxSource
]
-=
dy
;
targetsForcesZ
[
idxSource
]
-=
dz
;
targetsPotentials
[
idxSource
]
+=
inv_distance
*
targetsPhysicalValues
[
idxTarget
];
}
}
}
template
<
class
ContainerClass
>
static
void
FullRemote
(
ContainerClass
*
const
FRestrict
inTargets
,
ContainerClass
*
const
inNeighbors
[],
const
int
limiteNeighbors
){
const
int
nbParticlesTargets
=
inTargets
->
getNbParticles
();
const
FReal
*
const
targetsPhysicalValues
=
inTargets
->
getPhysicalValues
();
const
FReal
*
const
targetsX
=
inTargets
->
getPositions
()[
0
];
const
FReal
*
const
targetsY
=
inTargets
->
getPositions
()[
1
];
const
FReal
*
const
targetsZ
=
inTargets
->
getPositions
()[
2
];
FReal
*
const
targetsForcesX
=
inTargets
->
getForcesX
();
FReal
*
const
targetsForcesY
=
inTargets
->
getForcesY
();
FReal
*
const
targetsForcesZ
=
inTargets
->
getForcesZ
();
FReal
*
const
targetsPotentials
=
inTargets
->
getPotentials
();
const
__m256
mOne
=
_mm256_set1_ps
(
1.0
);
for
(
int
idxNeighbors
=
0
;
idxNeighbors
<
limiteNeighbors
;
++
idxNeighbors
){
if
(
inNeighbors
[
idxNeighbors
]
){
const
int
nbParticlesSources
=
(
inNeighbors
[
idxNeighbors
]
->
getNbParticles
()
+
7
)
/
8
;
const
__m256
*
const
sourcesPhysicalValues
=
(
const
__m256
*
)
inNeighbors
[
idxNeighbors
]
->
getPhysicalValues
();
const
__m256
*
const
sourcesX
=
(
const
__m256
*
)
inNeighbors
[
idxNeighbors
]
->
getPositions
()[
0
];
const
__m256
*
const
sourcesY
=
(
const
__m256
*
)
inNeighbors
[
idxNeighbors
]
->
getPositions
()[
1
];
const
__m256
*
const
sourcesZ
=
(
const
__m256
*
)
inNeighbors
[
idxNeighbors
]
->
getPositions
()[
2
];
for
(
int
idxTarget
=
0
;
idxTarget
<
nbParticlesTargets
;
++
idxTarget
){
const
__m256
tx
=
_mm256_broadcast_ss
(
&
targetsX
[
idxTarget
]);
const
__m256
ty
=
_mm256_broadcast_ss
(
&
targetsY
[
idxTarget
]);
const
__m256
tz
=
_mm256_broadcast_ss
(
&
targetsZ
[
idxTarget
]);
const
__m256
tv
=
_mm256_broadcast_ss
(
&
targetsPhysicalValues
[
idxTarget
]);
__m256
tfx
=
_mm256_setzero_ps
();
__m256
tfy
=
_mm256_setzero_ps
();
__m256
tfz
=
_mm256_setzero_ps
();
__m256
tpo
=
_mm256_setzero_ps
();
for
(
int
idxSource
=
0
;
idxSource
<
nbParticlesSources
;
++
idxSource
){
__m256
dx
=
sourcesX
[
idxSource
]
-
tx
;
__m256
dy
=
sourcesY
[
idxSource
]
-
ty
;
__m256
dz
=
sourcesZ
[
idxSource
]
-
tz
;
__m256
inv_square_distance
=
mOne
/
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
const
__m256
inv_distance
=
_mm256_rsqrt_ps
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
inv_square_distance
*=
inv_distance
;
inv_square_distance
*=
tv
*
sourcesPhysicalValues
[
idxSource
];
dx
*=
inv_square_distance
;
dy
*=
inv_square_distance
;
dz
*=
inv_square_distance
;
tfx
+=
dx
;
tfy
+=
dy
;
tfz
+=
dz
;
tpo
+=
inv_distance
*
sourcesPhysicalValues
[
idxSource
];
}
__attribute__
((
aligned
(
32
)))
float
buffer
[
8
];
_mm256_store_ps
(
buffer
,
tfx
);
targetsForcesX
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
]
+
buffer
[
4
]
+
buffer
[
5
]
+
buffer
[
6
]
+
buffer
[
7
];
_mm256_store_ps
(
buffer
,
tfy
);
targetsForcesY
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
]
+
buffer
[
4
]
+
buffer
[
5
]
+
buffer
[
6
]
+
buffer
[
7
];
_mm256_store_ps
(
buffer
,
tfz
);
targetsForcesZ
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
]
+
buffer
[
4
]
+
buffer
[
5
]
+
buffer
[
6
]
+
buffer
[
7
];
_mm256_store_ps
(
buffer
,
tpo
);
targetsPotentials
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
]
+
buffer
[
4
]
+
buffer
[
5
]
+
buffer
[
6
]
+
buffer
[
7
];
}
}
}
}
#endif
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment