Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
solverstack
ScalFMM
Commits
1a642434
Commit
1a642434
authored
Oct 01, 2013
by
BRAMAS Berenger
Browse files
add sse for simple precision should be tested
parent
7e9eb4a6
Changes
2
Hide whitespace changes
Inline
Side-by-side
Src/Kernels/P2P/FP2P.hpp
View file @
1a642434
...
...
@@ -102,6 +102,8 @@ public:
}
}
#else
#ifdef ScalFMM_USE_DOUBLE_PRECISION
template
<
class
ContainerClass
>
static
void
FullMutual
(
ContainerClass
*
const
FRestrict
inTargets
,
ContainerClass
*
const
inNeighbors
[],
const
int
limiteNeighbors
){
...
...
@@ -273,6 +275,182 @@ public:
targetsPotentials
[
idxSource
]
+=
inv_distance
*
targetsPhysicalValues
[
idxTarget
];
}
}
#else
template
<
class
ContainerClass
>
static
void
FullMutual
(
ContainerClass
*
const
FRestrict
inTargets
,
ContainerClass
*
const
inNeighbors
[],
const
int
limiteNeighbors
){
const
int
nbParticlesTargets
=
inTargets
->
getNbParticles
();
const
FReal
*
const
targetsPhysicalValues
=
inTargets
->
getPhysicalValues
();
const
FReal
*
const
targetsX
=
inTargets
->
getPositions
()[
0
];
const
FReal
*
const
targetsY
=
inTargets
->
getPositions
()[
1
];
const
FReal
*
const
targetsZ
=
inTargets
->
getPositions
()[
2
];
FReal
*
const
targetsForcesX
=
inTargets
->
getForcesX
();
FReal
*
const
targetsForcesY
=
inTargets
->
getForcesY
();
FReal
*
const
targetsForcesZ
=
inTargets
->
getForcesZ
();
FReal
*
const
targetsPotentials
=
inTargets
->
getPotentials
();
const
__m128
mOne
=
_mm_set1_ps
(
1.0
);
for
(
int
idxNeighbors
=
0
;
idxNeighbors
<
limiteNeighbors
;
++
idxNeighbors
){
if
(
inNeighbors
[
idxNeighbors
]
){
const
int
nbParticlesSources
=
(
inNeighbors
[
idxNeighbors
]
->
getNbParticles
()
+
3
)
/
4
;
const
__m128
*
const
sourcesPhysicalValues
=
(
const
__m128
*
)
inNeighbors
[
idxNeighbors
]
->
getPhysicalValues
();
const
__m128
*
const
sourcesX
=
(
const
__m128
*
)
inNeighbors
[
idxNeighbors
]
->
getPositions
()[
0
];
const
__m128
*
const
sourcesY
=
(
const
__m128
*
)
inNeighbors
[
idxNeighbors
]
->
getPositions
()[
1
];
const
__m128
*
const
sourcesZ
=
(
const
__m128
*
)
inNeighbors
[
idxNeighbors
]
->
getPositions
()[
2
];
__m128
*
const
sourcesForcesX
=
(
__m128
*
)
inNeighbors
[
idxNeighbors
]
->
getForcesX
();
__m128
*
const
sourcesForcesY
=
(
__m128
*
)
inNeighbors
[
idxNeighbors
]
->
getForcesY
();
__m128
*
const
sourcesForcesZ
=
(
__m128
*
)
inNeighbors
[
idxNeighbors
]
->
getForcesZ
();
__m128
*
const
sourcesPotentials
=
(
__m128
*
)
inNeighbors
[
idxNeighbors
]
->
getPotentials
();
for
(
int
idxTarget
=
0
;
idxTarget
<
nbParticlesTargets
;
++
idxTarget
){
const
__m128
tx
=
_mm_load1_ps
(
&
targetsX
[
idxTarget
]);
const
__m128
ty
=
_mm_load1_ps
(
&
targetsY
[
idxTarget
]);
const
__m128
tz
=
_mm_load1_ps
(
&
targetsZ
[
idxTarget
]);
const
__m128
tv
=
_mm_load1_ps
(
&
targetsPhysicalValues
[
idxTarget
]);
__m128
tfx
=
_mm_setzero_ps
();
__m128
tfy
=
_mm_setzero_ps
();
__m128
tfz
=
_mm_setzero_ps
();
__m128
tpo
=
_mm_setzero_ps
();
for
(
int
idxSource
=
0
;
idxSource
<
nbParticlesSources
;
++
idxSource
){
__m128
dx
=
sourcesX
[
idxSource
]
-
tx
;
__m128
dy
=
sourcesY
[
idxSource
]
-
ty
;
__m128
dz
=
sourcesZ
[
idxSource
]
-
tz
;
__m128
inv_square_distance
=
mOne
/
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
const
__m128
inv_distance
=
_mm_sqrt_ps
(
inv_square_distance
);
inv_square_distance
*=
inv_distance
;
inv_square_distance
*=
tv
*
sourcesPhysicalValues
[
idxSource
];
dx
*=
inv_square_distance
;
dy
*=
inv_square_distance
;
dz
*=
inv_square_distance
;
tfx
+=
dx
;
tfy
+=
dy
;
tfz
+=
dz
;
tpo
+=
inv_distance
*
sourcesPhysicalValues
[
idxSource
];
sourcesForcesX
[
idxSource
]
-=
dx
;
sourcesForcesY
[
idxSource
]
-=
dy
;
sourcesForcesZ
[
idxSource
]
-=
dz
;
sourcesPotentials
[
idxSource
]
+=
inv_distance
*
tv
;
}
__attribute__
((
aligned
(
16
)))
float
buffer
[
4
];
_mm_store_ps
(
buffer
,
tfx
);
targetsForcesX
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
];
_mm_store_ps
(
buffer
,
tfy
);
targetsForcesY
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
];
_mm_store_ps
(
buffer
,
tfz
);
targetsForcesZ
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
];
_mm_store_ps
(
buffer
,
tpo
);
targetsPotentials
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
];
}
}
}
{
const
int
nbParticlesSources
=
(
nbParticlesTargets
+
3
)
/
4
;
const
__m128
*
const
sourcesPhysicalValues
=
(
const
__m128
*
)
targetsPhysicalValues
;
const
__m128
*
const
sourcesX
=
(
const
__m128
*
)
targetsX
;
const
__m128
*
const
sourcesY
=
(
const
__m128
*
)
targetsY
;
const
__m128
*
const
sourcesZ
=
(
const
__m128
*
)
targetsZ
;
__m128
*
const
sourcesForcesX
=
(
__m128
*
)
targetsForcesX
;
__m128
*
const
sourcesForcesY
=
(
__m128
*
)
targetsForcesY
;
__m128
*
const
sourcesForcesZ
=
(
__m128
*
)
targetsForcesZ
;
__m128
*
const
sourcesPotentials
=
(
__m128
*
)
targetsPotentials
;
for
(
int
idxTarget
=
0
;
idxTarget
<
nbParticlesTargets
;
++
idxTarget
){
const
__m128
tx
=
_mm_load1_ps
(
&
targetsX
[
idxTarget
]);
const
__m128
ty
=
_mm_load1_ps
(
&
targetsY
[
idxTarget
]);
const
__m128
tz
=
_mm_load1_ps
(
&
targetsZ
[
idxTarget
]);
const
__m128
tv
=
_mm_load1_ps
(
&
targetsPhysicalValues
[
idxTarget
]);
__m128
tfx
=
_mm_setzero_ps
();
__m128
tfy
=
_mm_setzero_ps
();
__m128
tfz
=
_mm_setzero_ps
();
__m128
tpo
=
_mm_setzero_ps
();
for
(
int
idxSource
=
(
idxTarget
+
2
)
/
2
;
idxSource
<
nbParticlesSources
;
++
idxSource
){
__m128
dx
=
sourcesX
[
idxSource
]
-
tx
;
__m128
dy
=
sourcesY
[
idxSource
]
-
ty
;
__m128
dz
=
sourcesZ
[
idxSource
]
-
tz
;
__m128
inv_square_distance
=
mOne
/
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
const
__m128
inv_distance
=
_mm_sqrt_ps
(
inv_square_distance
);
inv_square_distance
*=
inv_distance
;
inv_square_distance
*=
tv
*
sourcesPhysicalValues
[
idxSource
];
dx
*=
inv_square_distance
;
dy
*=
inv_square_distance
;
dz
*=
inv_square_distance
;
tfx
+=
dx
;
tfy
+=
dy
;
tfz
+=
dz
;
tpo
+=
inv_distance
*
sourcesPhysicalValues
[
idxSource
];
sourcesForcesX
[
idxSource
]
-=
dx
;
sourcesForcesY
[
idxSource
]
-=
dy
;
sourcesForcesZ
[
idxSource
]
-=
dz
;
sourcesPotentials
[
idxSource
]
+=
inv_distance
*
tv
;
}
__attribute__
((
aligned
(
16
)))
float
buffer
[
4
];
_mm_store_ps
(
buffer
,
tfx
);
targetsForcesX
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
];
_mm_store_ps
(
buffer
,
tfy
);
targetsForcesY
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
];
_mm_store_ps
(
buffer
,
tfz
);
targetsForcesZ
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
];
_mm_store_ps
(
buffer
,
tpo
);
targetsPotentials
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
];
}
}
for
(
int
idxTarget
=
0
;
idxTarget
<
nbParticlesTargets
;
idxTarget
+=
4
){
for
(
int
idxClose
=
1
;
idxClose
<
4
;
++
idxClose
){
const
int
idxSource
=
idxTarget
+
idxClose
;
FReal
dx
=
targetsX
[
idxSource
]
-
targetsX
[
idxTarget
];
FReal
dy
=
targetsY
[
idxSource
]
-
targetsY
[
idxTarget
];
FReal
dz
=
targetsZ
[
idxSource
]
-
targetsZ
[
idxTarget
];
FReal
inv_square_distance
=
FReal
(
1.0
)
/
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
const
FReal
inv_distance
=
FMath
::
Sqrt
(
inv_square_distance
);
inv_square_distance
*=
inv_distance
;
inv_square_distance
*=
targetsPhysicalValues
[
idxTarget
]
*
targetsPhysicalValues
[
idxSource
];
dx
*=
inv_square_distance
;
dy
*=
inv_square_distance
;
dz
*=
inv_square_distance
;
targetsForcesX
[
idxTarget
]
+=
dx
;
targetsForcesY
[
idxTarget
]
+=
dy
;
targetsForcesZ
[
idxTarget
]
+=
dz
;
targetsPotentials
[
idxTarget
]
+=
inv_distance
*
targetsPhysicalValues
[
idxSource
];
targetsForcesX
[
idxSource
]
-=
dx
;
targetsForcesY
[
idxSource
]
-=
dy
;
targetsForcesZ
[
idxSource
]
-=
dz
;
targetsPotentials
[
idxSource
]
+=
inv_distance
*
targetsPhysicalValues
[
idxTarget
];
}
}
}
#endif
#endif
/**
...
...
@@ -327,6 +505,8 @@ public:
}
}
#else
#ifdef ScalFMM_USE_DOUBLE_PRECISION
template
<
class
ContainerClass
>
static
void
FullRemote
(
ContainerClass
*
const
FRestrict
inTargets
,
ContainerClass
*
const
inNeighbors
[],
const
int
limiteNeighbors
){
...
...
@@ -398,6 +578,80 @@ public:
}
}
}
#else
template
<
class
ContainerClass
>
static
void
FullRemote
(
ContainerClass
*
const
FRestrict
inTargets
,
ContainerClass
*
const
inNeighbors
[],
const
int
limiteNeighbors
){
const
int
nbParticlesTargets
=
inTargets
->
getNbParticles
();
const
FReal
*
const
targetsPhysicalValues
=
inTargets
->
getPhysicalValues
();
const
FReal
*
const
targetsX
=
inTargets
->
getPositions
()[
0
];
const
FReal
*
const
targetsY
=
inTargets
->
getPositions
()[
1
];
const
FReal
*
const
targetsZ
=
inTargets
->
getPositions
()[
2
];
FReal
*
const
targetsForcesX
=
inTargets
->
getForcesX
();
FReal
*
const
targetsForcesY
=
inTargets
->
getForcesY
();
FReal
*
const
targetsForcesZ
=
inTargets
->
getForcesZ
();
FReal
*
const
targetsPotentials
=
inTargets
->
getPotentials
();
const
__m128
mOne
=
_mm_set1_ps
(
1.0
);
for
(
int
idxNeighbors
=
0
;
idxNeighbors
<
limiteNeighbors
;
++
idxNeighbors
){
if
(
inNeighbors
[
idxNeighbors
]
){
const
int
nbParticlesSources
=
(
inNeighbors
[
idxNeighbors
]
->
getNbParticles
()
+
3
)
/
4
;
const
__m128
*
const
sourcesPhysicalValues
=
(
const
__m128
*
)
inNeighbors
[
idxNeighbors
]
->
getPhysicalValues
();
const
__m128
*
const
sourcesX
=
(
const
__m128
*
)
inNeighbors
[
idxNeighbors
]
->
getPositions
()[
0
];
const
__m128
*
const
sourcesY
=
(
const
__m128
*
)
inNeighbors
[
idxNeighbors
]
->
getPositions
()[
1
];
const
__m128
*
const
sourcesZ
=
(
const
__m128
*
)
inNeighbors
[
idxNeighbors
]
->
getPositions
()[
2
];
for
(
int
idxTarget
=
0
;
idxTarget
<
nbParticlesTargets
;
++
idxTarget
){
const
__m128
tx
=
_mm_load1_ps
(
&
targetsX
[
idxTarget
]);
const
__m128
ty
=
_mm_load1_ps
(
&
targetsY
[
idxTarget
]);
const
__m128
tz
=
_mm_load1_ps
(
&
targetsZ
[
idxTarget
]);
const
__m128
tv
=
_mm_load1_ps
(
&
targetsPhysicalValues
[
idxTarget
]);
__m128
tfx
=
_mm_setzero_ps
();
__m128
tfy
=
_mm_setzero_ps
();
__m128
tfz
=
_mm_setzero_ps
();
__m128
tpo
=
_mm_setzero_ps
();
for
(
int
idxSource
=
0
;
idxSource
<
nbParticlesSources
;
++
idxSource
){
__m128
dx
=
sourcesX
[
idxSource
]
-
tx
;
__m128
dy
=
sourcesY
[
idxSource
]
-
ty
;
__m128
dz
=
sourcesZ
[
idxSource
]
-
tz
;
__m128
inv_square_distance
=
mOne
/
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
);
const
__m128
inv_distance
=
_mm_sqrt_ps
(
inv_square_distance
);
inv_square_distance
*=
inv_distance
;
inv_square_distance
*=
tv
*
sourcesPhysicalValues
[
idxSource
];
dx
*=
inv_square_distance
;
dy
*=
inv_square_distance
;
dz
*=
inv_square_distance
;
tfx
+=
dx
;
tfy
+=
dy
;
tfz
+=
dz
;
tpo
+=
inv_distance
*
sourcesPhysicalValues
[
idxSource
];
}
__attribute__
((
aligned
(
16
)))
float
buffer
[
4
];
_mm_store_ps
(
buffer
,
tfx
);
targetsForcesX
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
];
_mm_store_ps
(
buffer
,
tfy
);
targetsForcesY
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
];
_mm_store_ps
(
buffer
,
tfz
);
targetsForcesZ
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
];
_mm_store_ps
(
buffer
,
tpo
);
targetsPotentials
[
idxTarget
]
+=
buffer
[
0
]
+
buffer
[
1
]
+
buffer
[
2
]
+
buffer
[
3
];
}
}
}
}
#endif
#endif
/** P2P mutual interaction,
...
...
Src/Utils/FSse.hpp
View file @
1a642434
...
...
@@ -41,6 +41,38 @@ inline __m128d operator/(const __m128d& v1, const __m128d& v2){
return
_mm_div_pd
(
v1
,
v2
);
}
inline
__m128
&
operator
+=
(
__m128
&
v1
,
const
__m128
&
v2
){
return
(
v1
=
_mm_add_ps
(
v1
,
v2
));
}
inline
__m128
&
operator
-=
(
__m128
&
v1
,
const
__m128
&
v2
){
return
(
v1
=
_mm_sub_ps
(
v1
,
v2
));
}
inline
__m128
&
operator
*=
(
__m128
&
v1
,
const
__m128
&
v2
){
return
(
v1
=
_mm_mul_ps
(
v1
,
v2
));
}
inline
__m128
&
operator
/=
(
__m128
&
v1
,
const
__m128
&
v2
){
return
(
v1
=
_mm_div_ps
(
v1
,
v2
));
}
inline
__m128
operator
+
(
const
__m128
&
v1
,
const
__m128
&
v2
){
return
_mm_add_ps
(
v1
,
v2
);
}
inline
__m128
operator
-
(
const
__m128
&
v1
,
const
__m128
&
v2
){
return
_mm_sub_ps
(
v1
,
v2
);
}
inline
__m128
operator
*
(
const
__m128
&
v1
,
const
__m128
&
v2
){
return
_mm_mul_ps
(
v1
,
v2
);
}
inline
__m128
operator
/
(
const
__m128
&
v1
,
const
__m128
&
v2
){
return
_mm_div_ps
(
v1
,
v2
);
}
#endif
#endif // FSSE_HPP
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment