Commit f50099fd authored by Emmanuel Thomé

back-port matops.cpp and matops.h from master to 2.2

parent 8e456fff
@@ -49,14 +49,14 @@
/* {{{ _cado_mm_set1_epi64 _m128i from 1 int64_t */
#define _cado_mm_set1_epi64(u) _mm_set1_epi64( _cado_mm_cvtsi64_m64((int64_t) (u)))
/* }}} */
/* {{{ _cado_mm_setr_epi64_c _m128i from 2 int64_t CONSTANTS (and try to get suffix right) */
#define _cado_mm_setr_epi64_c(lo, hi) \
_mm_setr_epi64( \
_cado_mm_cvtsi64_m64(INT64_C(lo)), \
_cado_mm_cvtsi64_m64(INT64_C(hi)) \
)
/* }}} */
/* {{{ _cado_mm_set1_epi64_c _m128i from 1 int64_t CONSTANT (and try to get suffix right) */
#define _cado_mm_set1_epi64_c(u) _mm_set1_epi64( _cado_mm_cvtsi64_m64(INT64_C(u)))
/* }}} */
/* {{{ same for 32-bits (which, for some, have SSE-2) */
@@ -179,7 +179,8 @@ void mul_6464_6464_sse(mat64_ptr C, mat64_srcptr A, mat64_srcptr B)
__m128i one = _cado_mm_set1_epi64_c(1);
for (i = 0; i < 64; i++) {
__m128i bw = _cado_mm_set1_epi64(B[i]);
c = _mm_xor_si128(c,_mm_and_si128(bw,_mm_sub_epi64(_mm_setzero_si128(),_mm_and_si128(a, one))));
// c ^= (bw & -(a & one));
c = _mm_xor_si128(c, _mm_and_si128(bw, _mm_sub_epi64(_mm_setzero_si128(), _mm_and_si128(a, one))));
a = _mm_srli_epi64(a, 1);
}
*Cw++ = c;
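/* A minimal scalar sketch of the branchless trick used above
 * (hypothetical helper, not part of the patch; assumes <stdint.h>):
 * -(a & 1) is all-ones exactly when the low bit of a is set, so row
 * B[i] is conditionally XORed into the accumulator without a branch. */
static uint64_t mulvec_64_sketch(uint64_t a, const uint64_t *B)
{
    uint64_t c = 0;
    for (int i = 0; i < 64; i++) {
        c ^= B[i] & -(a & UINT64_C(1)); /* add row i iff bit i of a is set */
        a >>= 1;
    }
    return c;
}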
@@ -346,6 +347,7 @@ void mul_o64_T6464_C_parity3(uint64_t * w, uint64_t a, mat64_srcptr b)
void transp_6464(mat64_ptr dst, mat64_srcptr src)
{
int i, j;
ASSERT_ALWAYS(dst != src);
for (i = 0; i < 64; i++) {
dst[i] = 0;
for (j = 0; j < 64; j++) {
@@ -399,6 +401,8 @@ void mul_N64_6464_lookup4(uint64_t *C,
Bx[j][9] = w; w ^= bb[0];
Bx[j][8] = w;
}
/* We don't zero out C before the computation, but rather at the
* moment we read A[i], so that A==C is supported */
for (size_t i = 0; i < m; i++) {
uint64_t aa = A[i];
C[i] = Bx[0][aa & 15]; aa>>=4;
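/* A minimal sketch of the lookup4 idea (hypothetical helper; direct
 * table build instead of the incremental fill above; assumes
 * <stdint.h> and <stddef.h>): split each 64-bit row of A into sixteen
 * 4-bit nibbles, and precompute, for each nibble position j, the 16
 * possible XORs of rows B[4*j..4*j+3]. One row product then costs 16
 * table lookups instead of 64 conditional XORs. As in the code above,
 * C[i] is assigned rather than accumulated, so A == C is supported. */
static void mul_row_lookup4_sketch(uint64_t *C, const uint64_t *A,
                                   const uint64_t *B, size_t m)
{
    uint64_t Bx[16][16];
    for (int j = 0; j < 16; j++) {
        const uint64_t *bb = B + 4 * j;
        for (int k = 0; k < 16; k++) {
            uint64_t w = 0;
            for (int b = 0; b < 4; b++)
                if (k & (1 << b))
                    w ^= bb[b];
            Bx[j][k] = w;
        }
    }
    for (size_t i = 0; i < m; i++) {
        uint64_t aa = A[i], c = 0;
        for (int j = 0; j < 16; j++) {
            c ^= Bx[j][aa & 15]; /* nibble j selects a precomputed XOR */
            aa >>= 4;
        }
        C[i] = c; /* zeroing happens here, at the moment A[i] is read */
    }
}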
@@ -445,7 +449,6 @@ static inline void MAYBE_UNUSED addmul_N64_6464_lookup4(uint64_t *C,
Bx[j][9] = w; w ^= bb[0];
Bx[j][8] = w;
}
memset(C, 0, m * sizeof(uint64_t));
for (size_t i = 0; i < m; i++) {
uint64_t aa = A[i];
C[i]^= Bx[0][aa & 15]; aa>>=4;
@@ -789,7 +792,7 @@ void mul_N64_T6464_transB(uint64_t *C,
}
#if defined(HAVE_SSE2) && ULONG_BITS == 64
/* implements mul_N64_6464 */
/* implements addmul_N64_6464 */
void addmul_N64_6464_sse(uint64_t *C,
const uint64_t *A,
const uint64_t *B, size_t m)
@@ -798,7 +801,8 @@ void addmul_N64_6464_sse(uint64_t *C,
__m128i *Cw = (__m128i *) C;
__m128i *Aw = (__m128i *) A;
/* If m is odd, then we can't do sse because of data width */
/* If m is odd, then we can't do sse all the way through because of
* data width */
for (j = 0; j < m - 1; j += 2) {
__m128i c = _mm_setzero_si128();
__m128i a = *Aw++;
@@ -807,7 +811,7 @@ void addmul_N64_6464_sse(uint64_t *C,
for (int i = 0; i < 64; i++) {
__m128i bw = _cado_mm_set1_epi64(B[i]);
// c ^= (bw & -(a & one));
c = _mm_xor_si128(c,_mm_and_si128(bw,_mm_sub_epi64(_mm_setzero_si128(),_mm_and_si128(a, one))));
c = _mm_xor_si128(c, _mm_and_si128(bw, _mm_sub_epi64(_mm_setzero_si128(), _mm_and_si128(a, one))));
a = _mm_srli_epi64(a, 1);
}
*Cw = _mm_xor_si128(*Cw, c);
@@ -825,12 +829,43 @@ void addmul_N64_6464_sse(uint64_t *C,
*C++ ^= c;
}
}
/* This can work in place (C == A), so it is not simply memset0 +
 * addmul (the ^= assignments have been changed to =).
 */
void mul_N64_6464_sse(uint64_t *C,
const uint64_t *A,
const uint64_t *B, size_t m)
{
memset(C, 0, m * sizeof(uint64_t));
addmul_N64_6464_sse(C,A,B,m);
size_t j;
__m128i *Cw = (__m128i *) C;
__m128i *Aw = (__m128i *) A;
/* If m is odd, then we can't do sse all the way through because of
* data width */
for (j = 0; j < m - 1; j += 2) {
__m128i c = _mm_setzero_si128();
__m128i a = *Aw++;
__m128i one = _cado_mm_set1_epi64_c(1);
for (int i = 0; i < 64; i++) {
__m128i bw = _cado_mm_set1_epi64(B[i]);
// c ^= (bw & -(a & one));
c = _mm_xor_si128(c, _mm_and_si128(bw, _mm_sub_epi64(_mm_setzero_si128(), _mm_and_si128(a, one))));
a = _mm_srli_epi64(a, 1);
}
*Cw++ = c;
}
C += j;
A += j;
for (; j < m; j++) {
uint64_t c = UINT64_C(0);
uint64_t a = *A++;
for (int i = 0; i < 64; i++) {
c ^= (B[i] & -(a & UINT64_C(1)));
a >>= UINT64_C(1);
}
*C++ = c;
}
}
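/* Hypothetical usage sketch: because the loop assigns (*Cw++ = c and
 * *C++ = c) rather than accumulating, the destination may alias the
 * source, so a product can overwrite its left operand in place. */
static void mul_inplace_example(uint64_t *A, const uint64_t *B, size_t m)
{
    mul_N64_6464_sse(A, A, B, m); /* C == A is fine here */
}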
#endif
@@ -1791,6 +1826,9 @@ void pmattab_complete(int * phi, uint64_t * bits, int nbits)
}
}
/* Given phi as in PLUQ64_inner below, return two permutation matrices
* such that p*u*transpose(q) is a diagonal matrix.
*/
void pqperms_from_phi(pmat_ptr p, pmat_ptr q, int * phi, int m, int n)
{
int ip = 0;
ASSERT_ALWAYS((m % 64) == 0);
ASSERT_ALWAYS(p->n == m);
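/* A minimal sketch of what pqperms_from_phi computes (hypothetical
 * helper using plain int arrays instead of pmat_ptr; assumes
 * <string.h>): pair pivot row i with pivot column phi[i] in the same
 * output slot, then append the non-pivot rows and columns, so that
 * permuting rows by p and columns by q concentrates the pivots of u
 * on the diagonal. */
static void perms_from_phi_sketch(int *p, int *q, const int *phi, int m, int n)
{
    char col_is_pivot[n]; /* C99 VLA, cleared below */
    memset(col_is_pivot, 0, n);
    int k = 0;
    for (int i = 0; i < m; i++) {
        if (phi[i] < 0) continue;
        p[k] = i;       /* slot k takes pivot row i ... */
        q[k] = phi[i];  /* ... and pivot column phi[i] */
        col_is_pivot[phi[i]] = 1;
        k++;
    }
    int kp = k, kq = k;
    for (int i = 0; i < m; i++)
        if (phi[i] < 0) p[kp++] = i; /* leftover rows, in order */
    for (int j = 0; j < n; j++)
        if (!col_is_pivot[j]) q[kq++] = j; /* leftover columns, in order */
}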
@@ -1818,8 +1856,19 @@ void mat64_copy(mat64_ptr b, mat64_srcptr a)
{
memcpy(b,a,sizeof(mat64));
}
void mat64_set_zero(mat64_ptr m)
{
memset(m,0,sizeof(mat64));
}
/* {{{ PLUQ stuff -- well we're not computing exactly PLUQ */
/* {{{ PLUQ stuff */
/* Compute matrices l and u such that l*a = u. phi[] is filled with the
 * column indices (shifted by col_offset) of the pivots in u: entry
 * (i,phi[i]-col_offset) of u is one. When row i of u carries no
 * important non-zero coefficient, then phi[i] < 0.
 * In column phi[i]-col_offset of u, entries of row index >i are zero.
 */
int PLUQ64_inner(int * phi, mat64 l, mat64 u, mat64 a, int col_offset)
{
const int m = 64;
@@ -1886,12 +1935,25 @@ int PLUQ64_inner(int * phi, mat64 l, mat64 u, mat64 a, int col_offset)
return rank;
}
/* PLUQ -- well, we're not computing exactly PLUQ.
 * PLUQ says: any m*n matrix A with rank r can be written A = P*L*U*Q,
 * where P and Q are two permutation matrices, of dimension respectively
 * m*m and n*n, L is m*r unit lower triangular and U is r*n upper
 * triangular.
 *
 * Here we compute p,l,u,q such that p*l*a*transpose(q) is an upper
 * triangular matrix, whose diagonal has r ones and n-r zeros.
 */
/* outer routine */
int PLUQ64(pmat_ptr p, mat64 * l, mat64 * u, pmat_ptr q, mat64 * m)
{
int phi[64];
for(int i = 0 ; i < 64 ; i++) phi[i]=-1;
int r = PLUQ64_inner(phi,l[0],u[0],m[0],0);
/* l*m = u */
/* p*u*transpose(q) = diagonal.
* p*l*m*transpose(q) = diagonal.
*/
pqperms_from_phi(p,q,phi,64,64);
return r;
}
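/* A hedged sanity-check sketch for the stated post-condition of
 * PLUQ64: p*l*a*transpose(q) should be upper triangular. Both helpers
 * below are hypothetical; the sketch assumes mat64 is an array of 64
 * uint64_t (one matrix row per word) and that pmat_get(p, k) yields
 * the source index for destination slot k -- the actual conventions
 * in matops.h may differ. */
static void apply_perms_sketch(mat64_ptr dst, mat64_srcptr src,
                               pmat_srcptr p, pmat_srcptr q)
{
    for (int k = 0; k < 64; k++) {
        uint64_t row = src[pmat_get(p, k)]; /* permute rows by p */
        uint64_t out = 0;
        for (int j = 0; j < 64; j++)        /* permute columns by q */
            out |= ((row >> pmat_get(q, j)) & UINT64_C(1)) << j;
        dst[k] = out;
    }
}

static int check_pluq64_sketch(pmat_srcptr p, mat64 * l, mat64 * m, pmat_srcptr q)
{
    mat64 t, s;
    mul_6464_6464(t, l[0], m[0]);   /* l*a = u */
    apply_perms_sketch(s, t, p, q); /* p * (l*a) * transpose(q) */
    return mat64_is_uppertriangular(s);
}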
@@ -1939,7 +2001,7 @@ int PLUQ64_n(int * phi, mat64 l, mat64 * u, mat64 * a, int n)
}
#endif
for( ; b < nb ; b++)
mul_6464_6464(u[b], l, u[b]);
mul_6464_6464(u[b], l, a[b]);
return nspins*m+b;
}
@@ -1991,6 +2053,13 @@ static inline void bli_64x64N_clobber(mat64 h, mat64 * us, int * phi, int nb)
#endif
}
/* Given a 64x128 matrix u that is upper triangular up to some
* permutation, written as a sequence of two 64x64 matrices,
* and given a table phi such that either phi[i]<0, or entry (i,phi[i])
* of u is non-zero, and the nonnegative values taken by phi are all
* distinct, compute a matrix H such that H*U has exactly one non-zero
* entry in each column whose index is a value taken by phi.
*/
void bli_64x128(mat64 h, mat64 * us, int * phi)
{
mat64 uc[2];
@@ -2061,6 +2130,11 @@ void extract_cols_64_from_128(mat64 t, mat64 * m, int * phi)
}
*/
/* This code is here because someday, I vaguely had the idea of using it
* as a building block for the binary lingen. In fact, the code fragments
* here for PLUQ and such have never been put in production, so I'm
* pretty sure they're quite fragile.
*/
int PLUQ128(pmat_ptr p, mat64 * l, mat64 * u, pmat_ptr q, mat64 * m)
{
/* This is really an outer routine. An inner routine will not have p
@@ -2071,31 +2145,63 @@ int PLUQ128(pmat_ptr p, mat64 * l, mat64 * u, pmat_ptr q, mat64 * m)
int phi[128];
for(int i = 0 ; i < 128 ; i++) phi[i]=-1;
mat64_set_zero(l[0]);
mat64_set_zero(l[1]);
mat64_set_zero(l[2]);
mat64_set_identity(l[3]);
int r1 = PLUQ64_n(phi,l[0],u,m,128);
r1 = r1 % 64;
// andouille 7.65
/* l[0] * m = u */
mat64 h;
bli_64x128(h, u, phi);
/* h * u is "sort of" identity, at least up to permutation */
mat64 l21;
mat64_ptr S = l21;
/* This is __very__ expensive w.r.t. what it really does :-(( */
extract_cols_64_from_128(l21, m+2, phi);
extract_cols_64_from_128(S, m+2, phi);
/* Column i of S is column phi[i] in Mlow. Now by bli_64x128, in
* column phi[i] of h*u, only the coefficient of row i is equal to 1,
* so that column phi[i] of S*H*U is equal to column phi[i] of Mlow
*/
// andouille 16.7 -- 17.5
mul_6464_6464(l21, l21, h);
mul_6464_6464(l21, S, h);
mul_6464_6464(l[2], l21, l[0]);
// The matrix below has many zero columns (as many as the rank of
// Mhigh).
// Mlow+S*H*l[0]*Mhigh;
/* The matrix   [ l[0]  0 ]  =  L
 *              [ l[2]  1 ]
 * is equal to  [  1    0 ]    [ l[0]  0 ]
 *              [ l21   1 ]  * [  0    1 ]
 *
 * Now based on l[0] * mhigh, compute t2 = (L*M)_low
 */
mat64 t2[2];
mul_6464_6464(t2[0], l21, u[0]); add_6464_6464(t2[0], m[2], t2[0]);
mul_6464_6464(t2[1], l21, u[1]); add_6464_6464(t2[1], m[3], t2[1]);
/* And do pluq again on this low part. Most of it is zero. */
int r2 = PLUQ64_n(phi + 64,l[3],u+2,t2,128);
r2 = r2 % 64;
/* need to adjust l[3] */
mul_6464_6464(l[2], l[3], l[2]);
pqperms_from_phi(p,q,phi,128,128);
/* At this point P*L*M*Transpose(Q) should be upper triangular with
 * unit diagonal */
return r1 + r2;
}
@@ -32,10 +32,12 @@ extern void pqperms_from_phi(pmat_ptr p, pmat_ptr q, int * phi, int m, int n);
static inline int pmat_get(pmat_srcptr x, int k) { return x->v[k]; }
static inline void pmat_set(pmat_srcptr x, int k, int w) { x->v[k]=w; }
extern void mat64_set_identity(mat64_ptr m);
extern int mat64_is_uppertriangular(mat64_srcptr u);
extern int mat64_is_lowertriangular(mat64_srcptr u);
extern int mat64_triangular_is_unit(mat64_srcptr u);
extern int mat64_eq(mat64_srcptr a, mat64_srcptr b);
extern void mat64_copy(mat64_ptr a, mat64_srcptr b);
/* These are the only important entry points. These functions
* are meant to be accessors for tuned implementations (ok, for now