diff --git a/Makefile.am b/Makefile.am index b6af5fd6ab8d155f8f28018c4ce7eb2b40f0a08e..17e2551a25e485e4e8fe0cfd81118c1553fd59a8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -53,7 +53,7 @@ libecm_la_CFLAGS = $(OPENMP_CFLAGS) -g libecm_la_LDFLAGS = $(LIBECM_LDFLAGS) -version-info 1:0:0 -g libecm_la_LIBADD = $(MULREDCLIBRARY) if WANT_GPU - libecm_la_SOURCES += cudakernel.cu + libecm_la_SOURCES += cudakernel.cu cudacommon.cu if WANT_CGBN libecm_la_SOURCES += cgbn_stage1.cu endif @@ -111,7 +111,8 @@ endif include_HEADERS = ecm.h noinst_HEADERS = basicdefs.h ecm-impl.h ecm-gmp.h ecm-ecm.h sp.h longlong.h \ - ecm-params.h mpmod.h ecm-gpu.h cudakernel.h cgbn_stage1.h \ + ecm-params.h mpmod.h ecm-gpu.h \ + cudakernel.h cudacommon.h cgbn_stage1.h \ addlaws.h getprime_r.h ecm_int.h \ aprtcle/mpz_aprcl.h aprtcle/jacobi_sum.h diff --git a/cgbn_stage1.cu b/cgbn_stage1.cu index 6e3cc9d6495bdb4b024f9802cd9e409ff87f6661..57dffb4ba7a28b3294846fd8690c4fde0906d32e 100644 --- a/cgbn_stage1.cu +++ b/cgbn_stage1.cu @@ -21,6 +21,10 @@ http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., #ifndef _CGBN_STAGE1_CU #define _CGBN_STAGE1_CU 1 +#ifndef __CUDACC__ +#error "This file should only be compiled with nvcc" +#endif + #include "cgbn_stage1.h" #include <cassert> @@ -33,6 +37,8 @@ http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., #include <cgbn.h> #include <cuda.h> +#include "cudacommon.h" + #include "ecm.h" #include "ecm-gpu.h" @@ -57,16 +63,7 @@ http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., #define FORCE_INLINE #endif -void cuda_check(cudaError_t status, const char *action=NULL, const char *file=NULL, int32_t line=0) { - // check for cuda errors - if (status!=cudaSuccess) { - fprintf (stderr, "CUDA error (%d) occurred: %s\n", status, cudaGetErrorString(status)); - if (action!=NULL) - fprintf (stderr, "While running %s (file %s, line %d)\n", action, file, line); - exit(1); - } -} - +// support routine copied from "CGBN/samples/utility/support.h" void cgbn_check(cgbn_error_report_t *report, const char *file=NULL, int32_t line=0) { // check for cgbn errors @@ -92,8 +89,6 @@ void cgbn_check(cgbn_error_report_t *report, const char *file=NULL, int32_t line } } -// Unify this with cudakernel.cu -#define CUDA_CHECK(action) cuda_check(action, #action, __FILE__, __LINE__) #define CGBN_CHECK(report) cgbn_check(report, __FILE__, __LINE__) static @@ -107,7 +102,7 @@ void from_mpz(const mpz_t s, uint32_t *x, uint32_t count) { if(mpz_sizeinbase (s, 2) > count * 32) { fprintf (stderr, "from_mpz failed -- result does not fit\n"); - exit(1); + exit(EXIT_FAILURE); } mpz_export (x, &words, -1, sizeof(uint32_t), 0, 0, s); @@ -473,9 +468,9 @@ int findfactor(mpz_t factor, const mpz_t N, const mpz_t x_final, const mpz_t y_f mpz_t temp; mpz_init(temp); - // Check if factor found - + /* Check if factor found */ bool inverted = mpz_invert(temp, y_final, N); // aY ^ (N-2) % N + if (inverted) { mpz_mul(temp, x_final, temp); // aX * aY^-1 mpz_mod(factor, temp, N); // "Residual" @@ -493,7 +488,7 @@ static int verify_size_of_n(const mpz_t N, size_t max_bits) { size_t n_log2 = mpz_sizeinbase(N, 2); - // using check_gpuecm.sage it looks like 4 bits would suffice + /* Using check_gpuecm.sage it looks like 4 bits would suffice. */ size_t max_usable_bits = max_bits - CARRY_BITS; if (n_log2 <= max_usable_bits) @@ -595,8 +590,9 @@ int process_results(mpz_t *factors, int *array_found, } int cgbn_ecm_stage1(mpz_t *factors, int *array_found, + const mpz_t N, const mpz_t s, uint32_t curves, uint32_t sigma, - const mpz_t N, const mpz_t s, float *gputime) + float *gputime, int verbose) { assert( sigma > 0 ); assert( ((uint64_t) sigma + curves) <= 0xFFFFFFFF ); // no overflow @@ -685,6 +681,10 @@ int cgbn_ecm_stage1(mpz_t *factors, int *array_found, TPI = (BITS <= 512) ? 4 : (BITS <= 2048) ? 8 : (BITS <= 8192) ? 16 : 32; IPB = TPB / TPI; BLOCK_COUNT = (curves + IPB - 1) / IPB; + + /* Print some debug info about kernel. */ + kernel_info((const void*)kernel_double_add<cgbn_params_512>, verbose); + break; } } @@ -828,4 +828,10 @@ int cgbn_ecm_stage1(mpz_t *factors, int *array_found, return youpi; } +#ifdef __CUDA_ARCH__ + #if __CUDA_ARCH__ < 300 + #error "Unsupported architecture" + #endif +#endif + #endif /* _CGBN_STAGE1_CU */ diff --git a/cgbn_stage1.h b/cgbn_stage1.h index e26a1e030076ed017653da536b0a5154b441468d..954beb0a40700f8dbf982fb68938e4c6ee500e3c 100644 --- a/cgbn_stage1.h +++ b/cgbn_stage1.h @@ -30,8 +30,9 @@ extern "C" { #endif int cgbn_ecm_stage1(mpz_t *factors, int *array_found, + const mpz_t N, const mpz_t s, uint32_t curves, uint32_t sigma, - const mpz_t N, const mpz_t s, float *gputime); + float *gputime, int verbose); #ifdef __cplusplus } diff --git a/cudacommon.cu b/cudacommon.cu new file mode 100644 index 0000000000000000000000000000000000000000..c35e3bfe50cb55392c04c2b7b48b796d5bb77219 --- /dev/null +++ b/cudacommon.cu @@ -0,0 +1,148 @@ +/* When compiling the CUDA code, we do not want to include all ecm-impl.h*/ +#define _DO_NOT_INCLUDE_ECM_IMPL_H + +#include "cudacommon.h" +#include "ecm-gpu.h" + +#include <stdio.h> + + +#ifndef __CUDACC__ +#error "This file should only be compiled with nvcc" +#endif + +/* First call to a global function initialize the device */ +__global__ void Cuda_Init_Device () +{ +} + +/* Given the compute compatibility (as major.minor), return the number of block + * to be run on one multiprocessor. */ +static unsigned int +getNumberOfBlockPerMultiProcessor (int major, int minor) +{ + /* For 2.0 and 2.1, limited by the maximum number of threads per MP and the + * number of available registrer (need 23 registers per threads). + */ + if (major == 2) + return 1; + /* For 3.0, 3.2, 3.5 and 3.7 limited by the maximum number of threads per MP. + */ + else if (major == 3) + return 2; + /* For 5.0, 5.2, and 5.3 limited by the maximum number of threads per MP. */ + else if (major == 5) + return 2; + /* We assume that for newer compute capability the properties of the GPU won't + * decrease. + */ + else + return 2; +} + +extern "C" +int +get_device_prop(int device, cudaDeviceProp *deviceProp) +{ + cudaError_t err; + + if (device!=-1) + { + err = cudaSetDevice(device); + if (err != cudaSuccess) + { + fprintf (stderr, "GPU: Error: Could not use device %d\n", device); + fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err)); + return 0; + } + } + + err = cudaGetDevice (&device); + if (err != cudaSuccess) + { + fprintf (stderr, "GPU: Error: no active device.\n"); + fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err)); + return 0; + } + + err = cudaGetDeviceProperties (deviceProp, device); + if (err != cudaSuccess) + { + fprintf (stderr, "GPU: Error while getting device's properties.\n"); + fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err)); + return 0; + } + return 1; +} + +extern "C" +int +select_and_init_GPU (int device, unsigned int *number_of_curves, int verbose, int schedule) +{ + cudaDeviceProp deviceProp; + + if (device!=-1 && verbose) + fprintf (stdout, "GPU: device %d is required.\n", device); + + if (!get_device_prop(device, &deviceProp)) + return -1; + + if (verbose) + { + printf ("GPU: will use device %d: %s, compute capability %d.%d, %d MPs.\n" + "GPU: maxSharedPerBlock = %zu maxThreadsPerBlock = %d " + "maxRegsPerBlock = %d\n", device, deviceProp.name, + deviceProp.major, deviceProp.minor, + deviceProp.multiProcessorCount, deviceProp.sharedMemPerBlock, + deviceProp.maxThreadsPerBlock, deviceProp.regsPerBlock); + } + + + if (*number_of_curves == 0) /* if choose the number of curves */ + { + unsigned int n, m = ECM_GPU_CURVES_BY_BLOCK; + n = getNumberOfBlockPerMultiProcessor (deviceProp.major, deviceProp.minor); + *number_of_curves = n * deviceProp.multiProcessorCount * m; + } + else if (*number_of_curves % ECM_GPU_CURVES_BY_BLOCK != 0) + { + /* number_of_curves should be a multiple of ECM_GPU_CURVES_BY_BLOCK */ + *number_of_curves = (*number_of_curves / ECM_GPU_CURVES_BY_BLOCK + 1) * + ECM_GPU_CURVES_BY_BLOCK; + if (verbose) + fprintf(stderr, "GPU: the requested number of curves has been " + "modified to %u\n", *number_of_curves); + } + + /* First call to a global function initialize the device */ + if (schedule == 1) + { + cuda_check (cudaSetDeviceFlags (cudaDeviceScheduleBlockingSync)); + } + else + { + cuda_check (cudaSetDeviceFlags (cudaDeviceScheduleYield)); + } + Cuda_Init_Device<<<1, 1>>> (); + cuda_check (cudaGetLastError()); + + return 0; +} + +void +kernel_info(const void* func, int verbose) +{ + if (verbose) + { + struct cudaFuncAttributes kernelAttr; + cudaError_t err = cudaFuncGetAttributes (&kernelAttr, func); + if (err == cudaSuccess) + printf ("GPU: Using device code targeted for architecture compile_%d\n" + "GPU: Ptx version is %d\nGPU: maxThreadsPerBlock = %d\n" + "GPU: numRegsPerThread = %d sharedMemPerBlock = %zu bytes\n", + kernelAttr.binaryVersion, kernelAttr.ptxVersion, + kernelAttr.maxThreadsPerBlock, kernelAttr.numRegs, + kernelAttr.sharedSizeBytes); + } +} + diff --git a/cudacommon.h b/cudacommon.h new file mode 100644 index 0000000000000000000000000000000000000000..b668c6f1deca3d619ab7dabe551c8498b68e5f7c --- /dev/null +++ b/cudacommon.h @@ -0,0 +1,38 @@ +#ifndef _CUDACOMMON_H +#define _CUDACOMMON_H 1 + +#include <cuda.h> +#include <cuda_runtime_api.h> +#include <stdio.h> + +#ifdef __cplusplus +/* cpp + CUDA only code */ + +#define CUDA_CHECK(action) cuda_check(action, #action, __FILE__, __LINE__) + +inline void cuda_check(cudaError_t status, const char *action=NULL, const char *file=NULL, int32_t line=0) { + if (status != cudaSuccess) { + fprintf (stderr, "CUDA error (%d) occurred: %s\n", status, cudaGetErrorString(status)); + if (action!=NULL) + fprintf (stderr, "While running %s (file %s, line %d)\n", action, file, line); + exit(EXIT_FAILURE); + } +} + + +void kernel_info(const void* func, int verbose); +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +int get_device_prop(int device, struct cudaDeviceProp *deviceProp); +int select_and_init_GPU (int, unsigned int*, int, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _CUDACOMMON_H */ diff --git a/cudakernel.cu b/cudakernel.cu index c4f49f1aa717276ec5d241b7181014acdfce7301..ae121c7dc4292c7d39005333a026d6ecb0758dc2 100644 --- a/cudakernel.cu +++ b/cudakernel.cu @@ -3,6 +3,7 @@ #include "ecm-gpu.h" #include <gmp.h> #include "cudakernel.h" +#include "cudacommon.h" #ifndef __CUDACC__ #error "This file should only be compiled with nvcc" @@ -14,152 +15,14 @@ __device__ biguint_t d_3Ncst; __device__ biguint_t d_Mcst; -#define errCheck(err) cuda_errCheck (err, __FILE__, __LINE__) -#define cudaMalloc(d, size) errCheck (cudaMalloc (d, size)) -#define cudaMemcpyHtoD(d, h, size) errCheck (cudaMemcpy ((void *) d, \ +#define cudaMalloc(d, size) cuda_check (cudaMalloc (d, size)) +#define cudaMemcpyHtoD(d, h, size) cuda_check (cudaMemcpy ((void *) d, \ (void *) h, size, cudaMemcpyHostToDevice)) -#define cudaMemcpyDtoH(h, d, size) errCheck (cudaMemcpy ((void *) h, \ +#define cudaMemcpyDtoH(h, d, size) cuda_check (cudaMemcpy ((void *) h, \ (void *) d, size, cudaMemcpyDeviceToHost)) -#define cudaMemcpyCst(d, h, size) errCheck (cudaMemcpyToSymbol (d, h, size)) +#define cudaMemcpyCst(d, h, size) cuda_check (cudaMemcpyToSymbol (d, h, size)) -/******************************/ -/* Host code handling the GPU */ -/******************************/ - -inline void cuda_errCheck (cudaError err, const char *file, const int line) -{ - if( err != cudaSuccess ) - { - fprintf(stderr, "%s(%i) : Error cuda : %s.\n", - file, line, cudaGetErrorString( err) ); - exit(EXIT_FAILURE); - } -} - -/* First call to a global function initialize the device */ -__global__ void Cuda_Init_Device () -{ -} - -/* Given the compute compatibility (as major.minor), return the number of block - * to be run on one multiprocessor. */ -unsigned int -getNumberOfBlockPerMultiProcessor (int major, int minor) -{ - /* For 2.0 and 2.1, limited by the maximum number of threads per MP and the - * number of available registrer (need 23 registers per threads). - */ - if (major == 2) - return 1; - /* For 3.0, 3.2, 3.5 and 3.7 limited by the maximum number of threads per MP. - */ - else if (major == 3) - return 2; - /* For 5.0, 5.2, and 5.3 limited by the maximum number of threads per MP. */ - else if (major == 5) - return 2; - /* We assume that for newer compute capability the properties of the GPU won't - * decrease. - */ - else - return 2; -} - -extern "C" -int -select_and_init_GPU (int device, unsigned int *number_of_curves, int verbose, int schedule) -{ - cudaDeviceProp deviceProp; - cudaError_t err; - - if (device!=-1) - { - if (verbose) - fprintf (stdout, "GPU: device %d is required.\n", device); - - err = cudaSetDevice(device); - if (err != cudaSuccess) - { - fprintf (stderr, "GPU: Error: Could not use device %d\n", device); - fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err)); - return -1; - } - } - - err = cudaGetDevice (&device); - if (err != cudaSuccess) - { - fprintf (stderr, "GPU: Error: no active device.\n"); - fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err)); - return -1; - } - - err = cudaGetDeviceProperties (&deviceProp, device); - if (err != cudaSuccess) - { - fprintf (stderr, "GPU: Error while getting device's properties.\n"); - fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err)); - return -1; - } - - if (verbose) - { - printf ("GPU: will use device %d: %s, compute capability %d.%d, %d MPs.\n" - "GPU: maxSharedPerBlock = %zu maxThreadsPerBlock = %d " - "maxRegsPerBlock = %d\n", device, deviceProp.name, - deviceProp.major, deviceProp.minor, - deviceProp.multiProcessorCount, deviceProp.sharedMemPerBlock, - deviceProp.maxThreadsPerBlock, deviceProp.regsPerBlock); - } - - - if (*number_of_curves == 0) /* if choose the number of curves */ - { - unsigned int n, m = ECM_GPU_CURVES_BY_BLOCK; - n = getNumberOfBlockPerMultiProcessor (deviceProp.major, deviceProp.minor); - *number_of_curves = n * deviceProp.multiProcessorCount * m; - } - else if (*number_of_curves % ECM_GPU_CURVES_BY_BLOCK != 0) - { - /* number_of_curves should be a multiple of ECM_GPU_CURVES_BY_BLOCK */ - *number_of_curves = (*number_of_curves / ECM_GPU_CURVES_BY_BLOCK + 1) * - ECM_GPU_CURVES_BY_BLOCK; - if (verbose) - fprintf(stderr, "GPU: the requested number of curves has been " - "modified to %u\n", *number_of_curves); - } - - /* First call to a global function initialize the device */ - if (schedule == 1) - { - errCheck (cudaSetDeviceFlags (cudaDeviceScheduleBlockingSync)); - } - else - { - errCheck (cudaSetDeviceFlags (cudaDeviceScheduleYield)); - } - Cuda_Init_Device<<<1, 1>>> (); - errCheck (cudaGetLastError()); - - if (verbose) - { - struct cudaFuncAttributes kernelAttr; - err = cudaFuncGetAttributes (&kernelAttr, Cuda_Ell_DblAdd); - if (err == cudaSuccess) - { - printf ("GPU: Using device code targeted for architecture compile_%d\n" - "GPU: Ptx version is %d\nGPU: maxThreadsPerBlock = %d\n" - "GPU: numRegsPerThread = %d sharedMemPerBlock = %zu bytes\n", - kernelAttr.binaryVersion, kernelAttr.ptxVersion, - kernelAttr.maxThreadsPerBlock, kernelAttr.numRegs, - kernelAttr.sharedSizeBytes); - } - } - - return 0; -} - extern "C" float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN, biguint_t *h_xarray, biguint_t *h_zarray, @@ -167,6 +30,10 @@ float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN, unsigned int firstinvd, unsigned int number_of_curves, int verbose) { + /* Print some debug info about the kernel */ + kernel_info((const void*) Cuda_Ell_DblAdd, verbose); + + cudaEvent_t start, stop; cudaEventCreate (&start); cudaEventCreate (&stop); @@ -197,7 +64,7 @@ float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN, /* Create a pair of events to pace ourselves */ for (i=0; i<MAXEVENTS; i++) - errCheck (cudaEventCreateWithFlags (&event[i], + cuda_check (cudaEventCreateWithFlags (&event[i], cudaEventBlockingSync|cudaEventDisableTiming)); cudaMalloc (&d_xA, array_size); @@ -259,7 +126,7 @@ float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN, } /* If an error occurs during the kernel calls in the loop */ - errCheck (cudaGetLastError()); + cuda_check (cudaGetLastError()); /* Await for last recorded events */ while (nEventsRecorded != 0) @@ -275,7 +142,7 @@ float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN, /* Clean up our events and our stream handle */ for (i=0; i<MAXEVENTS; i++) - errCheck (cudaEventDestroy(event[i])); + cuda_check (cudaEventDestroy(event[i])); cudaFree ((void *) d_xA); @@ -288,8 +155,8 @@ float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN, cudaEventElapsedTime (&elltime, start, stop); - errCheck (cudaEventDestroy (start)); - errCheck (cudaEventDestroy (stop)); + cuda_check (cudaEventDestroy (start)); + cuda_check (cudaEventDestroy (stop)); return elltime; } diff --git a/cudakernel.h b/cudakernel.h index 4e05d796c1e8990efb55ff46710978a88a2a3f47..173c2a2ce871fb1e4a80e3447761aa0d973a4b55 100644 --- a/cudakernel.h +++ b/cudakernel.h @@ -1,9 +1,12 @@ #ifndef _CUDAKERNEL_H #define _CUDAKERNEL_H 1 +#include <cuda.h> + #ifdef __cplusplus __global__ void Cuda_Ell_DblAdd (biguint_t *xarg, biguint_t *zarg, biguint_t *x2arg, biguint_t *z2arg, unsigned int firstinvd); + #endif @@ -11,6 +14,7 @@ __global__ void Cuda_Ell_DblAdd (biguint_t *xarg, biguint_t *zarg, extern "C" { #endif +int get_device_prop(int device, struct cudaDeviceProp *deviceProp); int select_and_init_GPU (int, unsigned int*, int, int); float cuda_Main (biguint_t, biguint_t, biguint_t, digit_t, biguint_t*, biguint_t*, biguint_t*, biguint_t*, mpz_t, unsigned int, diff --git a/cudawrapper.c b/cudawrapper.c index bb684dd4976dc3d1d28fd9e9267600a7c34311f5..d2f2b8a02befcd6a06761ee5ae242908a824103d 100644 --- a/cudawrapper.c +++ b/cudawrapper.c @@ -2,10 +2,13 @@ #ifdef WITH_GPU +#include "cudacommon.h" +#include "cudakernel.h" + #ifdef HAVE_CGBN_H #include "cgbn_stage1.h" #endif /* HAVE_CGBN_H */ -#include "cudakernel.h" + #define TWO32 4294967296 /* 2^32 */ @@ -490,7 +493,7 @@ gpu_ecm (mpz_t f, mpz_t x, int param, mpz_t firstsigma, mpz_t n, mpz_t go, /* Set cudaDeviceScheduleBlockingSync with -cgbn, else cudaDeviceScheduleYield */ int schedule = use_cgbn ? 1 : 0; - /* Initialize the GPU if necessary */ + /* Initialize the GPU if necessary and determine nb_curves */ if (!*device_init) { st = cputime (); @@ -509,7 +512,7 @@ gpu_ecm (mpz_t f, mpz_t x, int param, mpz_t firstsigma, mpz_t n, mpz_t go, /* try running 'nvidia-smi -q -l' on the background . */ *device_init = 1; } - + /* Init arrays */ factors = (mpz_t *) malloc (*nb_curves * sizeof (mpz_t)); ASSERT_ALWAYS (factors != NULL); @@ -578,7 +581,7 @@ gpu_ecm (mpz_t f, mpz_t x, int param, mpz_t firstsigma, mpz_t n, mpz_t go, print_expcurves (B1, B2, dF, k, root_params.S, param); } } - + st = cputime (); if (use_cgbn) { @@ -761,7 +764,5 @@ end_gpu_ecm2: return youpi; } -#endif - - +#endif /* HAVE_GPU */