diff --git a/Makefile.am b/Makefile.am
index b6af5fd6ab8d155f8f28018c4ce7eb2b40f0a08e..17e2551a25e485e4e8fe0cfd81118c1553fd59a8 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -53,7 +53,7 @@ libecm_la_CFLAGS = $(OPENMP_CFLAGS) -g
 libecm_la_LDFLAGS = $(LIBECM_LDFLAGS) -version-info 1:0:0 -g
 libecm_la_LIBADD = $(MULREDCLIBRARY)
 if WANT_GPU 
-  libecm_la_SOURCES += cudakernel.cu
+  libecm_la_SOURCES += cudakernel.cu cudacommon.cu
 if WANT_CGBN
   libecm_la_SOURCES += cgbn_stage1.cu
 endif
@@ -111,7 +111,8 @@ endif
 
 include_HEADERS = ecm.h
 noinst_HEADERS = basicdefs.h ecm-impl.h ecm-gmp.h ecm-ecm.h sp.h longlong.h \
-                 ecm-params.h mpmod.h ecm-gpu.h cudakernel.h cgbn_stage1.h \
+                 ecm-params.h mpmod.h ecm-gpu.h \
+                 cudakernel.h cudacommon.h cgbn_stage1.h \
                  addlaws.h getprime_r.h ecm_int.h \
                  aprtcle/mpz_aprcl.h aprtcle/jacobi_sum.h
 
diff --git a/cgbn_stage1.cu b/cgbn_stage1.cu
index 6e3cc9d6495bdb4b024f9802cd9e409ff87f6661..57dffb4ba7a28b3294846fd8690c4fde0906d32e 100644
--- a/cgbn_stage1.cu
+++ b/cgbn_stage1.cu
@@ -21,6 +21,10 @@ http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
 #ifndef _CGBN_STAGE1_CU
 #define _CGBN_STAGE1_CU 1
 
+#ifndef __CUDACC__
+#error "This file should only be compiled with nvcc"
+#endif
+
 #include "cgbn_stage1.h"
 
 #include <cassert>
@@ -33,6 +37,8 @@ http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
 #include <cgbn.h>
 #include <cuda.h>
 
+#include "cudacommon.h"
+
 #include "ecm.h"
 #include "ecm-gpu.h"
 
@@ -57,16 +63,7 @@ http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
     #define FORCE_INLINE
 #endif
 
-void cuda_check(cudaError_t status, const char *action=NULL, const char *file=NULL, int32_t line=0) {
-  // check for cuda errors
-  if (status!=cudaSuccess) {
-    fprintf (stderr, "CUDA error (%d) occurred: %s\n", status, cudaGetErrorString(status));
-    if (action!=NULL)
-      fprintf (stderr, "While running %s   (file %s, line %d)\n", action, file, line);
-    exit(1);
-  }
-}
-
+// support routine copied from  "CGBN/samples/utility/support.h"
 void cgbn_check(cgbn_error_report_t *report, const char *file=NULL, int32_t line=0) {
   // check for cgbn errors
 
@@ -92,8 +89,6 @@ void cgbn_check(cgbn_error_report_t *report, const char *file=NULL, int32_t line
   }
 }
 
-// Unify this with cudakernel.cu
-#define CUDA_CHECK(action) cuda_check(action, #action, __FILE__, __LINE__)
 #define CGBN_CHECK(report) cgbn_check(report, __FILE__, __LINE__)
 
 static
@@ -107,7 +102,7 @@ void from_mpz(const mpz_t s, uint32_t *x, uint32_t count) {
 
   if(mpz_sizeinbase (s, 2) > count * 32) {
     fprintf (stderr, "from_mpz failed -- result does not fit\n");
-    exit(1);
+    exit(EXIT_FAILURE);
   }
 
   mpz_export (x, &words, -1, sizeof(uint32_t), 0, 0, s);
@@ -473,9 +468,9 @@ int findfactor(mpz_t factor, const mpz_t N, const mpz_t x_final, const mpz_t y_f
     mpz_t temp;
     mpz_init(temp);
 
-    // Check if factor found
-
+    /* Check if factor found */
     bool inverted = mpz_invert(temp, y_final, N);    // aY ^ (N-2) % N
+
     if (inverted) {
         mpz_mul(temp, x_final, temp);         // aX * aY^-1
         mpz_mod(factor, temp, N);             // "Residual"
@@ -493,7 +488,7 @@ static
 int verify_size_of_n(const mpz_t N, size_t max_bits) {
   size_t n_log2 = mpz_sizeinbase(N, 2);
 
-  // using check_gpuecm.sage it looks like 4 bits would suffice
+  /* Using check_gpuecm.sage it looks like 4 bits would suffice. */
   size_t max_usable_bits = max_bits - CARRY_BITS;
 
   if (n_log2 <= max_usable_bits)
@@ -595,8 +590,9 @@ int process_results(mpz_t *factors, int *array_found,
 }
 
 int cgbn_ecm_stage1(mpz_t *factors, int *array_found,
+             const mpz_t N, const mpz_t s,
              uint32_t curves, uint32_t sigma,
-             const mpz_t N, const mpz_t s, float *gputime)
+             float *gputime, int verbose)
 {
   assert( sigma > 0 );
   assert( ((uint64_t) sigma + curves) <= 0xFFFFFFFF ); // no overflow
@@ -685,6 +681,10 @@ int cgbn_ecm_stage1(mpz_t *factors, int *array_found,
       TPI = (BITS <= 512) ? 4 : (BITS <= 2048) ? 8 : (BITS <= 8192) ? 16 : 32;
       IPB = TPB / TPI;
       BLOCK_COUNT = (curves + IPB - 1) / IPB;
+
+      /* Print some debug info about kernel. */
+      kernel_info((const void*)kernel_double_add<cgbn_params_512>, verbose);
+
       break;
     }
   }
@@ -828,4 +828,10 @@ int cgbn_ecm_stage1(mpz_t *factors, int *array_found,
   return youpi;
 }
 
+#ifdef __CUDA_ARCH__
+  #if __CUDA_ARCH__ < 300
+    #error "Unsupported architecture"
+  #endif
+#endif
+
 #endif  /* _CGBN_STAGE1_CU */
diff --git a/cgbn_stage1.h b/cgbn_stage1.h
index e26a1e030076ed017653da536b0a5154b441468d..954beb0a40700f8dbf982fb68938e4c6ee500e3c 100644
--- a/cgbn_stage1.h
+++ b/cgbn_stage1.h
@@ -30,8 +30,9 @@ extern "C" {
 #endif
 
 int cgbn_ecm_stage1(mpz_t *factors, int *array_found,
+             const mpz_t N, const mpz_t s,
              uint32_t curves, uint32_t sigma,
-             const mpz_t N, const mpz_t s, float *gputime);
+             float *gputime, int verbose);
 
 #ifdef __cplusplus
 }
diff --git a/cudacommon.cu b/cudacommon.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c35e3bfe50cb55392c04c2b7b48b796d5bb77219
--- /dev/null
+++ b/cudacommon.cu
@@ -0,0 +1,148 @@
+/* When compiling the CUDA code, we do not want to include all ecm-impl.h*/
+#define _DO_NOT_INCLUDE_ECM_IMPL_H
+
+#include "cudacommon.h"
+#include "ecm-gpu.h"
+
+#include <stdio.h>
+
+
+#ifndef __CUDACC__
+#error "This file should only be compiled with nvcc"
+#endif
+
+/* First call to a global function initialize the device */
+__global__ void Cuda_Init_Device ()
+{
+}
+
+/* Given the compute compatibility (as major.minor), return the number of block
+ * to be run on one multiprocessor. */
+static unsigned int
+getNumberOfBlockPerMultiProcessor (int major, int minor)
+{
+  /* For 2.0 and 2.1, limited by the maximum number of threads per MP and the
+   * number of available registrer (need 23 registers per threads).
+   */
+  if (major == 2)
+    return 1;
+  /* For 3.0, 3.2, 3.5 and 3.7 limited by the maximum number of threads per MP.
+   */
+  else if (major == 3)
+    return 2;
+  /* For 5.0, 5.2, and 5.3 limited by the maximum number of threads per MP. */
+  else if (major == 5)
+    return 2;
+  /* We assume that for newer compute capability the properties of the GPU won't
+   * decrease.
+   */
+  else
+    return 2;
+}
+
+extern "C"
+int
+get_device_prop(int device, cudaDeviceProp *deviceProp)
+{
+  cudaError_t err;
+
+  if (device!=-1)
+    {
+      err = cudaSetDevice(device);
+      if (err != cudaSuccess)
+        {
+          fprintf (stderr, "GPU: Error: Could not use device %d\n", device);
+          fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err));
+          return 0;
+        }
+    }
+
+  err = cudaGetDevice (&device);
+  if (err != cudaSuccess)
+    {
+      fprintf (stderr, "GPU: Error: no active device.\n");
+      fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err));
+      return 0;
+    }
+
+  err = cudaGetDeviceProperties (deviceProp, device);
+  if (err != cudaSuccess)
+    {
+      fprintf (stderr, "GPU: Error while getting device's properties.\n");
+      fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err));
+      return 0;
+    }
+  return 1;
+}
+
+extern "C"
+int
+select_and_init_GPU (int device, unsigned int *number_of_curves, int verbose, int schedule)
+{
+  cudaDeviceProp deviceProp;
+
+  if (device!=-1 && verbose)
+    fprintf (stdout, "GPU: device %d is required.\n", device);
+
+  if (!get_device_prop(device, &deviceProp))
+    return -1;
+
+  if (verbose)
+    {
+      printf ("GPU: will use device %d: %s, compute capability %d.%d, %d MPs.\n"
+              "GPU: maxSharedPerBlock = %zu maxThreadsPerBlock = %d "
+              "maxRegsPerBlock = %d\n", device, deviceProp.name,
+              deviceProp.major, deviceProp.minor,
+              deviceProp.multiProcessorCount, deviceProp.sharedMemPerBlock,
+              deviceProp.maxThreadsPerBlock, deviceProp.regsPerBlock);
+    }
+
+
+  if (*number_of_curves == 0) /* if choose the number of curves */
+    {
+      unsigned int n, m = ECM_GPU_CURVES_BY_BLOCK;
+      n = getNumberOfBlockPerMultiProcessor (deviceProp.major, deviceProp.minor);
+      *number_of_curves = n * deviceProp.multiProcessorCount * m;
+    }
+  else if (*number_of_curves % ECM_GPU_CURVES_BY_BLOCK != 0)
+    {
+      /* number_of_curves should be a multiple of ECM_GPU_CURVES_BY_BLOCK */
+      *number_of_curves = (*number_of_curves / ECM_GPU_CURVES_BY_BLOCK + 1) *
+                                                        ECM_GPU_CURVES_BY_BLOCK;
+      if (verbose)
+          fprintf(stderr, "GPU: the requested number of curves has been "
+                          "modified to %u\n", *number_of_curves);
+    }
+
+  /* First call to a global function initialize the device */
+  if (schedule == 1)
+    {
+      cuda_check (cudaSetDeviceFlags (cudaDeviceScheduleBlockingSync));
+    }
+  else
+    {
+      cuda_check (cudaSetDeviceFlags (cudaDeviceScheduleYield));
+    }
+  Cuda_Init_Device<<<1, 1>>> ();
+  cuda_check (cudaGetLastError());
+
+  return 0;
+}
+
+void
+kernel_info(const void* func, int verbose)
+{
+  if (verbose)
+  {
+    struct cudaFuncAttributes kernelAttr;
+    cudaError_t err = cudaFuncGetAttributes (&kernelAttr, func);
+    if (err == cudaSuccess)
+      printf ("GPU: Using device code targeted for architecture compile_%d\n"
+              "GPU: Ptx version is %d\nGPU: maxThreadsPerBlock = %d\n"
+              "GPU: numRegsPerThread = %d sharedMemPerBlock = %zu bytes\n",
+              kernelAttr.binaryVersion, kernelAttr.ptxVersion,
+              kernelAttr.maxThreadsPerBlock, kernelAttr.numRegs,
+              kernelAttr.sharedSizeBytes);
+  }
+}
+
diff --git a/cudacommon.h b/cudacommon.h
new file mode 100644
index 0000000000000000000000000000000000000000..b668c6f1deca3d619ab7dabe551c8498b68e5f7c
--- /dev/null
+++ b/cudacommon.h
@@ -0,0 +1,38 @@
+#ifndef _CUDACOMMON_H
+#define _CUDACOMMON_H 1
+
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+/* cpp + CUDA only code */
+
+#define CUDA_CHECK(action) cuda_check(action, #action, __FILE__, __LINE__)
+
+inline void cuda_check(cudaError_t status, const char *action=NULL, const char *file=NULL, int32_t line=0) {
+  if (status != cudaSuccess) {
+    fprintf (stderr, "CUDA error (%d) occurred: %s\n", status, cudaGetErrorString(status));
+    if (action!=NULL)
+      fprintf (stderr, "While running %s   (file %s, line %d)\n", action, file, line);
+    exit(EXIT_FAILURE);
+  }
+}
+
+
+void kernel_info(const void* func, int verbose);
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int get_device_prop(int device, struct cudaDeviceProp *deviceProp);
+int select_and_init_GPU (int, unsigned int*, int, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CUDACOMMON_H */
diff --git a/cudakernel.cu b/cudakernel.cu
index c4f49f1aa717276ec5d241b7181014acdfce7301..ae121c7dc4292c7d39005333a026d6ecb0758dc2 100644
--- a/cudakernel.cu
+++ b/cudakernel.cu
@@ -3,6 +3,7 @@
 #include "ecm-gpu.h"
 #include <gmp.h>
 #include "cudakernel.h"
+#include "cudacommon.h"
 
 #ifndef __CUDACC__
 #error "This file should only be compiled with nvcc"
@@ -14,152 +15,14 @@ __device__ biguint_t d_3Ncst;
 __device__ biguint_t d_Mcst;
 
 
-#define errCheck(err) cuda_errCheck (err, __FILE__, __LINE__)
-#define cudaMalloc(d, size) errCheck (cudaMalloc (d, size))
-#define cudaMemcpyHtoD(d, h, size) errCheck (cudaMemcpy ((void *) d, \
+#define cudaMalloc(d, size) cuda_check (cudaMalloc (d, size))
+#define cudaMemcpyHtoD(d, h, size) cuda_check (cudaMemcpy ((void *) d, \
                                     (void *) h, size, cudaMemcpyHostToDevice))
-#define cudaMemcpyDtoH(h, d, size) errCheck (cudaMemcpy ((void *) h, \
+#define cudaMemcpyDtoH(h, d, size) cuda_check (cudaMemcpy ((void *) h, \
                                     (void *) d, size, cudaMemcpyDeviceToHost))
-#define cudaMemcpyCst(d, h, size) errCheck (cudaMemcpyToSymbol (d, h, size))
+#define cudaMemcpyCst(d, h, size) cuda_check (cudaMemcpyToSymbol (d, h, size))
 
 
-/******************************/
-/* Host code handling the GPU */
-/******************************/
-
-inline void cuda_errCheck (cudaError err, const char *file, const int line)
-{
-  if( err != cudaSuccess ) 
-  {
-    fprintf(stderr, "%s(%i) : Error cuda : %s.\n",
-              file, line, cudaGetErrorString( err) );
-    exit(EXIT_FAILURE);
-  }
-}
-
-/* First call to a global function initialize the device */
-__global__ void Cuda_Init_Device ()
-{
-}
-
-/* Given the compute compatibility (as major.minor), return the number of block
- * to be run on one multiprocessor. */
-unsigned int
-getNumberOfBlockPerMultiProcessor (int major, int minor)
-{
-  /* For 2.0 and 2.1, limited by the maximum number of threads per MP and the
-   * number of available registrer (need 23 registers per threads).
-   */
-  if (major == 2)
-    return 1;
-  /* For 3.0, 3.2, 3.5 and 3.7 limited by the maximum number of threads per MP.
-   */
-  else if (major == 3)
-    return 2;
-  /* For 5.0, 5.2, and 5.3 limited by the maximum number of threads per MP. */
-  else if (major == 5)
-    return 2;
-  /* We assume that for newer compute capability the properties of the GPU won't
-   * decrease.
-   */
-  else
-    return 2;
-}
-
-extern "C" 
-int 
-select_and_init_GPU (int device, unsigned int *number_of_curves, int verbose, int schedule)
-{
-  cudaDeviceProp deviceProp;
-  cudaError_t err;
-        
-  if (device!=-1)
-    {
-      if (verbose)
-          fprintf (stdout, "GPU: device %d is required.\n", device);
-
-      err = cudaSetDevice(device);
-      if (err != cudaSuccess)
-        {
-          fprintf (stderr, "GPU: Error: Could not use device %d\n", device);
-          fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err));
-          return -1;
-        }
-    }
-  
-  err = cudaGetDevice (&device);
-  if (err != cudaSuccess)
-    {
-      fprintf (stderr, "GPU: Error: no active device.\n");
-      fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err));
-      return -1;
-    }
-
-  err = cudaGetDeviceProperties (&deviceProp, device);
-  if (err != cudaSuccess)
-    {
-      fprintf (stderr, "GPU: Error while getting device's properties.\n");
-      fprintf (stderr, "GPU: Error msg: %s\n", cudaGetErrorString(err));
-      return -1;
-    }
-
-  if (verbose)
-    {
-      printf ("GPU: will use device %d: %s, compute capability %d.%d, %d MPs.\n"
-              "GPU: maxSharedPerBlock = %zu maxThreadsPerBlock = %d "
-              "maxRegsPerBlock = %d\n", device, deviceProp.name,
-              deviceProp.major, deviceProp.minor,
-              deviceProp.multiProcessorCount, deviceProp.sharedMemPerBlock,
-              deviceProp.maxThreadsPerBlock, deviceProp.regsPerBlock);
-    }
-
-
-  if (*number_of_curves == 0) /* if choose the number of curves */
-    {
-      unsigned int n, m = ECM_GPU_CURVES_BY_BLOCK;
-      n = getNumberOfBlockPerMultiProcessor (deviceProp.major, deviceProp.minor);
-      *number_of_curves = n * deviceProp.multiProcessorCount * m;
-    }
-  else if (*number_of_curves % ECM_GPU_CURVES_BY_BLOCK != 0)
-    {
-      /* number_of_curves should be a multiple of ECM_GPU_CURVES_BY_BLOCK */
-      *number_of_curves = (*number_of_curves / ECM_GPU_CURVES_BY_BLOCK + 1) * 
-                                                        ECM_GPU_CURVES_BY_BLOCK;
-      if (verbose)
-          fprintf(stderr, "GPU: the requested number of curves has been "
-                          "modified to %u\n", *number_of_curves);
-    }
-
-  /* First call to a global function initialize the device */
-  if (schedule == 1)
-    {
-      errCheck (cudaSetDeviceFlags (cudaDeviceScheduleBlockingSync));
-    }
-  else
-    {
-      errCheck (cudaSetDeviceFlags (cudaDeviceScheduleYield));
-    }
-  Cuda_Init_Device<<<1, 1>>> ();
-  errCheck (cudaGetLastError()); 
-
-  if (verbose)
-  {
-    struct cudaFuncAttributes kernelAttr;
-    err = cudaFuncGetAttributes (&kernelAttr, Cuda_Ell_DblAdd);
-    if (err == cudaSuccess)
-    {
-      printf ("GPU: Using device code targeted for architecture compile_%d\n"
-              "GPU: Ptx version is %d\nGPU: maxThreadsPerBlock = %d\n"
-              "GPU: numRegsPerThread = %d sharedMemPerBlock = %zu bytes\n",
-              kernelAttr.binaryVersion, kernelAttr.ptxVersion,
-              kernelAttr.maxThreadsPerBlock, kernelAttr.numRegs,
-              kernelAttr.sharedSizeBytes);
-    }
-  }
-
-  return 0;
-}
-
 extern "C"
 float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN,
                  biguint_t *h_xarray, biguint_t *h_zarray,
@@ -167,6 +30,10 @@ float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN,
                  unsigned int firstinvd, unsigned int number_of_curves,
                  int verbose)
 {
+  /* Print some debug info about the kernel */
+  kernel_info((const void*) Cuda_Ell_DblAdd, verbose);
+
+
   cudaEvent_t start, stop;
   cudaEventCreate (&start);
   cudaEventCreate (&stop);
@@ -197,7 +64,7 @@ float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN,
 
   /* Create a pair of events to pace ourselves */
   for (i=0; i<MAXEVENTS; i++)
-    errCheck (cudaEventCreateWithFlags (&event[i], 
+    cuda_check (cudaEventCreateWithFlags (&event[i],
                               cudaEventBlockingSync|cudaEventDisableTiming));
 
   cudaMalloc (&d_xA, array_size);
@@ -259,7 +126,7 @@ float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN,
   }
 
   /* If an error occurs during the kernel calls in the loop */
-  errCheck (cudaGetLastError()); 
+  cuda_check (cudaGetLastError());
 
   /* Await for last recorded events */
   while (nEventsRecorded != 0) 
@@ -275,7 +142,7 @@ float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN,
 
   /* Clean up our events and our stream handle */
   for (i=0; i<MAXEVENTS; i++)
-    errCheck (cudaEventDestroy(event[i]));
+    cuda_check (cudaEventDestroy(event[i]));
 
 
   cudaFree ((void *) d_xA);
@@ -288,8 +155,8 @@ float cuda_Main (biguint_t h_N, biguint_t h_3N, biguint_t h_M, digit_t h_invN,
 
   cudaEventElapsedTime (&elltime, start, stop);
 
-  errCheck (cudaEventDestroy (start));
-  errCheck (cudaEventDestroy (stop));
+  cuda_check (cudaEventDestroy (start));
+  cuda_check (cudaEventDestroy (stop));
 
   return elltime;
 }
diff --git a/cudakernel.h b/cudakernel.h
index 4e05d796c1e8990efb55ff46710978a88a2a3f47..173c2a2ce871fb1e4a80e3447761aa0d973a4b55 100644
--- a/cudakernel.h
+++ b/cudakernel.h
@@ -1,9 +1,12 @@
 #ifndef _CUDAKERNEL_H
 #define _CUDAKERNEL_H 1
 
+#include <cuda.h>
+
 #ifdef __cplusplus
 __global__ void Cuda_Ell_DblAdd (biguint_t *xarg, biguint_t *zarg, 
                   biguint_t *x2arg, biguint_t *z2arg, unsigned int firstinvd);
+
 #endif
 
 
@@ -11,6 +14,7 @@ __global__ void Cuda_Ell_DblAdd (biguint_t *xarg, biguint_t *zarg,
 extern "C" {
 #endif
 
+int get_device_prop(int device, struct cudaDeviceProp *deviceProp);
 int select_and_init_GPU (int, unsigned int*, int, int);
 float cuda_Main (biguint_t, biguint_t, biguint_t, digit_t, biguint_t*,
                         biguint_t*, biguint_t*, biguint_t*, mpz_t, unsigned int,
diff --git a/cudawrapper.c b/cudawrapper.c
index bb684dd4976dc3d1d28fd9e9267600a7c34311f5..d2f2b8a02befcd6a06761ee5ae242908a824103d 100644
--- a/cudawrapper.c
+++ b/cudawrapper.c
@@ -2,10 +2,13 @@
 
 #ifdef WITH_GPU
 
+#include "cudacommon.h"
+#include "cudakernel.h"
+
 #ifdef HAVE_CGBN_H
 #include "cgbn_stage1.h"
 #endif /* HAVE_CGBN_H */
-#include "cudakernel.h"
+
 
 #define TWO32 4294967296 /* 2^32 */
 
@@ -490,7 +493,7 @@ gpu_ecm (mpz_t f, mpz_t x, int param, mpz_t firstsigma, mpz_t n, mpz_t go,
   /* Set cudaDeviceScheduleBlockingSync with -cgbn, else cudaDeviceScheduleYield */
   int schedule = use_cgbn ? 1 : 0;
 
-  /* Initialize the GPU if necessary */
+  /* Initialize the GPU if necessary and determine nb_curves */
   if (!*device_init)
     {
       st = cputime ();
@@ -509,7 +512,7 @@ gpu_ecm (mpz_t f, mpz_t x, int param, mpz_t firstsigma, mpz_t n, mpz_t go,
       /* try running 'nvidia-smi -q -l' on the background .                 */
       *device_init = 1;
     }
-  
+
   /* Init arrays */
   factors = (mpz_t *) malloc (*nb_curves * sizeof (mpz_t));
   ASSERT_ALWAYS (factors != NULL);
@@ -578,7 +581,7 @@ gpu_ecm (mpz_t f, mpz_t x, int param, mpz_t firstsigma, mpz_t n, mpz_t go,
           print_expcurves (B1, B2, dF, k, root_params.S, param);
         }
     }
-  
+
   st = cputime ();
 
   if (use_cgbn) {
@@ -761,7 +764,5 @@ end_gpu_ecm2:
 
   return youpi;
 }
-#endif
-
-
 
+#endif /* HAVE_GPU */