# Patch summary (unified diff over three Puppet manifests):
#  - cuda.pp: on ppc64el, $cuda_args changes from '--silent --override' to '--silent'.
#  - drivers.pp: adds an arch-dependent $libdir, builds the driver URL from
#    nvidia_driver_arch + nvidia_driver, declares File["/tmp/$nvidia_runfile"] earlier,
#    adds a 'cleanup_nvidia_extracted' exec, narrows 'cleanup_nvidia' to the runfile only,
#    and, when nvidia_user_driver is set, downloads/extracts a second installer, copies
#    libcuda and libnvidia-ptxjitcompiler into $libdir and repoints the .so.1 symlinks.
#  - software_versions.pp: introduces nvidia_driver_arch, drops the arch prefix from
#    nvidia_driver, and sets nvidia_user_driver / CUDA 11.2.2 for ppc64el on bullseye.
# NOTE(review): the new `case "$env::deb_arch"` in drivers.pp has no default branch, so
#   $libdir looks unset on any other architecture -- confirm only amd64/ppc64el are built.
# NOTE(review): nvidia_user_driver is only assigned in the ppc64el/bullseye branch; the
#   `!= undef` test presumably relies on strict_variables being off -- TODO confirm.
# NOTE(review): 'cleanup_nvidia_user_driver' has no `user => root`, unlike the other
#   cleanup execs visible here -- verify this is intentional.
diff --git a/steps/data/setup/puppet/modules/env/manifests/big/configure_nvidia_gpu/cuda.pp b/steps/data/setup/puppet/modules/env/manifests/big/configure_nvidia_gpu/cuda.pp index d00f4bad212c1feb772c6a5f1a3675549604605c..bd75319b81994000795ef3bb46b283ab7947a415 100644 --- a/steps/data/setup/puppet/modules/env/manifests/big/configure_nvidia_gpu/cuda.pp +++ b/steps/data/setup/puppet/modules/env/manifests/big/configure_nvidia_gpu/cuda.pp @@ -15,7 +15,7 @@ class env::big::configure_nvidia_gpu::cuda () { } "ppc64el": { $libcuda = '/usr/lib/powerpc64le-linux-gnu/libcuda.so' - $cuda_args = '--silent --override' # Do not check gcc version with --override arg (cf. #12791) + $cuda_args = '--silent' } } } diff --git a/steps/data/setup/puppet/modules/env/manifests/big/configure_nvidia_gpu/drivers.pp b/steps/data/setup/puppet/modules/env/manifests/big/configure_nvidia_gpu/drivers.pp index a277c80abfe178291faf82ae5e68e6dacb897863..206b6125ded94da14cfe38ee00109fafcea687c0 100644 --- a/steps/data/setup/puppet/modules/env/manifests/big/configure_nvidia_gpu/drivers.pp +++ b/steps/data/setup/puppet/modules/env/manifests/big/configure_nvidia_gpu/drivers.pp @@ -4,10 +4,24 @@ class env::big::configure_nvidia_gpu::drivers () { include env::big::prepare_kernel_module_build - $driver_source = "http://packages.grid5000.fr/other/nvidia/NVIDIA-Linux-$::env::common::software_versions::nvidia_driver.run" + case "$env::deb_arch" { + "amd64": { + $libdir = '/usr/lib/x86_64-linux-gnu' + } + "ppc64el": { + $libdir = '/usr/lib/powerpc64le-linux-gnu' + } + } + + $driver_source = "http://packages.grid5000.fr/other/nvidia/NVIDIA-Linux-${::env::common::software_versions::nvidia_driver_arch}-${::env::common::software_versions::nvidia_driver}.run" $nvidia_basename = 'NVIDIA-Linux' $nvidia_runfile = "$nvidia_basename.run" + file{ + "/tmp/$nvidia_runfile": + ensure => file, + require => Exec['retrieve_nvidia_drivers']; + } exec{ 'retrieve_nvidia_drivers': command => "/usr/bin/wget -q $driver_source -O 
/tmp/$nvidia_runfile; chmod u+x /tmp/$nvidia_runfile", @@ -25,6 +39,10 @@ class env::big::configure_nvidia_gpu::drivers () { command => "/usr/bin/sed -i 's/MODULE_LICENSE(\"NVIDIA\");/MODULE_LICENSE(\"GPL\");/' /tmp/$nvidia_basename/kernel/nvidia-modeset/nvidia-modeset-linux.c; /usr/bin/sed -i 's/MODULE_LICENSE(\"NVIDIA\");/MODULE_LICENSE(\"GPL\");/' /tmp/$nvidia_basename/kernel/nvidia/nv-frontend.c", user => root, require => Exec['extract_nvidia_driver']; + 'cleanup_nvidia_extracted': + command => "/bin/rm -r /tmp/$nvidia_basename", + user => root, + require => Exec['install_nvidia_driver']; } $nvidia_installer = "$nvidia_basename/nvidia-installer" } else { @@ -41,14 +59,62 @@ class env::big::configure_nvidia_gpu::drivers () { environment => ['MODPROBE_OPTIONS=--dry-run'], require => [Exec['prepare_kernel_module_build'], File["/tmp/$nvidia_runfile"]]; 'cleanup_nvidia': - command => "/bin/rm -r /tmp/$nvidia_basename*", + command => "/bin/rm /tmp/$nvidia_runfile", user => root, require => Exec['install_nvidia_driver']; } - file{ - "/tmp/$nvidia_runfile": - ensure => file, - require => Exec['retrieve_nvidia_drivers']; + if ($::env::common::software_versions::nvidia_user_driver != undef) { + + # Install a different user-mode driver. + # See https://docs.nvidia.com/deploy/cuda-compatibility/index.html#forward-compatible-upgrade + # This allows using an old kernel driver with a newer user-mode driver (and thus supporting newer CUDA versions) + # It is based on the NVIDIA driver installer, but we only extract relevant files. 
+ $user_driver_source = "http://packages.grid5000.fr/other/nvidia/NVIDIA-Linux-${::env::common::software_versions::nvidia_driver_arch}-${::env::common::software_versions::nvidia_user_driver}.run" + + file{ + '/tmp/NVIDIA-Linux-user-driver.run': + ensure => file, + require => Exec['retrieve_nvidia_user_driver']; + } + exec{ + 'retrieve_nvidia_user_driver': + command => "/usr/bin/wget -q $user_driver_source -O /tmp/NVIDIA-Linux-user-driver.run; chmod u+x /tmp/NVIDIA-Linux-user-driver.run", + timeout => 1200, # 20 min + creates => "/tmp/NVIDIA-Linux-user-driver.run"; + 'extract_nvidia_user_driver': + command => "/tmp/NVIDIA-Linux-user-driver.run -x --target /tmp/NVIDIA-Linux-user-driver", + timeout => 600, # 10 min + require => File['/tmp/NVIDIA-Linux-user-driver.run']; + 'cleanup_nvidia_user_driver': + command => "/bin/rm /tmp/NVIDIA-Linux-user-driver.run", + require => Exec['extract_nvidia_user_driver']; + } + file{ + # Copy libraries from the newer driver + "${libdir}/libcuda.so.${::env::common::software_versions::nvidia_user_driver}": + source => "/tmp/NVIDIA-Linux-user-driver/libcuda.so.${::env::common::software_versions::nvidia_user_driver}", + mode => '0755', + require => Exec['extract_nvidia_user_driver']; + "${libdir}/libnvidia-ptxjitcompiler.so.${::env::common::software_versions::nvidia_user_driver}": + source => "/tmp/NVIDIA-Linux-user-driver/libnvidia-ptxjitcompiler.so.${::env::common::software_versions::nvidia_user_driver}", + mode => '0755', + require => Exec['extract_nvidia_user_driver']; + # Override symlinks so that they point to the newer driver + "${libdir}/libcuda.so.1": + ensure => link, + target => "libcuda.so.${::env::common::software_versions::nvidia_user_driver}", + replace => true, + require => Exec['install_nvidia_driver']; + "${libdir}/libnvidia-ptxjitcompiler.so.1": + ensure => link, + target => "libnvidia-ptxjitcompiler.so.${::env::common::software_versions::nvidia_user_driver}", + replace => true, + require => 
Exec['install_nvidia_driver']; + } -> + exec{ + 'cleanup_nvidia_user_driver_files': + command => "/bin/rm -r /tmp/NVIDIA-Linux-user-driver", + } } } diff --git a/steps/data/setup/puppet/modules/env/manifests/common/software_versions.pp b/steps/data/setup/puppet/modules/env/manifests/common/software_versions.pp index 1dcb39c9e5a5e3f5c68035fe57128f541f445eb3..7ce507322b5886815683567c9d1cd7a74ddfc1d1 100644 --- a/steps/data/setup/puppet/modules/env/manifests/common/software_versions.pp +++ b/steps/data/setup/puppet/modules/env/manifests/common/software_versions.pp @@ -14,15 +14,16 @@ class env::common::software_versions { case "$env::deb_arch" { 'amd64': { + $nvidia_driver_arch = 'x86_64' case $lsbdistcodename { 'stretch', 'buster': { - $nvidia_driver = 'x86_64-450.119.04' + $nvidia_driver = '450.119.04' $nvidia_cuda = '10.1.243_418.87.00_linux' $datacenter_gpu_manager = '1:1.7.2' $dcgm_exporter = '2.0.0-rc.11' } 'bullseye': { - $nvidia_driver = 'x86_64-460.73.01' + $nvidia_driver = '460.73.01' $nvidia_cuda = '11.2.2_460.32.03_linux' $datacenter_gpu_manager = '1:2.1.4' $dcgm_exporter = '2.3.0-1' @@ -30,16 +31,21 @@ class env::common::software_versions { } } 'ppc64el': { - # Newer version of the driver (440.X, 450.X) are unstable and cause kernel panic. + # We are stuck on driver 418 for ppc64. + # Newer versions of the driver (440.X, 450.X, 460.X) are unstable and cause kernel panics. # See https://intranet.grid5000.fr/bugzilla/show_bug.cgi?id=12545 - $nvidia_cuda = '10.1.243_418.87.00_linux_ppc64le' - $nvidia_driver = 'ppc64le-418.197.02' + $nvidia_driver_arch = 'ppc64le' case $lsbdistcodename { 'stretch', 'buster': { + $nvidia_driver = '418.197.02' + $nvidia_cuda = '10.1.243_418.87.00_linux_ppc64le' $datacenter_gpu_manager = '1:1.7.2' $dcgm_exporter = '2.0.0-rc.11' } 'bullseye': { + $nvidia_driver = '418.197.02' + $nvidia_user_driver = '460.73.01' + $nvidia_cuda = '11.2.2_460.32.03_linux_ppc64le' $datacenter_gpu_manager = '1:2.0.15' $dcgm_exporter = '2.3.0-1' }