Hello,
I am trying to figure out how to correctly configure my SwarmSpawner so that it can spawn a GPU-enabled single-user image.
I have a swarm of GPU-capable worker nodes. Everything works fine, except that when the GPU-enabled image is spawned, it does not see the GPUs and nvidia-smi is not available.
When I run the Docker container manually on the node, it works: I can see the GPU and nvidia-smi is available. I believe the problem comes from the configuration of DockerSpawner or SwarmSpawner.
The image I use comes from the GitHub project iot-salzburg/gpu-jupyter ("GPU-Jupyter: Leverage the flexibility of Jupyterlab through the power of your NVIDIA GPU to run your code from Tensorflow and Pytorch in collaborative notebooks on the GPU").
I pull the image cschranz/gpu-jupyter:v1.6_cuda-12.0_ubuntu-22.04.
If I run this command on a worker node manually:
docker run --gpus all -d -it -p 80:8888 -v $(pwd)/data:/home/jovyan/work -e GRANT_SUDO=yes -e JUPYTER_ENABLE_LAB=yes --user root cschranz/gpu-jupyter:v1.6_cuda-12.0_ubuntu-22.04
I can access the Jupyter interface, and the nvidia-smi command exists and gives the correct output.
What am I missing?
Here are the docker-compose.yaml and jupyterhub_config.py files:
# Stack definition for the JupyterHub hub service on Docker Swarm.
version: "3.9"

services:
  jupyterhub:
    image: 127.0.0.1:5000/jupyterhub-swarm-hub
    hostname: jupyterhub
    deploy:
      mode: replicated
      replicas: 1
      restart_policy:
        condition: on-failure
        delay: 30s
        window: 300s
      # Ensure that we execute on a Swarm manager
      placement:
        constraints:
          - node.role == manager
    networks:
      - jupyterhub-net
    ports:
      - "8080:8000"
    environment:
      # Read by jupyterhub_config.py; must match the name of the overlay network below.
      DOCKER_NETWORK_NAME: jupyterhub_network
    configs:
      - source: jupyter-conf
        target: /srv/jupyterhub/jupyterhub_config.py
        mode: 0500
    volumes:
      # The hub talks to the Docker daemon through the host socket.
      - /var/run/docker.sock:/var/run/docker.sock
      - /etc/hostname:/etc/hostname

configs:
  jupyter-conf:
    external: true
    name: jupyterhub_config.py

networks:
  jupyterhub-net:
    driver: overlay
    attachable: true
    name: jupyterhub_network
And here is the jupyterhub_config.py file:
import docker
import os
import shlex
from shutil import chown
import socket
from dockerspawner import SwarmSpawner
from docker.types import Mount
from ldap3 import Server, Connection, ALL
### GLOBAL VARIABLE
# NOTE(review): no visible code assigns or reads this module-level name; the
# spawner class keeps the selected host in its own `worker_hostname` attribute.
# Presumably a leftover -- confirm before removing.
worker_hostname = ""
class CustomSwarmSpawner(SwarmSpawner):
    """SwarmSpawner driven by a spawn form.

    The form lets the user pick the server type (JupyterLab or classic
    notebook), the image, the Swarm node to run on, and CPU / memory limits.
    """

    # Hostname of the Swarm node chosen in the spawn form ("" until a form
    # has been submitted).
    worker_hostname = ""

    def get_worker_hostname(self):
        """Return the hostname of the node chosen in the spawn form."""
        return self.worker_hostname

    @property
    def mounts(self):
        """Return one bind ``Mount`` per entry of ``self.volume_binds``.

        Every mount is created read-write and without a driver config; the
        values of ``volume_binds`` are only consulted for their ``"bind"``
        (container path) key. An empty ``volume_binds`` yields an empty list.
        """
        return [
            Mount(
                target=vol["bind"],
                source=host_loc,
                type="bind",
                read_only=False,
                driver_config=None,
            )
            for host_loc, vol in self.volume_binds.items()
        ]

    def _options_form_default(self):
        """Build the HTML spawn form shown to the user.

        Returns:
            The form as an HTML string, with the CPU and memory ``<select>``
            elements populated from the option lists below.
        """
        cpu_options = [1, 10, 20, 30, 40, 48]  # Possible values for CPUs
        memory_options = [1, 2, 4, 8, 16, 32, 64]  # Possible values for memory in GB
        # The template lines are deliberately flush-left so that the emitted
        # HTML does not depend on the indentation of this method.
        form_template = """
<div class="form-group">
<label for="args">Choisissez le type de serveur</label>
<select id="args" class="form-control" name="args">
<option value="lab">Jupyter Lab</option>
<option value="notebook">Notebook classique</option>
</select>
</div>
<div class="form-group">
<label for="stacks">Choisissez une image</label>
<select id="stacks" class="form-control" name="stack">
<option value="cschranz/gpu-jupyter:v1.6_cuda-12.0_ubuntu-22.04">Tensorflow/Pytorch GPU</option>
<option value="jupyter/minimal-notebook:x86_64-python-3.11.6">Minimal</option>
<option value="quay.io/jupyter/datascience-notebook">Datascience</option>
<option value="quay.io/jupyter/r-notebook">R</option>
</select>
</div>
<div class="form-group">
<label for="hostname">Choisissez la machine sur laquelle vous souhaitez lancer l'image</label>
<select id="hostname" class="form-control" name="hostname">
<option value="host1">host1 - GPU: RTX A5000 / CPU: 40</option>
<option value="host2">host2 - GPU: RTX A5000 / CPU: 40</option>
<option value="host3">host3 - GPU: RTX A5000 / CPU: 40</option>
<option value="host4">host4 - GPU: RTX 3090 / CPU: 48</option>
<option value="host5">host5 - GPU: RTX 2080 / CPU: 48</option>
<option value="host6">host6 - GPU: RTX 2080 Super / CPU: 40</option>
</select>
</div>
<label for="cpu_limit">Nombre de CPUs:</label>
<select class="form-control" name="cpu_limit" id="cpu_limit">
{cpu_options}
</select>
<label for="memory_limit">Mémoire RAM (GiB):</label>
<select class="form-control" name="memory_limit" id="memory_limit">
{memory_options}
</select>
"""
        # Populate the CPU options
        cpu_select = "\n".join(f"<option>{cpu}</option>" for cpu in cpu_options)
        # Populate the memory options
        memory_select = "\n".join(f"<option>{memory}</option>" for memory in memory_options)
        # Render the final form
        return form_template.format(cpu_options=cpu_select, memory_options=memory_select)

    def options_from_form(self, formdata):
        """Parse the submitted spawn form and apply the choices to this spawner.

        Besides returning the parsed options, this sets ``default_url``,
        ``image``, ``extra_placement_spec``, ``worker_hostname``,
        ``mem_limit`` and ``cpu_limit`` on the spawner.

        Args:
            formdata: mapping of form field name to the list of submitted
                values for that field.

        Returns:
            dict with the keys ``'stack'`` (image name), ``'hostname'``,
            ``'mem_limit'`` (e.g. ``"8G"``) and ``'cpu_limit'`` (int).

        Raises:
            ValueError: if the CPU count is not an integer or the memory
                size is not a number.
        """

        def first(field):
            # First submitted value of a field, stripped; "" when absent.
            values = formdata.get(field) or ['']
            return values[0].strip()

        raw_cpu = first('cpu_limit')
        try:
            cpu_limit = int(raw_cpu)
        except ValueError:
            raise ValueError(f"Invalid number of CPUs: {raw_cpu!r}") from None

        raw_memory = first('memory_limit')
        try:
            float(raw_memory)
        except ValueError:
            raise ValueError(f"Invalid memory size: {raw_memory!r}") from None

        # Extract selected Docker image, node, and whether to launch JupyterLab or simple notebook
        # NOTE(review): 'stack' and 'hostname' come straight from the request
        # and are not checked against the choices offered by the form, so a
        # crafted POST can select any image or node. Consider validating them.
        options = {
            'stack': first('stack'),
            'hostname': first('hostname'),
            'mem_limit': raw_memory + "G",
            'cpu_limit': cpu_limit,
        }

        self.default_url = '/lab' if first('args') == "lab" else '/tree'
        self.log.info(f"Selected image: {options['stack']} on machine: {options['hostname']}")
        self.image = options['stack']
        # Pin the service to the node chosen by the user.
        self.extra_placement_spec = {'constraints': ['node.hostname==' + options['hostname']]}
        self.worker_hostname = options['hostname']
        ## TODO: check availability of resources
        self.mem_limit = options['mem_limit']
        self.cpu_limit = options['cpu_limit']
        return options
def pre_spawn_custom_hook(SwarmSpawner):
    """Prepare a spawner just before its single-user image is launched.

    * Retrieve the user's uid and gid from the LDAP server.
    * Export them to the container as NB_UID / NB_GID, together with
      NB_USER and GRANT_SUDO.
    * Bind-mount /home/<worker hostname>/<user> into /home/<user>, run the
      container as user 0 and use /home/<user> as the notebook directory.

    Args:
        SwarmSpawner: the spawner instance being started. The parameter name
            shadows the imported ``SwarmSpawner`` class inside this function;
            it is kept unchanged for compatibility.

    Raises:
        LookupError: if the LDAP search returns no posixAccount entry for
            the user.
    """
    from ldap3.utils.conv import escape_filter_chars

    spawner = SwarmSpawner
    username = spawner.user.name

    server = Server("10.115.31.3", get_info=ALL)
    conn = Connection(server, auto_bind=True)
    try:
        # Escape the name so that it cannot alter the structure of the filter.
        conn.search(
            'ou=Users,dc=XX,dc=XX,dc=XX',
            f'(&(objectclass=posixAccount)(uid={escape_filter_chars(username)}))',
            attributes=['cn', 'givenName', 'uidNumber', 'gidNumber'],
        )
        if not conn.entries:
            raise LookupError(f"No posixAccount LDAP entry found for user {username!r}")
        entry = conn.entries[0]
        uid_number = entry.uidNumber.value
        gid_number = entry.gidNumber.value
    finally:
        # Always release the LDAP connection, even when the lookup fails.
        conn.unbind()

    spawner.environment = {
        'GRANT_SUDO': 'yes',
        'NB_USER': username,
        'NB_UID': uid_number,
        'NB_GID': gid_number,
    }

    worker = spawner.get_worker_hostname()
    container_home = f'/home/{username}'
    mounts = [
        Mount(
            type='bind',
            source=os.path.join('/home', worker, username),
            target=container_home,
            read_only=False,
        )
    ]
    spawner.extra_container_spec = {
        'mounts': mounts,
        'hostname': worker,
        'user': '0',
    }
    spawner.notebook_dir = container_home
# JupyterHub Configuration
# `get_config` is provided by JupyterHub when it loads this file.
c = get_config()
# JupyterHub settings
c.JupyterHub.port = 8000
c.JupyterHub.hub_ip = 'jupyterhub'
c.JupyterHub.hub_connect_ip = 'jupyterhub'
c.JupyterHub.hub_port = 8081
c.JupyterHub.cleanup_servers = False
c.JupyterHub.allow_named_servers = True
c.JupyterHub.admin_access = True
c.JupyterHub.spawner_class = CustomSwarmSpawner
# Authenticator settings
c.JupyterHub.authenticator_class = 'ldapauthenticator.LDAPAuthenticator'
c.Authenticator.admin_users = {'cretin'}
c.LDAPAuthenticator.server_port = 389
c.LDAPAuthenticator.server_address = 'X.X.X.X'
c.LDAPAuthenticator.lookup_dn = True
c.LDAPAuthenticator.use_ssl = False
c.LDAPAuthenticator.user_search_base = 'ou=Users,dc=XX,dc=XX,dc=XX'
c.LDAPAuthenticator.user_attribute = 'uid'
c.LDAPAuthenticator.lookup_dn_user_dn_attribute = 'uid'
# DockerSpawner settings
c.SwarmSpawner.pull_policy = 'ifnotpresent'
c.SwarmSpawner.remove_containers = True
c.SwarmSpawner.http_timeout = 300
c.SwarmSpawner.start_timeout = 300
c.SwarmSpawner.pre_spawn_hook = pre_spawn_custom_hook
# Logs
c.SwarmSpawner.debug = True
c.DockerSpawner.debug = True
# Network settings
network_name = os.environ['DOCKER_NETWORK_NAME']
c.SwarmSpawner.network_name = network_name
# Proxy settings
c.ConfigurableHTTPProxy.should_start = True
# Host config: network mode and device requests for GPU.
# Both keys live in ONE dict: assigning extra_host_config a second time would
# replace the whole value and silently drop 'network_mode'.
# The DeviceRequest below is the API equivalent of `docker run --gpus all`;
# "--gpus" is a CLI flag, not a driver option, so it is not passed in `options`.
# NOTE(review): SwarmSpawner starts Swarm *services*, and extra_host_config is
# presumably only applied to plain containers (DockerSpawner) -- confirm in the
# dockerspawner docs. If so, this request never reaches the service, which
# would explain why the GPUs are not visible. With Swarm, GPU access is usually
# set up on the worker nodes instead (make `nvidia` the default runtime in
# /etc/docker/daemon.json, optionally advertising the GPUs as generic
# resources requested through `extra_resources_spec`).
c.SwarmSpawner.extra_host_config = {
    'network_mode': network_name,
    "device_requests": [
        docker.types.DeviceRequest(
            count=-1,
            capabilities=[["gpu"]],
            driver="nvidia",
        ),
    ],
}
# Monitoring settings
c.ResourceUseDisplay.track_cpu_percent = True
c.JupyterHub.authenticate_prometheus = False
# Timeout settings
# NOTE(review): c.SwarmSpawner.start_timeout is also set (to 300) above; the
# more specific class setting presumably takes precedence for the spawner in
# use, making this value ineffective -- confirm which one is intended.
c.Spawner.start_timeout = 90