I am trying to deploy JupyterHub single-user Docker containers with Docker Swarm.
The swarm leader node is a web server on which I have an apache2 server running. The JupyterHub web interface runs on this manager node behind a reverse proxy. To launch it I run:
$ docker compose up -d --build
My swarm is composed of several different machines.
Here are my config files:
Dockerfile
# base image: jupyterhub
# this is built by docker-compose from the root of this repo
FROM jupyterhub/jupyterhub:4.0.2
ENV JUPYTERHUB_SINGLEUSER_EXTENSION=0
# install dockerspawner (SwarmSpawner) and the LDAP authenticator
# (fixed: the old comment mentioned dummy/ssh authenticators that are not installed)
RUN python3 -m pip install --upgrade pip && \
    pip install --no-cache dockerspawner jupyterhub-ldapauthenticator
# load example configuration (COPY preferred over ADD for plain local files)
COPY jupyterhub_config.py /srv/jupyterhub/jupyterhub_config.py
docker-compose.yaml
version: "3"
services:
  proxy:
    env_file: .env
    image: jupyterhub/configurable-http-proxy:4
    networks:
      - jupyterhub-net
    # expose the proxy to the world
    ports:
      - "8000:8000"
    command:
      - configurable-http-proxy
      - "--error-target"
      - "http://hub/hub/error"
  hub:
    # build an image with SwarmSpawner and our jupyterhub_config.py
    env_file: .env
    build:
      context: "./"
      dockerfile: "Dockerfile"
    # mount the docker socket so the hub can drive the swarm API
    volumes:
      - "/var/run/docker.sock:/var/run/docker.sock"
      - "/etc/hostname:/etc/hostname"
      - "./jupyterhub_config.py:/srv/jupyterhub/jupyterhub_config.py"
    networks:
      - jupyterhub-net
    # Ensure that we execute on a Swarm manager
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == manager
networks:
  jupyterhub-net:
    driver: overlay
    attachable: true
jupyterhub_config.py
import os
import pwd
import socket
from shutil import chown

import docker
from docker.types import Mount
from dockerspawner import SwarmSpawner
def get_username(spawner):
    """Return the name of the JupyterHub user that owns *spawner* as a plain string."""
    username = spawner.user.name
    return str(username)
def get_uid(spawner):
    """Return the numeric UID of the spawner's user, or -1 if the user is unknown.

    Uses the standard-library ``pwd`` module (which goes through NSS, like
    ``getent``) instead of the original ``os.popen('getent passwd ' + username)``
    call, which was vulnerable to shell injection through the username and
    fragile to parse.

    NOTE(review): this runs inside the hub container, so the lookup only
    succeeds if the hub container resolves users (e.g. LDAP via nsswitch) --
    confirm, otherwise this always returns -1.
    """
    try:
        return pwd.getpwnam(spawner.user.name).pw_uid
    except (KeyError, TypeError, AttributeError):
        # User not found (or malformed spawner): keep the original -1 sentinel.
        return -1
def get_gid(spawner):
    """Return the numeric primary GID of the spawner's user, or -1 if unknown.

    Uses ``pwd.getpwnam`` (NSS-backed, like ``getent``) instead of the original
    ``os.popen('getent passwd ' + username)`` shell call, which was vulnerable
    to shell injection through the username and fragile to parse.

    NOTE(review): runs inside the hub container -- the user must be resolvable
    there (e.g. LDAP via nsswitch) or this always returns -1; verify.
    """
    try:
        return pwd.getpwnam(spawner.user.name).pw_gid
    except (KeyError, TypeError, AttributeError):
        # User not found (or malformed spawner): keep the original -1 sentinel.
        return -1
class CustomImageFormSpawner(SwarmSpawner):
    """SwarmSpawner that lets the user pick a notebook image and a target node.

    The spawn-page options form offers a fixed list of images and swarm node
    hostnames; the selection sets the service image and pins the service to
    the chosen node via a placement constraint.
    """

    # Hostname of the swarm node selected on the spawn form (set in
    # options_from_form, read back by the pre-spawn hook via get_hostname).
    hostname = ""

    def get_hostname(self):
        """Return the node hostname chosen by the user on the spawn form."""
        return self.hostname

    @property
    def mounts(self):
        """Translate DockerSpawner-style volume_binds into swarm Mount objects.

        (Fixed: the original assigned ``self.mount_driver_config`` to an unused
        local; ``driver_config`` is deliberately None for plain bind mounts.)
        """
        if not self.volume_binds:
            return []
        return [
            Mount(
                target=vol["bind"],
                source=host_loc,
                type="bind",
                read_only=False,
                driver_config=None,
            )
            for host_loc, vol in self.volume_binds.items()
        ]

    def _options_form_default(self):
        # HTML spawn form: image and node selection.
        # (Fixed: the original called .format(stack=...) on a template with no
        # {stack} placeholder -- a no-op -- and left default_stack unused.)
        return """
<div class="form-group">
<label for="stack">Select your desired stack image</label>
<select name="stack" size="1">
<option value="cschranz/gpu-jupyter:v1.6_cuda-12.0_ubuntu-22.04">Tensorflow/Pytorch GPU</option>
<option value="quay.io/jupyter/datascience-notebook">Datascience</option>
<option value="quay.io/jupyter/r-notebook">R</option>
</select>
</div>
<div class="form-group">
<label for="hostname">Select the machine on which you want to compute</label>
<select name="hostname" size="1">
<option value="auber">Auber - GPU: RTX A5000 / CPU: 40</option>
<option value="chili">Chili - GPU: RTX A5000 / CPU: 40</option>
<option value="grenade">Grenade - GPU: RTX A5000 / CPU: 40</option>
<option value="voltaire">Voltaire - GPU: RTX 3090 / CPU: 48</option>
<option value="wasabi">Wasabi - GPU: RTX 2080 Super / CPU: 40</option>
<option value="nation">Nation - GPU: RTX 2080 / CPU: 48</option>
<option value="segur">Segur - GPU: RTX 2080 / CPU: 40</option>
<option value="vanille">Vanille - GPU: GTX 1080 / CPU: 12</option>
<option value="jasmin">Jasmin - GPU: GTX 1080 / CPU: 32</option>
<option value="louvre">Louvre - GPU: GTX 1080 / CPU: 32</option>
</select>
</div>
"""

    def options_from_form(self, formdata):
        """Read the selected image and node from the form and configure the spawn.

        formdata values are lists of strings (one entry per form field).
        """
        options = {
            'stack': formdata['stack'],
            'hostname': formdata['hostname'],
        }
        container_image = ''.join(formdata['stack'])
        hostname = ''.join(formdata['hostname'])
        # Use the spawner logger instead of print so the message lands in the
        # hub log with the other spawn diagnostics.
        self.log.info("SPAWN: %s IMAGE", container_image)
        self.image = container_image
        # Pin the single-user service to the node the user selected.
        self.extra_placement_spec = {'constraints': ['node.hostname==' + hostname]}
        self.hostname = hostname
        return options
def create_dir_hook(spawner):
    """Pre-spawn hook: pass the user's uid/gid to the container and bind-mount
    the user's home directory from the selected node.

    Runs on the hub just before the single-user service is created.

    Bug fix: the original set the ``NB_USER`` environment variable twice --
    first to the uid, then overwriting it with the gid -- which produced the
    ``NB_USER="-1"`` warning in the single-user logs. The jupyter/docker-stacks
    start scripts expect ``NB_UID`` and ``NB_GID`` (as strings) for this.
    (The parameter was also renamed: it previously shadowed the SwarmSpawner
    class name, which was confusing.)
    """
    spawner.log.debug('RUNNING PRE-SPAWN HOOK')
    spawner.environment['NB_UID'] = str(get_uid(spawner))
    spawner.environment['NB_GID'] = str(get_gid(spawner))
    mounts = [
        # Expose the node's hostname inside the container.
        Mount(type='bind',
              source='/etc/hostname',
              target='/etc/hostname'),
        # Mount the user's home directory on the selected node into the
        # notebook working directory.
        Mount(type='bind',
              source=os.path.join('/home', spawner.get_hostname(), spawner.user.name),
              target='/home/jovyan/work',
              read_only=False),
    ]
    spawner.extra_container_spec = {'mounts': mounts}
# --- Spawner / proxy wiring --------------------------------------------------
# Pull the image only if it is not already present on the target node.
c.DockerSpawner.pull_policy = 'ifnotpresent'
# The proxy runs as a separate compose service; the hub must not start its own.
c.ConfigurableHTTPProxy.should_start = False
c.ConfigurableHTTPProxy.api_url = 'http://proxy:8001'
c.JupyterHub.spawner_class = CustomImageFormSpawner
# NOTE(review): single-user services are attached to this network by name.
# `docker stack deploy -c deployment.yml swarm` prefixes networks with the
# STACK name, creating "swarm_jupyterhub-net", not
# "jupyterhub-swarm_jupyterhub-net". If this string does not match the overlay
# network the hub service is actually on, spawned containers end up on a
# different network and cannot resolve "hub" -- which matches the
# `socket.gaierror: Name or service not known` in the single-user logs.
# Verify with `docker network ls` on the manager.
network_name = "jupyterhub-swarm_jupyterhub-net"
c.SwarmSpawner.network_name = network_name
c.SwarmSpawner.extra_host_config = {'network_mode': network_name}
# --- Admin / lifecycle -------------------------------------------------------
c.Authenticator.admin_users = {'cretin'}
c.JupyterHub.admin_access = True
# Remove stopped single-user containers instead of keeping them around.
c.SwarmSpawner.remove_containers = True
c.SwarmSpawner.debug = True
# --- Hub networking ----------------------------------------------------------
# Listen on all interfaces inside the hub container; single-user servers reach
# the hub API at http://hub:8000 (hub_connect_ip is the service name).
c.JupyterHub.hub_ip = '0.0.0.0'
c.JupyterHub.hub_connect_ip = 'hub'
c.JupyterHub.hub_port = 8000
#c.JupyterHub.bind_url = 'https://127.0.0.1:8000'
# --- Authentication (LDAP) ---------------------------------------------------
c.JupyterHub.authenticator_class = 'ldapauthenticator.LDAPAuthenticator'
c.LDAPAuthenticator.server_port = 389
c.LDAPAuthenticator.server_address = '10.115.31.3'
c.LDAPAuthenticator.lookup_dn = True
c.LDAPAuthenticator.use_ssl = False
c.LDAPAuthenticator.user_search_base = 'ou=Users,dc=dsimb,dc=inserm,dc=fr'
c.LDAPAuthenticator.user_attribute = 'uid'
c.LDAPAuthenticator.lookup_dn_user_dn_attribute = 'uid'
#c.LDAPAuthenticator.bind_dn_template = 'uid={username},dc=dsimb,dc=inserm,dc=fr'
# --- Resource limits ---------------------------------------------------------
c.SwarmSpawner.mem_limit = "5G"
c.SwarmSpawner.cpu_limit = 8.0
c.JupyterHub.allow_named_servers = True
# Redirect to JupyterLab, instead of the plain Jupyter notebook
c.Spawner.default_url = '/lab'
# Explicitly set notebook directory because we'll be mounting a host volume to
# it. Most jupyter/docker-stacks *-notebook images run the Notebook server as
# user `jovyan`, and set the notebook directory to `/home/jovyan/work`.
# We follow the same convention.
notebook_dir = os.environ.get('DOCKER_NOTEBOOK_DIR') or '/home/jovyan/work'
c.DockerSpawner.notebook_dir = notebook_dir
# Mount the real user's Docker volume on the host to the notebook user's
# notebook directory in the container
c.DockerSpawner.volumes = {
'jupyterhub-user-{username}': { 'bind': notebook_dir, 'mode': 'rw'},
}
# Bind mount the hostname and home of host to docker
# before the image is actually deployed by swarm
c.SwarmSpawner.pre_spawn_hook = create_dir_hook
# Timeout
c.Spawner.start_timeout = 90
# increase launch timeout because initial image pulls can take a while
c.SwarmSpawner.spawn_timeout = 60
# Single-user server timeouts.
# NOTE(review): this start_timeout overrides the c.Spawner.start_timeout = 90
# set above for SwarmSpawner; the previous "# TLS config" label was wrong.
c.SwarmSpawner.http_timeout = 300
c.SwarmSpawner.start_timeout = 300
# Monitoring
#To enable the 'jupyter-resource-usage' to track CPU usage and report a cpu_percent value as part of the /api/metrics/v1 response.
c.ResourceUseDisplay.track_cpu_percent = True
c.JupyterHub.authenticate_prometheus = False
Then I deploy to the swarm using this command:
$ docker stack deploy -c deployment.yml swarm
with this `deployment.yaml` (note: the command above references `deployment.yml` — the names should match):
version: "3"
services:
  hub:
    # image with SwarmSpawner and our jupyterhub_config.py
    env_file: .env
    image: jupyterhub/jupyterhub
    volumes:
      # mount the docker socket so the hub can drive the swarm API
      - "/var/run/docker.sock:/var/run/docker.sock"
      - "./jupyterhub_config.py:/srv/jupyterhub/jupyterhub_config.py"
      - "/etc/hostname:/etc/hostname"
    networks:
      - jupyterhub-net
    ports:
      - "8000:8000"
networks:
  # NOTE: `docker stack deploy -c deployment.yml swarm` creates this network
  # as "swarm_jupyterhub-net" (stack-name prefix); the network_name used in
  # jupyterhub_config.py must match that exact name.
  jupyterhub-net:
    driver: overlay
    attachable: true
From the web interface I am able to connect with my ldap credentials, I’m recognized as admin. When I try to spawn an image, I get the following error on the selected node:
root@chili:~# docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
5fd3bcc6eae0 quay.io/jupyter/datascience-notebook:latest "tini -g -- start.sh…" 6 seconds ago Up 4 seconds (health: starting) 8888/tcp jupyter-cretin.1.wgwh8azahd7pfhmkrd9nl19ru
root@chili:~# docker logs -f 5fd3bcc6eae0
Entered start.sh with args: start-notebook.py
Running hooks in: /usr/local/bin/start-notebook.d as uid: 1000 gid: 100
Done running hooks in: /usr/local/bin/start-notebook.d
WARNING: container must be started as root to change the desired user's name with NB_USER="-1"!
Running hooks in: /usr/local/bin/before-notebook.d as uid: 1000 gid: 100
Sourcing shell script: /usr/local/bin/before-notebook.d/10activate-conda-env.sh
Done running hooks in: /usr/local/bin/before-notebook.d
Executing the command: start-notebook.py
[I 2024-02-19 01:13:59.785 ServerApp] Extension package jupyterlab took 0.1108s to import
[W 2024-02-19 01:14:00.876 ServerApp] A `_jupyter_server_extension_points` function was not found in nbclassic. Instead, a `_jupyter_server_extension_paths` function was found and will be used for now. This function name will be deprecated in future releases of Jupyter Server.
[W 2024-02-19 01:14:00.881 ServerApp] A `_jupyter_server_extension_points` function was not found in notebook_shim. Instead, a `_jupyter_server_extension_paths` function was found and will be used for now. This function name will be deprecated in future releases of Jupyter Server.
[I 2024-02-19 01:14:00.881 ServerApp] jupyter_lsp | extension was successfully linked.
[I 2024-02-19 01:14:00.888 ServerApp] jupyter_server_mathjax | extension was successfully linked.
[I 2024-02-19 01:14:00.888 ServerApp] jupyter_server_proxy | extension was successfully linked.
[I 2024-02-19 01:14:00.893 ServerApp] jupyter_server_terminals | extension was successfully linked.
[I 2024-02-19 01:14:00.894 JupyterHubSingleUser] Starting jupyterhub single-user server extension version 4.0.2
[I 2024-02-19 01:14:00.894 JupyterHubSingleUser] Using default url from environment $JUPYTERHUB_DEFAULT_URL: /lab
[I 2024-02-19 01:14:00.898 ServerApp] jupyterhub | extension was successfully linked.
[W 2024-02-19 01:14:00.900 LabApp] 'extra_template_paths' was found in both NotebookApp and ServerApp. This is likely a recent change. This config will only be set in NotebookApp. Please check if you should also config these traits in ServerApp for your purpose.
[I 2024-02-19 01:14:00.907 ServerApp] jupyterlab | extension was successfully linked.
[I 2024-02-19 01:14:00.907 ServerApp] jupyterlab_git | extension was successfully linked.
[W 2024-02-19 01:14:00.910 NotebookApp] 'extra_template_paths' was found in both NotebookApp and ServerApp. This is likely a recent change. This config will only be set in NotebookApp. Please check if you should also config these traits in ServerApp for your purpose.
[I 2024-02-19 01:14:00.913 ServerApp] nbclassic | extension was successfully linked.
[I 2024-02-19 01:14:00.913 ServerApp] nbdime | extension was successfully linked.
[W 2024-02-19 01:14:00.914 JupyterNotebookApp] 'extra_template_paths' was found in both NotebookApp and ServerApp. This is likely a recent change. This config will only be set in NotebookApp. Please check if you should also config these traits in ServerApp for your purpose.
[I 2024-02-19 01:14:00.919 ServerApp] notebook | extension was successfully linked.
[I 2024-02-19 01:14:00.921 ServerApp] Writing Jupyter server cookie secret to /home/jovyan/.local/share/jupyter/runtime/jupyter_cookie_secret
[I 2024-02-19 01:14:01.345 ServerApp] notebook_shim | extension was successfully linked.
[I 2024-02-19 01:14:01.374 ServerApp] notebook_shim | extension was successfully loaded.
[I 2024-02-19 01:14:01.376 ServerApp] jupyter_lsp | extension was successfully loaded.
[I 2024-02-19 01:14:01.377 ServerApp] jupyter_server_mathjax | extension was successfully loaded.
[I 2024-02-19 01:14:01.393 ServerApp] jupyter_server_proxy | extension was successfully loaded.
[I 2024-02-19 01:14:01.395 ServerApp] jupyter_server_terminals | extension was successfully loaded.
[E 2024-02-19 01:14:01.405 JupyterHubSingleUser] Failed to connect to my Hub at http://hub:8000/hub/api (attempt 1/5). Is it running?
Traceback (most recent call last):
File "/opt/conda/lib/python3.11/site-packages/jupyterhub/singleuser/extension.py", line 336, in check_hub_version
resp = await client.fetch(self.hub_auth.api_url)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/tornado/simple_httpclient.py", line 340, in run
stream = await self.tcp_client.connect(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/tornado/tcpclient.py", line 269, in connect
addrinfo = await self.resolver.resolve(host, port, af)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/tornado/netutil.py", line 433, in resolve
for fam, _, _, _, address in await asyncio.get_running_loop().getaddrinfo(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/asyncio/base_events.py", line 867, in getaddrinfo
return await self.run_in_executor(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/socket.py", line 962, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
socket.gaierror: [Errno -2] Name or service not known
[I 2024-02-19 01:14:02.955 ServerApp] Skipped non-installed server(s): bash-language-server, dockerfile-language-server-nodejs, javascript-typescript-langserver, jedi-language-server, julia-language-server, pyright, python-language-server, python-lsp-server, r-languageserver, sql-language-server, texlab, typescript-language-server, unified-language-server, vscode-css-languageserver-bin, vscode-html-languageserver-bin, vscode-json-languageserver-bin, yaml-language-server
[E 2024-02-19 01:14:03.415 JupyterHubSingleUser] Failed to connect to my Hub at http://hub:8000/hub/api (attempt 2/5). Is it running?
Traceback (most recent call last):
File "/opt/conda/lib/python3.11/site-packages/jupyterhub/singleuser/extension.py", line 336, in check_hub_version
resp = await client.fetch(self.hub_auth.api_url)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/tornado/simple_httpclient.py", line 340, in run
stream = await self.tcp_client.connect(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/tornado/tcpclient.py", line 269, in connect
addrinfo = await self.resolver.resolve(host, port, af)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/tornado/netutil.py", line 433, in resolve
for fam, _, _, _, address in await asyncio.get_running_loop().getaddrinfo(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/asyncio/base_events.py", line 867, in getaddrinfo
return await self.run_in_executor(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/socket.py", line 962, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
socket.gaierror: [Errno -2] Name or service not known
[E 2024-02-19 01:14:07.425 JupyterHubSingleUser] Failed to connect to my Hub at http://hub:8000/hub/api (attempt 3/5). Is it running?
Traceback (most recent call last):
File "/opt/conda/lib/python3.11/site-packages/jupyterhub/singleuser/extension.py", line 336, in check_hub_version
resp = await client.fetch(self.hub_auth.api_url)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/tornado/simple_httpclient.py", line 340, in run
stream = await self.tcp_client.connect(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/tornado/tcpclient.py", line 269, in connect
addrinfo = await self.resolver.resolve(host, port, af)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/tornado/netutil.py", line 433, in resolve
for fam, _, _, _, address in await asyncio.get_running_loop().getaddrinfo(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/asyncio/base_events.py", line 867, in getaddrinfo
return await self.run_in_executor(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/socket.py", line 962, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
socket.gaierror: [Errno -2] Name or service not known
[C 2024-02-19 01:14:10.129 ServerApp] received signal 15, stopping
[I 2024-02-19 01:14:10.129 ServerApp] Shutting down 11 extensions
[E 2024-02-19 01:14:10.130 JupyterHubSingleUser] Failed to load JupyterHubSingleUser server extension
Traceback (most recent call last):
File "/opt/conda/lib/python3.11/site-packages/jupyterhub/singleuser/extension.py", line 274, in wrapped
r = f(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/jupyterhub/singleuser/extension.py", line 633, in initialize
app.io_loop.run_sync(self.check_hub_version)
File "/opt/conda/lib/python3.11/site-packages/tornado/ioloop.py", line 526, in run_sync
raise TimeoutError("Operation timed out after %s seconds" % timeout)
TimeoutError: Operation timed out after None seconds
The network looks fine to me, but for some reason the node is not able to resolve or reach the hub.
I am unsure whether I am deploying everything correctly, though.