Hello,
I’m trying to set up an HPC-enabled JupyterHub to launch notebooks on our compute and GPU nodes, with the hub service running on the login node. So far, everything seems to be working: I have JupyterHub installed in a conda environment (eventually this will migrate to a systemd unit), and I can start local notebooks via the LocalProcessSpawner.
I can also start notebooks via the SlurmSpawner, and my batch scheduler reacts, starts a job, and as far as I can see, the logs from the Jupyterhub also recognise the correct node and start (trying?) to connect to it. However, it seems as if the way back from the compute node to the login node where the jupyterhub is listening is running into some problems.
Here’s my spawner as configured in jupyterhub_config.py
:
def dict_exclude_keys(d: dict, excludes: list[str]) -> dict:
    """Return a shallow copy of ``d`` without the keys listed in ``excludes``.

    The original built ``set(list(d.keys())) - set(excludes)`` and then
    re-checked ``k in d``; a single dict comprehension is equivalent,
    skips the redundant membership test, and preserves insertion order.
    """
    skip = set(excludes)  # O(1) membership tests, hoisted out of the loop
    return {k: v for k, v in d.items() if k not in skip}
class AWIProfilesSpawner(wrapspawner.ProfilesSpawner):
    """ProfilesSpawner variant that also asks for a SLURM computing account.

    Renders a profile drop-down plus an account drop-down populated from
    ``get_computing_accounts(user.name)``, and forwards every non-profile
    form option into the child spawner's configuration.
    """

    form_template = Unicode(
        # FIX: the second <select> was previously closed by a stray </form>
        # instead of </select>, producing malformed HTML.
        """<label for="profile">Select a job profile:</label>
<select class="form-control" name="profile" required autofocus>
{% for profile in temp_keys %}
<option value="{{ profile.key }}" {{ profile.first }}> {{ profile.display }}</option>
{% endfor %}
</select>
<label for="computing_account"> Select your computing account: </label>
<select class="form-control" name="computing_account" required>
{% for account in allowed_accounts %}
<option value="{{ account }}"> {{ account }}</option>
{% endfor %}
</select>
<br>
"""
    )

    def construct_child(self):
        """Instantiate the selected child spawner, passing extra options through.

        Everything the user submitted except the ``profile`` key itself is
        merged into ``child_config`` (e.g. ``computing_account``).
        """
        self.child_profile = self.user_options.get("profile", "")
        self.select_profile(self.child_profile)
        extra_options = dict_exclude_keys(self.user_options, ["profile"])
        self.child_config.update(**extra_options)
        return super().construct_child()

    def _options_form_default(self):
        """Render the spawn form with available profiles and SLURM accounts."""
        self._assign_computing_accounts_to_user()
        environment = jinja2.Environment()
        template = environment.from_string(self.form_template)
        temp_keys = [
            dict(display=p[0], key=p[1], type=p[2], first="") for p in self.profiles
        ]
        # Pre-select the first profile in the drop-down.
        temp_keys[0]["first"] = self.first_template
        return template.render(
            temp_keys=temp_keys,
            allowed_accounts=self.user_options["computing_accounts"],
        )

    def _assign_computing_accounts_to_user(self):
        """Cache the SLURM accounts this user is allowed to submit to."""
        self.user_options["computing_accounts"] = get_computing_accounts(self.user.name)

    def options_from_form(self, formdata):
        """Extract profile and computing account from the submitted form data.

        Form values arrive as lists, so defaults must be one-element lists:
        the previous ``""`` default raised IndexError (``""[0]``) whenever
        ``computing_account`` was missing from the submission.
        """
        return dict(
            profile=formdata.get("profile", [self.profiles[0][1]])[0],
            computing_account=formdata.get("computing_account", [""])[0],
        )
c.JupyterHub.spawner_class = AWIProfilesSpawner
# NOTE(PG): Not needed, seems to be able to get computing accounts during rendering of the template
# def userdata_hook(spawner, auth_state):
# breakpoint()
# spawner.userdata = auth_state
# c.Spawner.auth_state_hook = userdata_hook
# PG: QUICK JUMP
# For later: sacctmgr show user withassoc format=account%-25 where $USER
class AWILocalProcessSpawner(jupyterhub.spawner.LocalProcessSpawner):
    """Plain local-process spawner for notebooks running on the login node.

    Currently identical to ``LocalProcessSpawner``; exists as a named hook
    for site-specific customization.
    """
class AWIComputingAccountSlurmSpawner(batchspawner.SlurmSpawner):
    """SlurmSpawner that injects the user-selected computing account.

    The batch script adds ``#SBATCH --account={{computing_account}}`` on top
    of the stock batchspawner template and boots the single-user server from
    the ``jupyterhub`` conda environment.
    """

    # FIXME(PG): Replace computing.computing in account:
    batch_script = Unicode(
        """#!/bin/bash
#SBATCH --output={{homedir}}/jupyterhub_slurmspawner_%j.log
#SBATCH --job-name=spawner-jupyterhub
#SBATCH --chdir={{homedir}}
#SBATCH --export={{keepvars}}
#SBATCH --get-user-env=L
#SBATCH --account={{computing_account}}
{% if partition %}#SBATCH --partition={{partition}}{% endif %}
{% if runtime %}#SBATCH --time={{runtime}}{% endif %}
{% if memory %}#SBATCH --mem={{memory}}{% endif %}
{% if gres %}#SBATCH --gres={{gres}}{% endif %}
{% if nprocs %}#SBATCH --cpus-per-task={{nprocs}}{% endif %}
{% if reservation%}#SBATCH --reservation={{reservation}}{% endif %}
{% if options %}#SBATCH {{options}}{% endif %}
set -euo pipefail
trap 'echo SIGTERM received' TERM
module load conda
eval "$(conda shell.bash hook)"
conda activate jupyterhub
which jupyterhub-singleuser
{% if srun %}{{srun}} {% endif %}{{cmd}} --debug
echo "jupyterhub-singleuser ended gracefully"
{{epilogue}}
"""
    ).tag(config=True)

    def state_gethost(self):
        """Extract the execution host from the saved ``job_status`` string.

        Returns the first captured group of ``state_exechost_re``, optionally
        expanded through ``state_exechost_exp``, or ``None`` when the job
        status does not (yet) contain a host.
        """
        assert self.state_exechost_re, "Misconfigured: define state_exechost_re"
        match = re.search(self.state_exechost_re, self.job_status)
        if not match:
            # Lazy %-formatting instead of eager string concatenation.
            self.log.error(
                "Spawner unable to match host addr in job status: %s",
                self.job_status,
            )
            return None  # explicit: caller must handle "no host yet"
        if not self.state_exechost_exp:
            host = match.group(1)
            self.log.warning(f"Looking for Jupyter running on {host}.")
            return host
        self.log.warning(
            f"Looking for Jupyter running on {self.state_exechost_exp}"
        )
        return match.expand(self.state_exechost_exp)
class AWIComputeNodeSlurmSpawner(AWIComputingAccountSlurmSpawner):
    """Spawner profile for standard (CPU) compute nodes; no overrides yet."""
class AWIGPUNodeSlurmSpawner(AWIComputingAccountSlurmSpawner):
    """Spawner profile for GPU nodes; partition is set via the profile config."""
# Each profile is (display name, form key, spawner class, spawner config).
# NOTE(review): "ip" here is the address the single-user server binds/reports;
# "albedo0" appears to be the login node — confirm this is reachable from the
# compute nodes, otherwise the hub cannot connect back.
c.ProfilesSpawner.profiles = [
    (
        "Albedo 0 (Login Node)",
        "local",
        AWILocalProcessSpawner,
        {
            # "timeout": 60,
        },
    ),
    (
        "Albedo Compute Node",
        "prod",
        AWIComputeNodeSlurmSpawner,
        # {"ip": "0.0.0.0", "timeout": 60, "runtime": "12:00:00"},
        {"ip": "albedo0", "runtime": "12:00:00"},
    ),
    (
        "Albedo GPU Node",
        "gpu",
        AWIGPUNodeSlurmSpawner,
        # {"ip": "0.0.0.0", "timeout": 60, "partition": "gpu", "runtime": "12:00:00"},
        {"ip": "albedo0", "partition": "gpu", "runtime": "12:00:00"},
    ),
]
Here’s the generated SLURM script:
#!/bin/bash
#SBATCH --output=/albedo/home/pgierz/jupyterhub_slurmspawner_%j.log
#SBATCH --job-name=spawner-jupyterhub
#SBATCH --chdir=/albedo/home/pgierz
#SBATCH --export=PATH,CONDA_DEFAULT_ENV,LANG,LC_ALL,JUPYTERHUB_API_TOKEN,JPY_API_TOKEN,JUPYTERHUB_CLIENT_ID,JUPYTERHUB_HOST,JUPYTERHUB_OAUTH_CALLBACK_URL,JUPYTERHUB_OAUTH_SCOPES,JUPYTERHUB_OAUTH_ACCESS_SCOPES,JUPYTERHUB_OAUTH_CLIENT_ALLOWED_SCOPES,JUPYTERHUB_USER,JUPYTERHUB_SERVER_NAME,JUPYTERHUB_API_URL,JUPYTERHUB_ACTIVITY_URL,JUPYTERHUB_BASE_URL,JUPYTERHUB_SERVICE_PREFIX,JUPYTERHUB_SERVICE_URL,USER,HOME,SHELL
#SBATCH --get-user-env=L
#SBATCH --account=computing.computing
set -euo pipefail
trap 'echo SIGTERM received' TERM
module load conda
eval "$(conda shell.bash hook)"
conda activate jupyterhub
which jupyterhub-singleuser
srun batchspawner-singleuser jupyterhub-singleuser --debug
echo "jupyterhub-singleuser ended gracefully"
And here are the relevant versions:
$ jupyterhub --version
3.1.1
$ jupyter --version
Selected Jupyter core packages...
IPython : 8.14.0
ipykernel : 6.20.2
ipywidgets : not installed
jupyter_client : 8.3.0
jupyter_core : 5.3.0
jupyter_server : 2.7.0
jupyterlab : 4.0.2
nbclient : 0.8.0
nbconvert : 7.4.0
nbformat : 5.9.0
notebook : 6.5.4
qtconsole : not installed
traitlets : 5.9.0
Examining the single-user server log in more detail, I stumbled across two problems. At the beginning, something seems to go wrong with yarn:
Module for mambaforge version 22.9.0-2-Linux-x86_64 loaded
/albedo/home/pgierz/spack/opt/spack/linux-rocky8-zen/gcc-8.5.0/miniforge3-4.8.3-4-Linux-x86_64-3jusiy6dnj7t6yxh6zwstnjqg7s7jzig/envs/jupyterhub/bin/jupyterhub-singleuser
/albedo/home/pgierz/spack/opt/spack/linux-rocky8-zen/gcc-8.5.0/miniforge3-4.8.3-4-Linux-x86_64-3jusiy6dnj7t6yxh6zwstnjqg7s7jzig/envs/jupyterhub/lib/python3.11/site-packages/batchspawner/singleuser.py:17: RuntimeWarning: coroutine 'HubAuth._api_request' was never awaited
hub_auth._api_request(
RuntimeWarning: Enable tracemalloc to get the object allocation traceback
Fail to get yarn configuration. /albedo/home/pgierz/spack/opt/spack/linux-rocky8-zen/gcc-8.5.0/miniforge3-4.8.3-4-Linux-x86_64-3jusiy6dnj7t6yxh6zwstnjqg7s7jzig/envs/jupyterhub/lib/python3.11/site-packages/jupyterlab/staging/yarn.js:4
(()=>{var Qge=Object.create;var AS=Object.defineProperty;var bge=Object.getOwnPropertyDescriptor;var Sge=Object.getOwnPropertyNames;var vge=Object.getPrototypeOf,xge=Object.prototype.hasOwnProperty;var J=(r=>typeof require<"u"?require:typeof Proxy<"u"?new Proxy(r,{get:(e,t)=>(typeof require<"u"?require:e)[t]}):r)(function(r){if(typeof require<"u")return require.apply(this,arguments);throw new Error('Dynamic require of "'+r+'" is not supported')});var Pge=(r,e)=>()=>(r&&(e=r(r=0)),e);var w=(r,e)=>()=>(e||r((e={exports:{}}).exports,e),e.exports),ut=(r,e)=>{for(var t in e)AS(r,t,{get:e[t],enumerable:!0})},Dge=(r,e,t,i)=>{if(e&&typeof e=="object"||typeof e=="function")for(let n of Sge(e))!xge.call(r,n)&&n!==t&&AS(r,n,{get:()=>e[n],enumerable:!(i=bge(e,n))||i.enumerable});return r};var Pe=(r,e,t)=>(t=r!=null?Qge(v
SyntaxError: Unexpected token {
at createScript (vm.js:56:10)
at Object.runInThisContext (vm.js:97:10)
at Module._compile (module.js:542:28)
at Object.Module._extensions..js (module.js:579:10)
at Module.load (module.js:487:32)
at tryModuleLoad (module.js:446:12)
at Function.Module._load (module.js:438:3)
at Module.runMain (module.js:604:10)
at run (bootstrap_node.js:389:7)
at startup (bootstrap_node.js:149:9)
Second, it seems as if updates are being sent, but they aren’t arriving…?
[I 2023-06-28 15:40:07.406 SingleUserLabApp serverapp:2801] Jupyter Server 2.7.0 is running at:
[I 2023-06-28 15:40:07.406 SingleUserLabApp serverapp:2801] http://prod-041:35225/user/pgierz/lab?token=e33ba472449b016bb17e1be57388e03e47b3d5f27befa2b6
[I 2023-06-28 15:40:07.406 SingleUserLabApp serverapp:2801] http://127.0.0.1:35225/user/pgierz/lab?token=e33ba472449b016bb17e1be57388e03e47b3d5f27befa2b6
[I 2023-06-28 15:40:07.406 SingleUserLabApp serverapp:2802] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
[C 2023-06-28 15:40:07.481 SingleUserLabApp serverapp:2864]
To access the server, open this file in a browser:
file:///albedo/home/pgierz/.local/share/jupyter/runtime/jpserver-146475-open.html
Or copy and paste one of these URLs:
http://prod-041:35225/user/pgierz/lab?token=e33ba472449b016bb17e1be57388e03e47b3d5f27befa2b6
http://127.0.0.1:35225/user/pgierz/lab?token=e33ba472449b016bb17e1be57388e03e47b3d5f27befa2b6
[I 2023-06-28 15:40:07.482 SingleUserLabApp mixins:591] Updating Hub with activity every 300 seconds
[D 2023-06-28 15:40:07.482 SingleUserLabApp mixins:553] Notifying Hub of activity 2023-06-28T13:40:07.140800Z
Do I need to configure anything with the ip addresses in the jupyterhub_config.py
to get this to work correctly?
Any help would be very greatly appreciated, I’m quite stuck at the moment…