Hello @rob
I did a fresh install of Melissa on G5K as per the documentation, and the binaries were generated under ~/.local/.
I was able to obtain the results for the Sensitivity Analysis example using config_oar.json.
However, when I run the DL example, it produces strange errors regarding the melissa-server binary.
I have pasted all the logs and the config_oar.json files below. Could you please confirm whether the configurations (at the end) are correct? I haven't made any major changes except for the paths and the oarsub-specific arguments.
EDIT:
I was able to run the DL study after removing the GPU request from the oarsub arguments, i.e. with the following option:
"scheduler_arg_server": [
"/nodes=1/core=16,walltime=01:00:00",
Any thoughts on this?
oar.0.err
You can see strange import errors as well as a syntax error (which is most likely coming from a shell script).
+ . /home/apurandare/melissa/melissa_set_env.sh
+ export PATH=/home/apurandare/melissa/install/bin:/home/apurandare/melissa/build/.melissa/bin:/home/apurandare/.guix-profile/bin:/home/apurandare/.guix-profile/bin:/home/apurandare/.vscode-server/bin/019f4d1419fbc8219a181fab7892ebccf7ee29a2/bin/remote-cli:/home/apurandare/.guix-profile/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/grid5000/code/bin:/opt/puppetlabs/bin:/home/apurandare/.local/bin:/home/apurandare/.local/bin
+ export LD_LIBRARY_PATH=/home/apurandare/melissa/install/lib:
+ export LD_LIBRARY_PATH=/lib:/home/apurandare/melissa/install/lib:
+ export MELISSA_INSTALL_PREFIX=/home/apurandare/melissa/install
+ export PYTHONPATH=:/home/apurandare/melissa/install/lib:/home/apurandare/melissa/install/melissa:/home/apurandare/melissa
+ export PATH=/home/apurandare/.local/bin:/home/apurandare/melissa/install/bin:/home/apurandare/melissa/build/.melissa/bin:/home/apurandare/.guix-profile/bin:/home/apurandare/.guix-profile/bin:/home/apurandare/.vscode-server/bin/019f4d1419fbc8219a181fab7892ebccf7ee29a2/bin/remote-cli:/home/apurandare/.guix-profile/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/grid5000/code/bin:/opt/puppetlabs/bin:/home/apurandare/.local/bin:/home/apurandare/.local/bin
+ export PYTHONPATH=/home/apurandare/.local/::/home/apurandare/melissa/install/lib:/home/apurandare/melissa/install/melissa:/home/apurandare/melissa
+ date
+ echo DATE =Fri 08 Mar 2024 10:09:21 AM CET
+ hostname -s
+ echo Hostname =drac-5
+ pwd
+ echo Working directory =/home/apurandare/melissa/examples/heat-pde/heat-pde-dl/STUDY_OUT
+ echo
+ echo /home/apurandare/.local/::/home/apurandare/melissa/install/lib:/home/apurandare/melissa/install/melissa:/home/apurandare/melissa
+ set -e
+ exec melissa-server --project_dir /home/apurandare/melissa/examples/heat-pde/heat-pde-dl/ --config_name config_oar
/home/apurandare/.local/bin/melissa-server: 3: import: not found
/home/apurandare/.local/bin/melissa-server: 4: import: not found
from: too many arguments
/home/apurandare/.local/bin/melissa-server: 7: Syntax error: "(" unexpected (expecting "then")
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:
Process name: [[30255,1],0]
Exit code: 2
--------------------------------------------------------------------------
oarsub.0.sh
#!/bin/sh
#OAR -O ./stdout/oar.0.out
#OAR -E ./stdout/oar.0.err
#OAR --resource /nodes=1/gpu=1/core=1,walltime=01:00:00
exec mpirun -machinefile "$OAR_NODE_FILE" \
-- env MELISSA_LAUNCHER_HOST=fgrenoble MELISSA_LAUNCHER_PORT=40145 MELISSA_LAUNCHER_PROTOCOL=SCTP MELISSA_FAULT_TOLERANCE=OFF MELISSA_RESTART=0 PATH=/home/apurandare/melissa/build/.melissa/bin:/home/apurandare/.guix-profile/bin:/home/apurandare/.guix-profile/bin:/home/apurandare/.vscode-server/bin/019f4d1419fbc8219a181fab7892ebccf7ee29a2/bin/remote-cli:/home/apurandare/.guix-profile/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/grid5000/code/bin:/opt/puppetlabs/bin:/home/apurandare/.local/bin:/home/apurandare/.local/bin /home/apurandare/melissa/examples/heat-pde/heat-pde-dl/STUDY_OUT/server.sh
server.sh
#!/bin/sh
set -x
# Melissa will paste the `preprocessing_instructions`
# the remainder of this file should be left untouched.
# melissa-launcher will find and replace values in
# curly brackets (e.g. /home/apurandare/melissa/melissa_set_env.sh) with
# the proper values.
. /home/apurandare/melissa/melissa_set_env.sh
echo "DATE =$(date)"
echo "Hostname =$(hostname -s)"
echo "Working directory =$(pwd)"
echo ""
echo $PYTHONPATH
set -e
exec melissa-server --project_dir /home/apurandare/melissa/examples/heat-pde/heat-pde-dl/ --config_name config_oar
DL: config_oar.json
// Please make sure that all entries preceded by a comment including
// the "FIXME" keyword are changed before running Melissa with this
// config file
{
// FIXME: the server_filename option switches between the torch (default) and tensorflow (tf_heat_pde_dl.py) servers
"server_filename": "heatpde_dl_server.py",
"server_class": "HeatPDEServerDL",
"output_dir": "STUDY_OUT",
"study_options": {
"field_names": [
"temperature"
],
// parameter_sweep_size is the number of clients (i.e. simulations) to execute
"parameter_sweep_size": 20,
// num_samples is the number *expected* from the simulation, not the set number
// if this number is not provided the server will get it at client finalization
"num_samples": 100,
"nb_parameters": 5,
"parameter_range": [100, 500],
// this option sets Nx = Ny = mesh_size
"mesh_size": 100,
// this option yields dt = 1 / time_discretization but does not change num_samples
"time_discretization": 100,
"seed": 123,
"simulation_timeout": 10,
"crashes_before_redraw": 1000,
"verbosity": 2
},
"dl_config": {
"n_batches_update": 10,
"batch_size": 10,
"per_server_watermark": 100,
"buffer_size": 6000,
"zmq_hwm": 10,
"buffer": "FIFO"
},
"launcher_config": {
"http_port": 6666,
"scheduler": "oar",
// the following option executes the server on
// two non-Radeon GPUs (LYON)
// "scheduler_arg_server": [
// "gpu=2/core=1,walltime=01:00:00",
// "-t exotic",
// "-p gpu_model NOT LIKE 'Radeon%'"
// ],
"scheduler_arg_server": [
"/nodes=1/gpu=1/core=1,walltime=01:00:00",
"-t exotic",
"-p cluster='drac'"
],
// replacing the option above with the one below will
// cause the server to be executed on two cores instead
// "scheduler_arg_server": [
// "core=2,walltime=01:00:00"
// ],
"scheduler_arg_client": [
"core=2,walltime=00:01:00"
],
"job_limit": 11,
"timer_delay": 1,
"fault_tolerance": false,
"verbosity": 2
},
"client_config": {
// FIXME: the executable command needs to be replaced with the appropriate path
"executable_command": "$HOME/melissa/examples/heat-pde/executables/build/heatc",
// all bash commands to be executed on the job node prior to melissa study
"preprocessing_commands": [
// ". $HOME/.init-melissa.sh",
// "module load openmpi/4.1.5_gcc-10.4.0",
// "melissa-env"
]
},
"server_config": {
"preprocessing_commands": [
// ". $HOME/.init-melissa.sh",
// "module load openmpi/4.1.5_gcc-10.4.0",
// "melissa-env"
]
}
}
SA: config_oar.json
// Please make sure that all entries preceded by a comment including
// the "FIXME" keyword are changed before running Melissa with this
// config file
{
"server_filename": "heatpde_sa_server.py",
"server_class": "HeatPDEServerSA",
"output_dir": "STUDY_OUT",
"study_options": {
"field_names": [
"temperature"
],
// when Sobol indices are computed, parameter_sweep_size is the number of groups to execute
// otherwise it is the number of clients (i.e. simulations) to execute
"parameter_sweep_size": 10,
"num_samples": 100,
"nb_parameters": 5,
"parameter_range": [100, 200],
"simulation_timeout": 400,
"crashes_before_redraw": 1000
},
"sa_config": {
"mean": true,
"variance": true,
"skewness": true,
"kurtosis": true,
"sobol_indices": false
},
"launcher_config": {
// the standard oar scheduler is invoked with the following options
"scheduler": "oar",
"scheduler_arg_server": [
"core=1,walltime=01:00:00"
],
"scheduler_arg_client": [
"core=1,walltime=00:01:00"
],
// otherwise an hybrid scheduling strategy based on job containers
// and taking advantage of the best-effort queue can be requested
// "scheduler": "oar-hybrid",
// "container_max_number_of_clients": 5,
// "besteffort_allocation_frequency": 2,
// "scheduler_arg_container": [
// "core=6,walltime=01:10:00"
// ],
// "scheduler_arg_server": [
// "core=1,walltime=01:00:00"
// ],
// "scheduler_arg_client": [
// "core=1,walltime=00:10:00"
// ],
"fault_tolerance": false,
"verbosity": 2
},
"client_config": {
// FIXME: the executable command needs to be replaced with the appropriate path
"executable_command": "$HOME/melissa/examples/heat-pde/executables/build/heatc 100 100 100",
// all bash commands to be executed on the job node prior to melissa study
"preprocessing_commands": [
// ". $HOME/.init-melissa.sh",
// "module load openmpi/4.1.5_gcc-10.4.0",
// "melissa-env"
]
}
}