#!/usr/bin/env bash
#SBATCH --job-name=base-train                            # Job name shown in the queue
#SBATCH --partition=gpu                                  # Partition (queue) to use — CPU or GPU
#SBATCH --nodes=1                                        # Number of cluster nodes to reserve
#SBATCH --ntasks=1                                       # Number of MPI tasks (parallel processes)
#SBATCH --cpus-per-task=19                               # CPU threads per task
#SBATCH --gres=gpu:1                                     # Generic resource: 1 GPU H100 NVL, or gpu:1 for any GPU
#SBATCH --mem=31G                                        # Total RAM reserved on the node
#SBATCH --time=24:00:00                                  # Maximum run time (HH:MM:SS)
#SBATCH --output=/slurm/home/%u/output/%j/terminal.out   # stdout
#SBATCH --error=/slurm/home/%u/output/%j/terminal.err    # stderr
#
# Training job: stages code/, data/, lib/ and any existing artifacts/ from the
# submit directory into a per-job scratch directory, sources lib/install_env.sh,
# runs main.py from the scratch copy of code/, and copies the scratch artifacts
# back to <submit dir>/artifacts when the script exits.
#
# NOTE(review): the --output/--error paths above live inside the per-job
# directory that this script only creates further down (mkdir -p "$OUTDIR").
# Presumably that directory has to exist before the job starts for the
# scheduler to open those files — confirm against the cluster's behaviour.

set -euo pipefail

# Fail early with a readable message instead of building paths from an
# unset or empty value.
: "${USER:?USER must be set}"
: "${SLURM_JOB_ID:?SLURM_JOB_ID must be set (submit this script with sbatch)}"

# Per-job output directory; receives training.log.
OUTDIR="/slurm/home/$USER/output/$SLURM_JOB_ID"
mkdir -p "$OUTDIR"

# Scratch root: SLURM_TMPDIR when it is set and non-empty, otherwise a
# per-user, per-job path under /scratch/slurm.
SCRATCH="${SLURM_TMPDIR:-/scratch/slurm/$USER/$SLURM_JOB_ID/tmp}"
JOBSCRATCH="${SCRATCH}/base-${SLURM_JOB_ID}"
mkdir -p "$JOBSCRATCH"

# Directory the job was submitted from; falls back to the current directory.
SUBMIT_DIR="${SLURM_SUBMIT_DIR:-$PWD}"

#######################################
# Copy artifacts from scratch back to the submit directory (best effort).
# Globals:   JOBSCRATCH (read), SUBMIT_DIR (read)
# Arguments: none
# Outputs:   a warning on stderr if the copy fails
# Returns:   0 always, so it never masks the script's own exit status
#######################################
cleanup() {
  local src="$JOBSCRATCH/code/artifacts"

  # Nothing to copy if the job stopped before the artifacts dir existed.
  [[ -d "$src" ]] || return 0

  if ! rsync -a "$src/" "$SUBMIT_DIR/artifacts/"; then
    printf 'warning: could not copy artifacts from %s to %s\n' \
      "$src" "$SUBMIT_DIR/artifacts" >&2
  fi
}

# Run cleanup exactly once, from the EXIT trap. INT/TERM are turned into a
# normal exit (128 + signal number) so the script stops instead of resuming
# after the handler returns; that exit then fires the EXIT trap.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Stage inputs into scratch.
rsync -a "$SUBMIT_DIR/code/" "$JOBSCRATCH/code/"
rsync -a "$SUBMIT_DIR/data/" "$JOBSCRATCH/data/"

# Bring along artifacts from a previous run, when there are any.
if [[ -d "$SUBMIT_DIR/artifacts" ]]; then
  mkdir -p "$JOBSCRATCH/code/artifacts"
  rsync -a "$SUBMIT_DIR/artifacts/" "$JOBSCRATCH/code/artifacts/"
fi

rsync -a "$SUBMIT_DIR/lib/" "$JOBSCRATCH/lib/"

# Sourced (not executed) so whatever it sets up applies to this shell.
source "$JOBSCRATCH/lib/install_env.sh"

export ARTIFACTS_DIR="$JOBSCRATCH/code/artifacts"
export PERSISTENT_ARTIFACTS_DIR="/slurm/home/$USER/artifacts"

cd "$JOBSCRATCH/code"

# stdout and stderr go both to the terminal streams and to training.log;
# pipefail (set above) keeps a python failure from being hidden by tee.
python main.py 2>&1 | tee "$OUTDIR/training.log"