From 4465b26d50ba21a3120e57abca71a0e2aae869f0 Mon Sep 17 00:00:00 2001
From: jwilber
Date: Thu, 18 Dec 2025 15:44:37 -0800
Subject: [PATCH 1/2] experiment with lepton dev pod container workflow on
 compute node

Signed-off-by: jwilber
---
 .../configs/recipes/container.yaml | 182 ++++++++++++++++++
 1 file changed, 182 insertions(+)
 create mode 100644 ci/lepton/model_convergence/configs/recipes/container.yaml

diff --git a/ci/lepton/model_convergence/configs/recipes/container.yaml b/ci/lepton/model_convergence/configs/recipes/container.yaml
new file mode 100644
index 0000000000..013836461a
--- /dev/null
+++ b/ci/lepton/model_convergence/configs/recipes/container.yaml
@@ -0,0 +1,182 @@
+############################################################
+# Template Type
+# Defines the template type for the job.
+# - convergence_tests: for convergence tests
+# - scdl_performance: for SCDL performance tests
+############################################################
+template_type: convergence_tests
+
+job_name: container_test
+
+############################################################
+# Container Runtime
+# Defines the base Docker image and the registry auth needed to pull it.
+############################################################
+container:
+  image: nvcr.io/nvidia/pytorch:25.11-py3
+  registry_auth: lepton-nvidia
+
+############################################################
+# Environment Variables
+# These keys must be present for the job to authenticate with
+# external services (W&B, Kratos, Lepton) and control runtime caching.
+# HF_HOME is optional but recommended to speed up Hugging Face model loading.
+############################################################
+environment_variables:
+  - name: WANDB_API_KEY
+    value_from: JWILBER_WANDB_API_KEY
+  - name: KRATOS_SSA_URL
+    value_from: KRATOS_SSA_URL
+  - name: KRATOS_SSA_CLIENT_ID
+    value_from: KRATOS_SSA_CLIENT_ID
+  - name: KRATOS_SSA_SECRET
+    value_from: KRATOS_SSA_SECRET.jwilber
+  - name: LEP_LOGIN_CREDENTIALS
+    value_from: LEP_LOGIN_CREDENTIALS
+  - name: HF_HOME
+    value: /data/esm2/cache
+  - name: HF_TOKEN
+    value_from: HUGGING_FACE_HUB_TOKEN.jwilber
+
+############################################################
+# Lepton Cluster Selection & Node Group
+# Select the GPU cluster where the job will run.
+# - h100: yo-bom-lepton-001
+# - h200: nv-int-multiteam-nebius-h200-01
+# - a100: az-sat-lepton-001
+############################################################
+node_group: yo-bom-lepton-001
+
+############################################################
+# Shared Mounts
+# Mount paths for accessing shared datasets, model checkpoints,
+# or intermediate artifacts. The NFS source should match the cluster.
+# - yo-bom-lepton-001 uses node-nfs:fs1
+# - nv-int-multiteam-nebius-h200-01 uses node-nfs:lepton-shared-fs
+############################################################
+mount_from: node-nfs:fs1
+
+mounts:
+  - path: /BioNeMo
+    mount_path: /data
+    from_: ${mount_from}
+
+############################################################
+# W&B Initialization
+# Configure how runs are logged to Weights & Biases.
+############################################################
+wandb_init_args:
+  group: "model_convergence__recipes"
+  mode: "online"
+
+############################################################
+# Git Checkout Options
+# Configure which version of the recipe to pull from GitHub.
+# - `branch`: defaults to main
+# - `commit_sha`: overrides branch if provided
+############################################################
+branch: jwilber/lepton-build-container
+commit_sha: ""
+
+############################################################
+# Checkout Script
+# Standardized script to clone the BioNeMo repository and install
+# dependencies before the training run starts. Child configs can
+# inherit and reuse this logic without modification.
+############################################################
+checkout_script: |
+  set -euo pipefail
+
+  echo "========================================"
+  echo "DIAGNOSTIC: System Capabilities Check"
+  echo "========================================"
+
+  echo -e "\n=== User Info ==="
+  whoami
+  id
+  groups
+  echo "HOME: $HOME"
+  echo "PWD: $PWD"
+
+  echo -e "\n=== Sudo Access ==="
+  if sudo -n true 2>/dev/null; then
+    echo "✓ Sudo available WITHOUT password"
+    sudo -V | head -n 1
+  elif sudo -v 2>/dev/null; then
+    echo "⚠ Sudo available but requires password"
+  else
+    echo "✗ No sudo access"
+  fi
+
+  echo -e "\n=== Docker Availability ==="
+  if which docker >/dev/null 2>&1; then
+    echo "✓ Docker binary found: $(which docker)"
+    docker --version || echo "✗ Docker version check failed"
+    if docker info >/dev/null 2>&1; then
+      echo "✓ Docker daemon accessible!"
+      docker info | grep -E "Server Version|Storage Driver|Runtimes"
+    else
+      echo "✗ Docker daemon not accessible (may need sudo or socket permissions)"
+    fi
+  else
+    echo "✗ Docker not installed"
+  fi
+
+  echo -e "\n=== Docker Socket Check ==="
+  if [ -S /var/run/docker.sock ]; then
+    echo "✓ Docker socket exists: /var/run/docker.sock"
+    ls -la /var/run/docker.sock
+    if [ -r /var/run/docker.sock ] && [ -w /var/run/docker.sock ]; then
+      echo "✓ Socket is readable and writable"
+    else
+      echo "⚠ Socket exists but may not be accessible"
+    fi
+  else
+    echo "✗ Docker socket not found"
+  fi
+
+  echo -e "\n=== GPU Access ==="
+  if which nvidia-smi >/dev/null 2>&1; then
+    echo "✓ nvidia-smi found"
+    nvidia-smi --query-gpu=name,driver_version --format=csv,noheader | head -n 1
+  else
+    echo "✗ nvidia-smi not found"
+  fi
+
+  echo -e "\n=== Package Management ==="
+  if apt-get --version >/dev/null 2>&1; then
+    echo "✓ apt-get available"
+    if sudo -n apt-get update -y >/dev/null 2>&1; then
+      echo "✓ Can run apt-get with sudo"
+    else
+      echo "✗ Cannot run apt-get (no sudo or permission denied)"
+    fi
+  fi
+
+  echo -e "\n=== Writable Locations ==="
+  for dir in /tmp $HOME /data; do
+    if [ -d "$dir" ] && [ -w "$dir" ]; then
+      echo "✓ $dir is writable"
+    else
+      echo "✗ $dir not writable or doesn't exist"
+    fi
+  done
+
+  echo -e "\n=== Installed Tools ==="
+  for tool in git python3 pip curl wget; do
+    if which $tool >/dev/null 2>&1; then
+      echo "✓ $tool: $(which $tool)"
+    else
+      echo "✗ $tool: not found"
+    fi
+  done
+
+  echo -e "\n========================================"
+  echo "DIAGNOSTIC COMPLETE"
+  echo "========================================"
+
+run_script: ""
+
+script: |
+  ${checkout_script}
+  ${run_script}
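
Note: in the config above, `script` stitches the two stages together with
plain `${...}` interpolation, so a child config can override `run_script`
alone while inheriting `checkout_script`. A minimal sketch of the
composition, assuming the launcher resolves these fields with standard
OmegaConf string interpolation (illustrative, not the launcher's actual code):

    from omegaconf import OmegaConf

    cfg = OmegaConf.create(
        {
            "checkout_script": "echo checkout",
            "run_script": "echo run",
            # Mirrors the block scalar above: one interpolation per line.
            "script": "${checkout_script}\n${run_script}\n",
        }
    )
    print(cfg.script)  # -> "echo checkout\necho run\n"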

From 0954eb7f0a12ec30a1f7f78c60545d124735c436 Mon Sep 17 00:00:00 2001
From: jwilber
Date: Fri, 19 Dec 2025 11:42:02 -0800
Subject: [PATCH 2/2] working version, no dockerfile

Signed-off-by: jwilber
---
 .../configs/recipes/container.yaml | 285 +++++++++---------
 1 file changed, 146 insertions(+), 139 deletions(-)

diff --git a/ci/lepton/model_convergence/configs/recipes/container.yaml b/ci/lepton/model_convergence/configs/recipes/container.yaml
index 013836461a..701a6a4b93 100644
--- a/ci/lepton/model_convergence/configs/recipes/container.yaml
+++ b/ci/lepton/model_convergence/configs/recipes/container.yaml
@@ -1,82 +1,82 @@
-############################################################
-# Template Type
-# Defines the template type for the job.
-# - convergence_tests: for convergence tests
-# - scdl_performance: for SCDL performance tests
-############################################################
-template_type: convergence_tests
+# @package _global_
+defaults:
+  - /base
+  - _self_

-job_name: container_test
+job_name: "container_test"

 ############################################################
-# Container Runtime
-# Defines the base Docker image and registry auth needed
+# lepton job info
 ############################################################
-container:
-  image: nvcr.io/nvidia/pytorch:25.11-py3
-  registry_auth: lepton-nvidia
+node_group: yo-bom-lepton-001
+mount_from: node-nfs:fs1
+num_nodes: 1
+device_type: gpu
+num_devices: 2
+gpu_type: h100-sxm
+resource_shape: "${device_type}.${num_devices}x${gpu_type}"

 ############################################################
-# Environment Variables
-# These keys must be present for the job to authenticate with
-# external services (W&B, Kratos, Lepton) and control runtime caching.
-# HF_HOME is optional but recommended to speed up Hugging Face model loading.
+# kratos info: where to log data
 ############################################################
-environment_variables:
-  - name: WANDB_API_KEY
-    value_from: JWILBER_WANDB_API_KEY
-  - name: KRATOS_SSA_URL
-    value_from: KRATOS_SSA_URL
-  - name: KRATOS_SSA_CLIENT_ID
-    value_from: KRATOS_SSA_CLIENT_ID
-  - name: KRATOS_SSA_SECRET
-    value_from: KRATOS_SSA_SECRET.jwilber
-  - name: LEP_LOGIN_CREDENTIALS
-    value_from: LEP_LOGIN_CREDENTIALS
-  - name: HF_HOME
-    value: /data/esm2/cache
-  - name: HF_TOKEN
-    value_from: HUGGING_FACE_HUB_TOKEN.jwilber
+kratos_subject: "convergence_tests_v0.0.3"

 ############################################################
-# Lepton Cluster Selection & Node Group
-# Select the GPU cluster where the job will run.
-# - h100: yo-bom-lepton-001
-# - h200: nv-int-multiteam-nebius-h200-01
-# - a100: az-sat-lepton-001
+# recipe identifiers
+# mostly used for logging and observability
 ############################################################
-node_group: yo-bom-lepton-001
+recipe_subdir: esm2_native_te
+model_type: esm2
+variant: train # train, finetune

-############################################################
-# Shared Mounts
-# Mount paths for accessing shared datasets, model checkpoints,
-# or intermediate artifacts. The NFS source should match the cluster.
-# - yo-bom-lepton-001 uses node-nfs:fs1
-# - nv-int-multiteam-nebius-h200-01 uses node-nfs:lepton-shared-fs
-############################################################
-mount_from: node-nfs:fs1
+# Core identifiers for filtering
+framework: native # native, accelerate
+precision: fp16 # likely bf16 or fp8
+te_enabled: true
+fp8_enabled: false
+# thd_enabled: false

-mounts:
-  - path: /BioNeMo
-    mount_path: /data
-    from_: ${mount_from}
+# Catchall for additional features/configs
+extras: [] # e.g. [thd]

 ############################################################
-# W&B Initialization
-# Configure how runs are logged to Weights & Biases.
+# wandb info (total_gpus used for group name)
 ############################################################
+# `total_gpus` calculated from lepton job info above
+total_gpus: ${multiply:${num_devices},${num_nodes}}
+
 wandb_init_args:
-  group: "model_convergence__recipes"
-  mode: "online"
+  project: "test_convergence__recipes__${sanitize:${branch}}"
+  group: "${model_type}__${task_cmd}__${total_gpus}gpus__${sanitize:${gpu_type}}"
+  job_type: "${recipe_subdir}"
+  name: null

 ############################################################
-# Git Checkout Options
-# Configure which version of the recipe to pull from GitHub.
-# - `branch`: defaults to main
-# - `commit_sha`: overrides branch if provided
+# task commands
+# shared across all products (if not explicitly overridden)
 ############################################################
-branch: jwilber/lepton-build-container
-commit_sha: ""
+
+# script overrides
+# these should match the keys in the recipe's config file
+model_tag: nvidia/esm2_t36_3B_UR50D
+task_cmd: train_fsdp2 # or: mfsdp
+num_train_steps: 20_000
+# dataset commands
+micro_batch_size: 16
+load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data
+load_dataset_kwargs_streaming: true
+load_dataset_kwargs_revision: 4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret
+
+# lr commands
+num_warmup_steps: 2_000
+# checkpoint controls
+ckpt_dir: ""
+save_checkpoints: false
+save_final_model: false
+resume_from_checkpoint: false
+use_distributed_checkpoint_fsdp2: false
+
+log_to_kratos: false

 ############################################################
 # Checkout Script
 # Standardized script to clone the BioNeMo repository and install
 # dependencies before the training run starts. Child configs can
 # inherit and reuse this logic without modification.
 ############################################################
@@ -88,95 +88,102 @@ checkout_script: |
   set -euo pipefail

   echo "========================================"
-  echo "DIAGNOSTIC: System Capabilities Check"
+  echo "Setting up BioNeMo environment"
   echo "========================================"

-  echo -e "\n=== User Info ==="
-  whoami
-  id
-  groups
-  echo "HOME: $HOME"
-  echo "PWD: $PWD"
-
-  echo -e "\n=== Sudo Access ==="
-  if sudo -n true 2>/dev/null; then
-    echo "✓ Sudo available WITHOUT password"
-    sudo -V | head -n 1
-  elif sudo -v 2>/dev/null; then
-    echo "⚠ Sudo available but requires password"
-  else
-    echo "✗ No sudo access"
-  fi
+  # Clone repo
+  git clone https://github.com/NVIDIA/bionemo-framework.git
+  cd bionemo-framework/
+  git checkout jstjohn/evo2_megatron_bridge_recipe
+  # no Dockerfile build here; dependencies are installed directly below
+  cd bionemo-recipes/recipes/evo2_megatron

-  echo -e "\n=== Docker Availability ==="
-  if which docker >/dev/null 2>&1; then
-    echo "✓ Docker binary found: $(which docker)"
-    docker --version || echo "✗ Docker version check failed"
-    if docker info >/dev/null 2>&1; then
-      echo "✓ Docker daemon accessible!"
-      docker info | grep -E "Server Version|Storage Driver|Runtimes"
-    else
-      echo "✗ Docker daemon not accessible (may need sudo or socket permissions)"
-    fi
-  else
-    echo "✗ Docker not installed"
-  fi
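+
+  # NOTE (assumed rationale): the steps below reuse the base image's
+  # preinstalled PyTorch stack instead of building a new container. uv
+  # creates a venv that still sees system site-packages, and a constraints
+  # file pins the container's TransformerEngine so the editable install
+  # cannot replace it.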
+  # Install uv (if not already available)
+  if ! command -v uv &> /dev/null; then
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    # uv installs to ~/.local/bin or ~/.cargo/bin depending on version
+    export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
+  fi

-  echo -e "\n=== Docker Socket Check ==="
-  if [ -S /var/run/docker.sock ]; then
-    echo "✓ Docker socket exists: /var/run/docker.sock"
-    ls -la /var/run/docker.sock
-    if [ -r /var/run/docker.sock ] && [ -w /var/run/docker.sock ]; then
-      echo "✓ Socket is readable and writable"
-    else
-      echo "⚠ Socket exists but may not be accessible"
-    fi
-  else
-    echo "✗ Docker socket not found"
-  fi
-
-  echo -e "\n=== GPU Access ==="
-  if which nvidia-smi >/dev/null 2>&1; then
-    echo "✓ nvidia-smi found"
-    nvidia-smi --query-gpu=name,driver_version --format=csv,noheader | head -n 1
-  else
-    echo "✗ nvidia-smi not found"
-  fi
+  # Fix TransformerEngine direct_url issue (so `pip freeze` emits a plain
+  # version pin instead of a file:// URL)
+  rm -f /usr/local/lib/python*/dist-packages/transformer_engine-*.dist-info/direct_url.json

-  echo -e "\n=== Package Management ==="
-  if apt-get --version >/dev/null 2>&1; then
-    echo "✓ apt-get available"
-    if sudo -n apt-get update -y >/dev/null 2>&1; then
-      echo "✓ Can run apt-get with sudo"
-    else
-      echo "✗ Cannot run apt-get (no sudo or permission denied)"
-    fi
-  fi
+  # Create venv with system site packages
+  export UV_LINK_MODE=copy
+  export VIRTUAL_ENV=/workspace/.venv
+  export PATH="$VIRTUAL_ENV/bin:$PATH"

-  echo -e "\n=== Writable Locations ==="
-  for dir in /tmp $HOME /data; do
-    if [ -d "$dir" ] && [ -w "$dir" ]; then
-      echo "✓ $dir is writable"
-    else
-      echo "✗ $dir not writable or doesn't exist"
-    fi
-  done
+  uv venv --system-site-packages --seed $VIRTUAL_ENV

-  echo -e "\n=== Installed Tools ==="
-  for tool in git python3 pip curl wget; do
-    if which $tool >/dev/null 2>&1; then
-      echo "✓ $tool: $(which $tool)"
-    else
-      echo "✗ $tool: not found"
-    fi
-  done
+  # Create constraints file
+  pip freeze | grep transformer_engine > pip-constraints.txt

-  echo -e "\n========================================"
-  echo "DIAGNOSTIC COMPLETE"
-  echo "========================================"
+  # Install dependencies
+  uv pip install -r build_requirements.txt --no-build-isolation
+  uv pip install -c pip-constraints.txt -e . --no-build-isolation
+
+  echo "========================================"
+  echo "BioNeMo environment ready!"
+  echo "========================================"
+
+
+run_script: |
+  pwd
+
+  ls
+
+  echo "ls ../../.."
+  ls ../../..
+  echo "ls ../../../.."
+  ls ../../../..
+  echo "ls ../../../../../.."
+  ls ../../../../../..
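+
+  # The relative `ls` probes above are debug output for inspecting the NFS
+  # mount layout; the database paths passed to train_evo2 below resolve
+  # against the same mount.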
-run_script: "" -script: | - ${checkout_script} - ${run_script} + train_evo2 \ + --hf-tokenizer-model-path tokenizers/nucleotide_fast_tokenizer_256 \ + --sharded-eden-data \ + --seq-length=8192 \ + --stride 7992 \ + --sequence-db-dir ../../../../../data/bcr_eden/OG2_database_splits \ + --train-window-db ../../../../../data/bcr_eden/OG2_database_splits/og2__train__short.sqlite \ + --val-window-db ../../../../../data/bcr_eden/OG2_database_splits/og2__validation__short.sqlite \ + --test-window-db ../../../../../data/bcr_eden/OG2_database_splits/og2__test__short.sqlite \ + --most-recent-k 3 \ + --max-steps=72926 \ + --constant-steps 1024 \ + --seed 1234 \ + --dataset-seed 1234 \ + --no-weight-decay-embeddings \ + --grad-reduce-in-fp32 \ + --activation-checkpoint-recompute-num-layers 1 \ + --mixed-precision-recipe bf16-with-fp8-delayed-scaling-mixed \ + --hybrid-override-pattern SDH*SDHSDH*SDHSDH*SDHSDH* \ + --use-precision-aware-optimizer \ + --log-num-zeros-in-grad \ + --enable-preemption \ + --no-fp32-residual-connection \ + --ckpt-async-save \ + --overlap-grad-reduce \ + --clip-grad 1 \ + --eod-pad-in-loss-mask \ + --wandb-project evo2-recipes-verification \ + --lr 3e-04 \ + --wd 0.01 \ + --min-lr 6e-06 \ + --warmup-steps 1024 \ + --attention-dropout 0.001 \ + --hidden-dropout 0.001 \ + --eval-iters=10 \ + --eval-interval=100 \ + --debug-ddp-parity-freq 100 \ + --experiment-name=pretrain_striped_hyena_1b_nv_parallel \ + --result-dir=FIXME \ + --tensor-model-parallel-size=1 \ + --context-parallel-size=1 \ + --pipeline-model-parallel-size=1 \ + --workers 8 \ + --log-interval 5 \ + --no-renormalize-loss \ + --micro-batch-size=20 \ + --global-batch-size=960 \ + --model-size=striped_hyena_1b_nv_parallel \ No newline at end of file