sgl-project · gq112 · Feb 27, 2026 · Feb 28, 2026 · Feb 28, 2026
@@ -0,0 +1,54 @@
+{
+  "architectures": [
+    "DFlashDraftModel"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "block_size": 16,
+  "bos_token_id": 151643,
+  "dflash_config": {
+    "mask_token_id": 151669,
+    "target_layer_ids": [
+      3,
+      9,
+      17,
+      25,
+      33
+    ]
+  },
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 262144,
+  "model_type": "qwen3_vl_text",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 5,
+  "num_key_value_heads": 8,
+  "num_target_layers": 36,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "mrope_interleaved": true,
+    "mrope_section": [
+      24,
+      20,
+      20
+    ],
+    "rope_type": "default"
+  },
+  "rope_theta": 5000000,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.57.1",
+  "use_cache": true,
+  "vocab_size": 151936
+}
@@ -0,0 +1,64 @@
+#!/bin/bash
+#
+# Launch DFlash draft-model training for Qwen/Qwen3-VL-8B-Instruct with
+# torchrun in standalone single-node mode.
+#
+# Usage: SCRIPT [NUM_GPUS] [ATTENTION_BACKEND]
+#   NUM_GPUS           first positional argument, default 8, forwarded to
+#                      torchrun as --nproc_per_node
+#   ATTENTION_BACKEND  second positional argument, default flex_attention,
+#                      forwarded as --attention-backend
+#
+# Environment:
+#   BUILD_DATASET_NUM_PROC  forwarded as --build-dataset-num-proc, default 16
+
+# Abort on the first failing command, unset variable or failing pipeline
+# stage so that a failed directory lookup never reaches torchrun.
+set -euo pipefail
+
+# Resolve the directory containing this script, then use its parent as the
+# root for every path below.
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+ROOT_DIR=$( dirname -- "$SCRIPT_DIR" )
+
+# Point TORCHINDUCTOR_CACHE_DIR at cache/compiled_kernels under the root.
+export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"
+
+NUM_GPUS=${1:-8}
+ATTENTION_BACKEND=${2:-flex_attention}
+BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-16}
+
+# Every expansion is quoted so a checkout path containing spaces or glob
+# characters is passed to torchrun as a single argument.
+# NOTE: --num-draft-layers 5 and --block-size 16 appear to repeat settings
+# from the draft model config. TODO confirm against train_dflash.py whether
+# they must stay in sync.
+torchrun \
+    --standalone \
+    --nproc_per_node "$NUM_GPUS" \
+    "$ROOT_DIR/scripts/train_dflash.py" \
+    --target-model-path Qwen/Qwen3-VL-8B-Instruct \
+    --draft-model-config "$ROOT_DIR/configs/qwen3-vl-8b-dflash.json" \
+    --target-model-backend hf \
+    --is-vlm \
+    --trust-remote-code \
+    --train-data-path "$ROOT_DIR/cache/dataset/allava4v-mix-20k_train.localimg_regen.jsonl" \
+    --build-dataset-num-proc "$BUILD_DATASET_NUM_PROC" \
+    --min-pixels 50176 \
+    --max-pixels 802816 \
+    --output-dir "$ROOT_DIR/outputs/qwen3-vl-8b-allava4v20k-dflash" \
+    --cache-dir "$ROOT_DIR/cache" \
+    --num-epochs 6 \
+    --batch-size 2 \
+    --learning-rate 1e-4 \
+    --warmup-ratio 0.08 \
+    --max-grad-norm 1.0 \
+    --max-length 4096 \
+    --num-draft-layers 5 \
+    --chat-template qwen3-vl \
+    --attention-backend "$ATTENTION_BACKEND" \
+    --block-size 16 \
+    --num-anchors 512 \
+    --loss-decay-gamma 7.0 \
+    --log-interval 50 \
+    --save-interval 1000