Commit 1ac5afe

Added test for fq+lora tuning on CUDA
1 parent 4c45b57 commit 1ac5afe

File tree: 9 files changed (+163 -44 lines)


.github/workflows/examples.yml (+72)

@@ -56,6 +56,7 @@ jobs:
       run: |
         set +e
         python -m pytest -s -ra tests/cross_fw/examples \
+          -m 'not cuda' \
           --junit-xml=pytest-results.xml \
           --durations-path=tests/cross_fw/examples/.test_durations \
           --splitting-algorithm=least_duration \
@@ -72,6 +73,77 @@ jobs:
         pip install defusedxml==0.7.1
         python .github/scripts/pytest_md_summary.py pytest-results.xml >> $GITHUB_STEP_SUMMARY

+  examples-cuda:
+    name: Test examples CUDA [${{ matrix.group }}/1]
+    runs-on: aks-linux-4-cores-28gb-gpu-tesla-t4
+    timeout-minutes: 40
+    if: ${{ inputs.gpu_enabled == true }}
+    strategy:
+      fail-fast: false
+      matrix:
+        group: [1]
+    defaults:
+      run:
+        shell: bash
+    env:
+      DEBIAN_FRONTEND: noninteractive
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get --assume-yes install build-essential ninja-build libgl1-mesa-dev libglib2.0-0 wget make
+      - name: Download CUDA
+        run: |
+          wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run
+          sudo sh cuda_12.4.0_550.54.14_linux.run --toolkit --silent
+      - name: Runner info
+        continue-on-error: true
+        run: |
+          export PATH=/usr/local/cuda-12.4/bin${PATH:+:${PATH}}
+          export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+          nvidia-smi
+          cat /proc/cpuinfo
+          nvcc --version
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          lfs: true
+          fetch-depth: 0 # Fetch full history to allow checking out any branch or PR
+      - name: Fetch and Checkout the Pull Request Branch
+        if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.pull_request_number != '' }}
+        run: |
+          git fetch origin pull/${{ github.event.inputs.pull_request_number }}/head:pr-${{ github.event.inputs.pull_request_number }}
+          git checkout pr-${{ github.event.inputs.pull_request_number }}
+      - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
+        with:
+          python-version: 3.10.14
+      - name: cpuinfo
+        run: cat /proc/cpuinfo
+      - name: Install test requirements
+        run: |
+          pip install -r tests/cross_fw/examples/requirements.txt
+      - name: Print installed modules
+        run: pip list
+      - name: Run examples test scope
+        run: |
+          set +e
+          python -m pytest -s -ra tests/cross_fw/examples \
+            -m cuda \
+            --junit-xml=pytest-results.xml \
+            --durations-path=tests/cross_fw/examples/.test_durations \
+            --splitting-algorithm=least_duration \
+            --splits 1 \
+            --group ${{ matrix.group }} \
+            ${{ github.event.inputs.pytest_args || '' }}
+          ret=$?
+          [ $ret -eq 5 ] && [ -n "${{ github.event.inputs.pytest_args || '' }}" ] && exit 0 || exit $ret
+        env:
+          TQDM_DISABLE: 1
+      - name: Test Summary
+        if: ${{ !cancelled() }}
+        run: |
+          pip install defusedxml==0.7.1
+          python .github/scripts/pytest_md_summary.py pytest-results.xml >> $GITHUB_STEP_SUMMARY
+
   examples-win-cpu:
     timeout-minutes: 80
     name: Test examples CPU Windows [${{ matrix.group }}/4]
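The CPU and CUDA jobs split the example scope with the `cuda` pytest marker selected via `-m 'not cuda'` and `-m cuda` above. As a rough sketch only, assuming a hypothetical conftest.py and test name (the real wiring lives in tests/cross_fw/examples and is not part of this commit), such a marker is typically registered and applied like this:

# conftest.py (hypothetical): register the marker so `-m cuda` / `-m 'not cuda'` can filter tests.
def pytest_configure(config):
    config.addinivalue_line("markers", "cuda: example tests that require an NVIDIA GPU")


# test_examples.py (hypothetical): a CUDA-only example test selected by the new job.
import pytest


@pytest.mark.cuda
def test_llm_compression_qat_with_lora():
    ...  # would be collected by `python -m pytest tests/cross_fw/examples -m cuda`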

examples/llm_compression/torch/qat_with_lora/main.py (+42 -27)

@@ -13,7 +13,7 @@
 import sys
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 import torch
 import torch.nn.functional as F
@@ -24,39 +24,39 @@
 from torch import Tensor
 from torch import nn
 from torch.utils.tensorboard import SummaryWriter
-from tqdm import tqdm
-from tqdm import trange
 from transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer
 from whowhatbench import TextEvaluator

 import nncf
+from nncf.common.logging.track_progress import track
 from nncf.data.dataset import Dataset
 from nncf.parameters import CompressionFormat
 from nncf.parameters import CompressWeightsMode
 from nncf.parameters import StripFormat
+from nncf.quantization.advanced_parameters import AdvancedCompressionParameters
 from nncf.quantization.quantize_model import compress_weights
 from nncf.torch.model_creation import load_from_config
 from nncf.torch.quantization.layers import AsymmetricLoraQuantizer
 from nncf.torch.quantization.layers import SymmetricLoraQuantizer


-def get_wikitext2(nsamples: int, seqlen: int, tokenizer: Any, device: torch.device) -> List[Tensor]:
+def get_wikitext2(num_samples: int, seqlen: int, tokenizer: Any, device: torch.device) -> List[Tensor]:
     """
     Loads and processes the Wikitext-2 dataset for training.

-    :param nsamples: Number of samples to generate.
+    :param num_samples: Number of samples to generate.
     :param seqlen: Sequence length for each sample.
     :param tokenizer: Tokenizer to encode the text.
     :param device: Device to move the tensors to (e.g., 'cpu' or 'cuda').
     :return: A list of tensors containing the tokenized text samples.
     """
     traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
-    limit = nsamples * seqlen // 4  # ~1k for 128 samples with seqlen=32 to be aligned with optimum
+    limit = num_samples * seqlen // 4  # ~1k for 128 samples with seqlen=32 to be aligned with optimum
     text = "".join([" \n" if s == "" else s for s in traindata["text"][:limit]])
     trainenc = tokenizer(text, return_tensors="pt")
     trainloader = []
-    for _ in range(nsamples):
+    for _ in range(num_samples):
         # Crop a sequence of tokens of length seqlen starting at a random position
         i = torch.randint(0, trainenc.input_ids.shape[1] - seqlen - 1, (1,)).item()
         j = i + seqlen
@@ -66,7 +66,7 @@ def get_wikitext2(nsamples: int, seqlen: int, tokenizer: Any, device: torch.devi


 @torch.no_grad()
-def save_wwb_ref(model: str, tokenizer: Any, wwb_ref_file: Path) -> None:
+def save_wwb_ref(model: str, tokenizer: Any, wwb_ref_file: Path, num_samples: Optional[int] = None) -> None:
     """
     Save the reference answers for the WWB (WhoWhatBenchmark) evaluation.

@@ -76,12 +76,14 @@ def save_wwb_ref(model: str, tokenizer: Any, wwb_ref_file: Path) -> None:
     """
     if not wwb_ref_file.exists():
         print("#" * 50 + " Collect reference answers for WWB " + "#" * 50)
-        wwb_eval = TextEvaluator(base_model=model, tokenizer=tokenizer, use_chat_template=True)
+        wwb_eval = TextEvaluator(base_model=model, tokenizer=tokenizer, use_chat_template=True, num_samples=num_samples)
         wwb_eval.dump_gt(str(wwb_ref_file))
         torch.cuda.empty_cache()


-def measure_similarity(model_for_eval: OVModelForCausalLM, tokenizer: Any, wwb_ref_file: Path) -> float:
+def measure_similarity(
+    model_for_eval: OVModelForCausalLM, tokenizer: Any, wwb_ref_file: Path, num_samples: Optional[int] = None
+) -> float:
     """
     Measures the similarity of a model's output to a reference outputs from a given file using WWB evaluation.

@@ -92,7 +94,11 @@ def measure_similarity(model_for_eval: OVModelForCausalLM, tokenizer: Any, wwb_r
     """
     print("#" * 50 + " Evaluate via WWB " + "#" * 50)
     wwb_eval = TextEvaluator(
-        tokenizer=tokenizer, gt_data=wwb_ref_file, test_data=str(wwb_ref_file), use_chat_template=True
+        tokenizer=tokenizer,
+        gt_data=wwb_ref_file,
+        test_data=str(wwb_ref_file),
+        use_chat_template=True,
+        num_samples=num_samples,
     )
     _, all_metrics = wwb_eval.score(model_for_eval)
     return float(all_metrics["similarity"].iloc[0])
@@ -108,8 +114,8 @@ def calc_hiddens(model: nn.Module, dataloader: List[Tensor]) -> List[Tensor]:
     :return: A list of hidden states for each input in the dataloader.
     """
     orig_hiddens = []
-    for i in trange(len(dataloader), total=len(dataloader), desc="Calculating original hiddens", leave=False):
-        model_input = get_model_input(dataloader[i])
+    for data in track(dataloader, description="Calculating original hiddens"):
+        model_input = get_model_input(data)
         orig_hiddens.append(model.model(**model_input).last_hidden_state)
     torch.cuda.empty_cache()
     return orig_hiddens
@@ -260,10 +266,12 @@ def get_argument_parser() -> argparse.ArgumentParser:
         help="Whether to start from previously saved checkpoint. If not specified or checkpoint does not exist, "
         "start from scratch by post-training weight compression initialization.",
     )
+    parser.add_argument("--lora_rank", type=int, default=256, help="Rank of lora adapters")

     # Data params
-    parser.add_argument("--nsamples", type=int, default=1024, help="Number of training samples")
+    parser.add_argument("--num_train_samples", type=int, default=1024, help="Number of training samples")
     parser.add_argument("--seqlen", type=int, default=1024, help="Calibration data context length.")
+    parser.add_argument("--num_val_samples", type=int, default=None, help="Number of validation samples for WWB.")

     # Training params
     parser.add_argument(
@@ -286,7 +294,7 @@ def get_argument_parser() -> argparse.ArgumentParser:

 def main(argv) -> float:
     """
-    Fine-tunes the specified model and returns the best validation similarity score.
+    Fine-tunes the specified model and returns the difference between initial and best validation similarity scores.
     """
     parser = get_argument_parser()
     args = parser.parse_args(argv)
@@ -295,7 +303,10 @@ def main(argv) -> float:
     device = "cuda"
     torch_dtype = torch.bfloat16
     compression_config = dict(
-        mode=CompressWeightsMode.INT4_ASYM, group_size=64, compression_format=CompressionFormat.FQ_LORA
+        mode=CompressWeightsMode.INT4_ASYM,
+        group_size=64,
+        compression_format=CompressionFormat.FQ_LORA,
+        advanced_parameters=AdvancedCompressionParameters(lora_adapter_rank=args.lora_rank),
     )

     # Configure output and log files.
@@ -320,11 +331,13 @@ def main(argv) -> float:
     # computed by for data generated by two models, original floating-point one and optimized.
     # TODO: (nlyalyus) Use original model for collecting reference, once the bug in WWB resolved.
     wwb_ref_model = AutoModelForCausalLM.from_pretrained(args.pretrained, torch_dtype=torch_dtype, device_map="cpu")
-    save_wwb_ref(wwb_ref_model, tokenizer, wwb_ref_file)
+    save_wwb_ref(wwb_ref_model, tokenizer, wwb_ref_file, args.num_val_samples)
    del wwb_ref_model

     # Prepare training data and pre-compute hiddens of teacher model for distillation loss.
-    train_loader = get_wikitext2(nsamples=args.nsamples, seqlen=args.seqlen, tokenizer=tokenizer, device=device)
+    train_loader = get_wikitext2(
+        num_samples=args.num_train_samples, seqlen=args.seqlen, tokenizer=tokenizer, device=device
+    )
     orig_hiddens = calc_hiddens(model, train_loader)

     # Create or load model to tune with Fake Quantizers and absorbable LoRA adapters.
@@ -341,9 +354,11 @@ def main(argv) -> float:

     # Convert torch checkpoint to an OpenVINO model and evaluate it via WWB.
     model_for_eval = export_to_openvino(args.pretrained, train_loader[0], ckpt_file, last_dir)
-    best_similarity = measure_similarity(model_for_eval, tokenizer, wwb_ref_file)
-    tb.add_scalar("similarity", best_similarity, 0)
-    print(f"Initial WWB similarity= {best_similarity:.4f}")
+    initial_similarity = best_similarity = measure_similarity(
+        model_for_eval, tokenizer, wwb_ref_file, args.num_val_samples
+    )
+    tb.add_scalar("similarity", initial_similarity, 0)
+    print(f"Initial WWB similarity= {initial_similarity:.4f}")

     # Run tuning with distillation loss and validation on WWB after each epoch.
     grad_accumulation_steps = args.batch_size // args.microbatch_size
@@ -354,7 +369,7 @@ def main(argv) -> float:
     loss_numerator = grad_steps = total_microbatches = 0
     for epoch in range(args.epochs):
         batch_indices_epoch = torch.randperm(num_samples)[:epoch_samples].chunk(microbatches_per_epoch)
-        for indices in tqdm(batch_indices_epoch, desc=f"Train epoch {epoch}", leave=[False]):
+        for indices in track(batch_indices_epoch, description=f"Train epoch {epoch}"):
             indices = indices.tolist()
             total_microbatches += 1

@@ -373,7 +388,7 @@ def form_batch(inputs: List[Tensor], model_input: bool):
             targets = torch.tanh(targets)
             targets = targets * fls
             outputs = model(**inputs).logits
-            loss = kl_div(outputs, targets.to(dtype=torch_dtype))
+            loss = kl_div(outputs, targets.to(device=device, dtype=torch_dtype))

             # Perform an optimization step after accumulating gradients over multiple minibatches.
             loss_numerator += loss.item()
@@ -393,16 +408,16 @@ def form_batch(inputs: List[Tensor], model_input: bool):
         # Save the best checkpoint and OpenVINO IR for the highest similarity score obtained from WWB.
         save_checkpoint(model, ckpt_file)
         model_for_eval = export_to_openvino(args.pretrained, train_loader[0], ckpt_file, last_dir)
-        similarity = measure_similarity(model_for_eval, tokenizer, wwb_ref_file)
-        print(f"[Epoch {epoch}], WWB similarity = {similarity:.4f}")
+        similarity = measure_similarity(model_for_eval, tokenizer, wwb_ref_file, args.num_val_samples)
+        print(f"[Epoch {epoch + 1}], WWB similarity = {similarity:.4f}")
         tb.add_scalar("similarity", similarity, total_microbatches)
         if similarity > best_similarity:
             print(f"New best WWB similarity = {similarity:.4f}")
             best_similarity = similarity
             shutil.copytree(last_dir, best_dir, dirs_exist_ok=True)

-    print(f"The finetuned OV model with the best similarity={best_similarity} saved to: {best_dir}")
-    return best_similarity
+    print(f"The finetuned OV model with the best similarity={best_similarity:.4f} saved to: {best_dir}")
+    return best_similarity - initial_similarity


 if __name__ == "__main__":
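For orientation, here is a minimal sketch of how the new --lora_rank option reaches NNCF through AdvancedCompressionParameters, mirroring the compression_config built in main.py. The model id and the one-sample calibration set are illustrative placeholders, not part of this commit; the real example passes tokenized Wikitext-2 samples and the model given by --pretrained.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

import nncf
from nncf.parameters import CompressionFormat, CompressWeightsMode
from nncf.quantization.advanced_parameters import AdvancedCompressionParameters

model_id = "HuggingFaceTB/SmolLM-135M"  # placeholder; the example takes --pretrained
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Tiny illustrative calibration set; main.py feeds tokenized Wikitext-2 samples here.
calibration = nncf.Dataset([tokenizer("calibration text", return_tensors="pt")["input_ids"]])

# INT4 weight compression in the FQ_LORA format (FakeQuantize + absorbable LoRA adapters),
# with the adapter rank supplied through AdvancedCompressionParameters, as --lora_rank does.
model = nncf.compress_weights(
    model,
    mode=CompressWeightsMode.INT4_ASYM,
    group_size=64,
    compression_format=CompressionFormat.FQ_LORA,
    advanced_parameters=AdvancedCompressionParameters(lora_adapter_rank=16),
    dataset=calibration,
)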

examples/llm_compression/torch/qat_with_lora/requirements.txt (+2 -2)

@@ -1,5 +1,5 @@
-tqdm
-tensorboard
+tensorboard==2.13.0
+torch==2.6.0
 whowhatbench @ git+https://github.com/openvinotoolkit/openvino.genai#subdirectory=tools/who_what_benchmark
 numpy>=1.23.5,<2
 openvino==2025.0

nncf/quantization/advanced_parameters.py (+7 -10)

@@ -365,26 +365,23 @@ class AdvancedCompressionParameters:
     :type statistics_path: str
     :param awq_params: Advanced parameters for AWQ algorithm.
     :type awq_params: AdvancedAWQParameters
-    :param scale_estimation_params: Advanced parameters for scale estimation algorithm.
+    :param scale_estimation_params: Advanced parameters for Scale Estimation algorithm.
     :type scale_estimation_params: AdvancedScaleEstimationParameters
+    :param gptq_params: Advanced parameters for GPTQ algorithm.
+    :type gptq_params: AdvancedGPTQParameters
+    :param lora_correction_params: Advanced parameters for Lora Correction algorithm.
+    :type lora_correction_params: AdvancedLoraCorrectionParameters
+    :param lora_adapter_rank: Rank of lora adapters for FQ_LORA format. Defaults to 256.
+    :type lora_adapter_rank: int
     """

     statistics_path: Optional[str] = None
-    # Advanced AWQ algorithm parameters
     awq_params: AdvancedAWQParameters = field(default_factory=AdvancedAWQParameters)
-
-    # Advanced scale estimation algorithm parameters
     scale_estimation_params: AdvancedScaleEstimationParameters = field(
         default_factory=AdvancedScaleEstimationParameters
     )
-
-    # Advanced GPTQ algorithm parameters
     gptq_params: AdvancedGPTQParameters = field(default_factory=AdvancedGPTQParameters)
-
-    # Advanced Lora Correction algorithm parameters
     lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters)
-
-    # rank of lora adapters for FQ_LORA format. Defaults to 256.
     lora_adapter_rank: int = 256

nncf/torch/quantization/layers.py (+4 -1)

@@ -1079,7 +1079,10 @@ def init_lora(self, lspec: PTLoraSpec):
         out_features, in_features = lspec.orig_weight_shape
         rank = lspec.lora_rank
         if rank > out_features or rank > in_features:
-            msg = f"Specified LoRA rank={rank} cannot exceed any dimension of the weight tensor"
+            msg = (
+                f"Specified LoRA rank={rank} cannot exceed any dimension of the weight tensor: "
+                f"[{out_features}, {in_features}]"
+            )
             raise nncf.ValidationError(msg)
         self.lora_A = torch.nn.Parameter(torch.ones((rank, in_features), dtype=default_lora_dtype))
         self.lora_B = torch.nn.Parameter(torch.zeros((out_features, rank), dtype=default_lora_dtype))
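The extended error message reflects the shape constraint visible in init_lora: lora_A has shape [rank, in_features] and lora_B has shape [out_features, rank], so their product must match the original weight and the rank can exceed neither dimension. A small self-contained illustration:

import torch

# Illustrative shapes only: rank must satisfy rank <= min(out_features, in_features).
out_features, in_features, rank = 128, 64, 16
lora_A = torch.ones((rank, in_features))
lora_B = torch.zeros((out_features, rank))
# The low-rank product has the same shape as the original weight, so it can be absorbed into it.
assert (lora_B @ lora_A).shape == (out_features, in_features)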

tests/cross_fw/examples/.test_durations (+2 -1)

@@ -15,5 +15,6 @@
     "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_torch_resnet18]": 1251.144,
     "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_fx_resnet18]": 412.243,
     "tests/cross_fw/examples/test_examples.py::test_examples[fp8_llm_quantization]": 229.69,
-    "tests.cross_fw.examples.test_examples.test_examples[quantization_aware_training_tensorflow_mobilenet_v2]": 1500.00
+    "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_tensorflow_mobilenet_v2]": 1500.00,
+    "tests/cross_fw/examples/test_examples.py::test_examples[llm_compression_qat_with_lora]": 665
 }

tests/cross_fw/examples/example_scope.json (+10)

@@ -274,6 +274,16 @@
             ]
         }
     },
+    "llm_compression_qat_with_lora": {
+        "backend": "torch",
+        "device": "cuda",
+        "requirements": "examples/llm_compression/torch/qat_with_lora/requirements.txt",
+        "cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz",
+        "accuracy_tolerance": 0.002,
+        "accuracy_metrics": {
+            "similarity_diff": 0.027
+        }
+    },
     "quantization_aware_training_tensorflow_mobilenet_v2": {
         "backend": "tf",
         "requirements": "examples/quantization_aware_training/tensorflow/mobilenet_v2/requirements.txt",
