Commit 1ac5afe

Added test for fq+lora tuning on CUDA
1 parent 4c45b57 commit 1ac5afe

File tree: 9 files changed (+163 -44 lines)


.github/workflows/examples.yml (+72)

@@ -56,6 +56,7 @@ jobs:
       run: |
         set +e
         python -m pytest -s -ra tests/cross_fw/examples \
+          -m 'not cuda' \
           --junit-xml=pytest-results.xml \
           --durations-path=tests/cross_fw/examples/.test_durations \
           --splitting-algorithm=least_duration \
@@ -72,6 +73,77 @@ jobs:
         pip install defusedxml==0.7.1
         python .github/scripts/pytest_md_summary.py pytest-results.xml >> $GITHUB_STEP_SUMMARY

+  examples-cuda:
+    name: Test examples CUDA [${{ matrix.group }}/1]
+    runs-on: aks-linux-4-cores-28gb-gpu-tesla-t4
+    timeout-minutes: 40
+    if: ${{ inputs.gpu_enabled == true }}
+    strategy:
+      fail-fast: false
+      matrix:
+        group: [1]
+    defaults:
+      run:
+        shell: bash
+    env:
+      DEBIAN_FRONTEND: noninteractive
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get --assume-yes install build-essential ninja-build libgl1-mesa-dev libglib2.0-0 wget make
+      - name: Download CUDA
+        run: |
+          wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run
+          sudo sh cuda_12.4.0_550.54.14_linux.run --toolkit --silent
+      - name: Runner info
+        continue-on-error: true
+        run: |
+          export PATH=/usr/local/cuda-12.4/bin${PATH:+:${PATH}}
+          export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+          nvidia-smi
+          cat /proc/cpuinfo
+          nvcc --version
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          lfs: true
+          fetch-depth: 0 # Fetch full history to allow checking out any branch or PR
+      - name: Fetch and Checkout the Pull Request Branch
+        if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.pull_request_number != '' }}
+        run: |
+          git fetch origin pull/${{ github.event.inputs.pull_request_number }}/head:pr-${{ github.event.inputs.pull_request_number }}
+          git checkout pr-${{ github.event.inputs.pull_request_number }}
+      - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
+        with:
+          python-version: 3.10.14
+      - name: cpuinfo
+        run: cat /proc/cpuinfo
+      - name: Install test requirements
+        run: |
+          pip install -r tests/cross_fw/examples/requirements.txt
+      - name: Print installed modules
+        run: pip list
+      - name: Run examples test scope
+        run: |
+          set +e
+          python -m pytest -s -ra tests/cross_fw/examples \
+            -m cuda \
+            --junit-xml=pytest-results.xml \
+            --durations-path=tests/cross_fw/examples/.test_durations \
+            --splitting-algorithm=least_duration \
+            --splits 1 \
+            --group ${{ matrix.group }} \
+            ${{ github.event.inputs.pytest_args || '' }}
+          ret=$?
+          [ $ret -eq 5 ] && [ -n "${{ github.event.inputs.pytest_args || '' }}" ] && exit 0 || exit $ret
+        env:
+          TQDM_DISABLE: 1
+      - name: Test Summary
+        if: ${{ !cancelled() }}
+        run: |
+          pip install defusedxml==0.7.1
+          python .github/scripts/pytest_md_summary.py pytest-results.xml >> $GITHUB_STEP_SUMMARY
+
   examples-win-cpu:
     timeout-minutes: 80
     name: Test examples CPU Windows [${{ matrix.group }}/4]
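The CPU and CUDA jobs split the example scope with the `cuda` pytest marker selected via `-m 'not cuda'` and `-m cuda` above. As a rough sketch only, assuming a hypothetical conftest.py and test name (the real wiring lives in tests/cross_fw/examples and is not part of this commit), such a marker is typically registered and applied like this:

# conftest.py (hypothetical): register the marker so `-m cuda` / `-m 'not cuda'` can filter tests.
def pytest_configure(config):
    config.addinivalue_line("markers", "cuda: example tests that require an NVIDIA GPU")


# test_examples.py (hypothetical): a CUDA-only example test selected by the new job.
import pytest


@pytest.mark.cuda
def test_llm_compression_qat_with_lora():
    ...  # would be collected by `python -m pytest tests/cross_fw/examples -m cuda`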

examples/llm_compression/torch/qat_with_lora/main.py (+42 -27)

@@ -13,7 +13,7 @@
 import sys
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 import torch
 import torch.nn.functional as F
@@ -24,39 +24,39 @@
 from torch import Tensor
 from torch import nn
 from torch.utils.tensorboard import SummaryWriter
-from tqdm import tqdm
-from tqdm import trange
 from transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer
 from whowhatbench import TextEvaluator

 import nncf
+from nncf.common.logging.track_progress import track
 from nncf.data.dataset import Dataset
 from nncf.parameters import CompressionFormat
 from nncf.parameters import CompressWeightsMode
 from nncf.parameters import StripFormat
+from nncf.quantization.advanced_parameters import AdvancedCompressionParameters
 from nncf.quantization.quantize_model import compress_weights
 from nncf.torch.model_creation import load_from_config
 from nncf.torch.quantization.layers import AsymmetricLoraQuantizer
 from nncf.torch.quantization.layers import SymmetricLoraQuantizer


-def get_wikitext2(nsamples: int, seqlen: int, tokenizer: Any, device: torch.device) -> List[Tensor]:
+def get_wikitext2(num_samples: int, seqlen: int, tokenizer: Any, device: torch.device) -> List[Tensor]:
     """
     Loads and processes the Wikitext-2 dataset for training.

-    :param nsamples: Number of samples to generate.
+    :param num_samples: Number of samples to generate.
     :param seqlen: Sequence length for each sample.
     :param tokenizer: Tokenizer to encode the text.
     :param device: Device to move the tensors to (e.g., 'cpu' or 'cuda').
     :return: A list of tensors containing the tokenized text samples.
     """
     traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
-    limit = nsamples * seqlen // 4  # ~1k for 128 samples with seqlen=32 to be aligned with optimum
+    limit = num_samples * seqlen // 4  # ~1k for 128 samples with seqlen=32 to be aligned with optimum
     text = "".join([" \n" if s == "" else s for s in traindata["text"][:limit]])
     trainenc = tokenizer(text, return_tensors="pt")
     trainloader = []
-    for _ in range(nsamples):
+    for _ in range(num_samples):
         # Crop a sequence of tokens of length seqlen starting at a random position
         i = torch.randint(0, trainenc.input_ids.shape[1] - seqlen - 1, (1,)).item()
         j = i + seqlen
@@ -66,7 +66,7 @@ def get_wikitext2(nsamples: int, seqlen: int, tokenizer: Any, device: torch.devi


 @torch.no_grad()
-def save_wwb_ref(model: str, tokenizer: Any, wwb_ref_file: Path) -> None:
+def save_wwb_ref(model: str, tokenizer: Any, wwb_ref_file: Path, num_samples: Optional[int] = None) -> None:
     """
     Save the reference answers for the WWB (WhoWhatBenchmark) evaluation.

@@ -76,12 +76,14 @@ def save_wwb_ref(model: str, tokenizer: Any, wwb_ref_file: Path) -> None:
     """
     if not wwb_ref_file.exists():
         print("#" * 50 + " Collect reference answers for WWB " + "#" * 50)
-        wwb_eval = TextEvaluator(base_model=model, tokenizer=tokenizer, use_chat_template=True)
+        wwb_eval = TextEvaluator(base_model=model, tokenizer=tokenizer, use_chat_template=True, num_samples=num_samples)
         wwb_eval.dump_gt(str(wwb_ref_file))
         torch.cuda.empty_cache()


-def measure_similarity(model_for_eval: OVModelForCausalLM, tokenizer: Any, wwb_ref_file: Path) -> float:
+def measure_similarity(
+    model_for_eval: OVModelForCausalLM, tokenizer: Any, wwb_ref_file: Path, num_samples: Optional[int] = None
+) -> float:
     """
     Measures the similarity of a model's output to a reference outputs from a given file using WWB evaluation.

@@ -92,7 +94,11 @@ def measure_similarity(model_for_eval: OVModelForCausalLM, tokenizer: Any, wwb_r
     """
     print("#" * 50 + " Evaluate via WWB " + "#" * 50)
     wwb_eval = TextEvaluator(
-        tokenizer=tokenizer, gt_data=wwb_ref_file, test_data=str(wwb_ref_file), use_chat_template=True
+        tokenizer=tokenizer,
+        gt_data=wwb_ref_file,
+        test_data=str(wwb_ref_file),
+        use_chat_template=True,
+        num_samples=num_samples,
     )
     _, all_metrics = wwb_eval.score(model_for_eval)
     return float(all_metrics["similarity"].iloc[0])
@@ -108,8 +114,8 @@ def calc_hiddens(model: nn.Module, dataloader: List[Tensor]) -> List[Tensor]:
     :return: A list of hidden states for each input in the dataloader.
     """
     orig_hiddens = []
-    for i in trange(len(dataloader), total=len(dataloader), desc="Calculating original hiddens", leave=False):
-        model_input = get_model_input(dataloader[i])
+    for data in track(dataloader, description="Calculating original hiddens"):
+        model_input = get_model_input(data)
         orig_hiddens.append(model.model(**model_input).last_hidden_state)
     torch.cuda.empty_cache()
     return orig_hiddens
@@ -260,10 +266,12 @@ def get_argument_parser() -> argparse.ArgumentParser:
         help="Whether to start from previously saved checkpoint. If not specified or checkpoint does not exist, "
         "start from scratch by post-training weight compression initialization.",
     )
+    parser.add_argument("--lora_rank", type=int, default=256, help="Rank of lora adapters")

     # Data params
-    parser.add_argument("--nsamples", type=int, default=1024, help="Number of training samples")
+    parser.add_argument("--num_train_samples", type=int, default=1024, help="Number of training samples")
     parser.add_argument("--seqlen", type=int, default=1024, help="Calibration data context length.")
+    parser.add_argument("--num_val_samples", type=int, default=None, help="Number of validation samples for WWB.")

     # Training params
     parser.add_argument(
@@ -286,7 +294,7 @@ def get_argument_parser() -> argparse.ArgumentParser:

 def main(argv) -> float:
     """
-    Fine-tunes the specified model and returns the best validation similarity score.
+    Fine-tunes the specified model and returns the difference between initial and best validation similarity scores.
     """
     parser = get_argument_parser()
     args = parser.parse_args(argv)
@@ -295,7 +303,10 @@ def main(argv) -> float:
     device = "cuda"
     torch_dtype = torch.bfloat16
     compression_config = dict(
-        mode=CompressWeightsMode.INT4_ASYM, group_size=64, compression_format=CompressionFormat.FQ_LORA
+        mode=CompressWeightsMode.INT4_ASYM,
+        group_size=64,
+        compression_format=CompressionFormat.FQ_LORA,
+        advanced_parameters=AdvancedCompressionParameters(lora_adapter_rank=args.lora_rank),
     )

     # Configure output and log files.
@@ -320,11 +331,13 @@ def main(argv) -> float:
     # computed by for data generated by two models, original floating-point one and optimized.
     # TODO: (nlyalyus) Use original model for collecting reference, once the bug in WWB resolved.
     wwb_ref_model = AutoModelForCausalLM.from_pretrained(args.pretrained, torch_dtype=torch_dtype, device_map="cpu")
-    save_wwb_ref(wwb_ref_model, tokenizer, wwb_ref_file)
+    save_wwb_ref(wwb_ref_model, tokenizer, wwb_ref_file, args.num_val_samples)
    del wwb_ref_model

     # Prepare training data and pre-compute hiddens of teacher model for distillation loss.
-    train_loader = get_wikitext2(nsamples=args.nsamples, seqlen=args.seqlen, tokenizer=tokenizer, device=device)
+    train_loader = get_wikitext2(
+        num_samples=args.num_train_samples, seqlen=args.seqlen, tokenizer=tokenizer, device=device
+    )
     orig_hiddens = calc_hiddens(model, train_loader)

     # Create or load model to tune with Fake Quantizers and absorbable LoRA adapters.
@@ -341,9 +354,11 @@ def main(argv) -> float:

     # Convert torch checkpoint to an OpenVINO model and evaluate it via WWB.
     model_for_eval = export_to_openvino(args.pretrained, train_loader[0], ckpt_file, last_dir)
-    best_similarity = measure_similarity(model_for_eval, tokenizer, wwb_ref_file)
-    tb.add_scalar("similarity", best_similarity, 0)
-    print(f"Initial WWB similarity= {best_similarity:.4f}")
+    initial_similarity = best_similarity = measure_similarity(
+        model_for_eval, tokenizer, wwb_ref_file, args.num_val_samples
+    )
+    tb.add_scalar("similarity", initial_similarity, 0)
+    print(f"Initial WWB similarity= {initial_similarity:.4f}")

     # Run tuning with distillation loss and validation on WWB after each epoch.
     grad_accumulation_steps = args.batch_size // args.microbatch_size
@@ -354,7 +369,7 @@ def main(argv) -> float:
     loss_numerator = grad_steps = total_microbatches = 0
     for epoch in range(args.epochs):
         batch_indices_epoch = torch.randperm(num_samples)[:epoch_samples].chunk(microbatches_per_epoch)
-        for indices in tqdm(batch_indices_epoch, desc=f"Train epoch {epoch}", leave=[False]):
+        for indices in track(batch_indices_epoch, description=f"Train epoch {epoch}"):
             indices = indices.tolist()
             total_microbatches += 1

@@ -373,7 +388,7 @@ def form_batch(inputs: List[Tensor], model_input: bool):
             targets = torch.tanh(targets)
             targets = targets * fls
             outputs = model(**inputs).logits
-            loss = kl_div(outputs, targets.to(dtype=torch_dtype))
+            loss = kl_div(outputs, targets.to(device=device, dtype=torch_dtype))

             # Perform an optimization step after accumulating gradients over multiple minibatches.
             loss_numerator += loss.item()
@@ -393,16 +408,16 @@ def form_batch(inputs: List[Tensor], model_input: bool):
         # Save the best checkpoint and OpenVINO IR for the highest similarity score obtained from WWB.
         save_checkpoint(model, ckpt_file)
         model_for_eval = export_to_openvino(args.pretrained, train_loader[0], ckpt_file, last_dir)
-        similarity = measure_similarity(model_for_eval, tokenizer, wwb_ref_file)
-        print(f"[Epoch {epoch}], WWB similarity = {similarity:.4f}")
+        similarity = measure_similarity(model_for_eval, tokenizer, wwb_ref_file, args.num_val_samples)
+        print(f"[Epoch {epoch + 1}], WWB similarity = {similarity:.4f}")
         tb.add_scalar("similarity", similarity, total_microbatches)
         if similarity > best_similarity:
             print(f"New best WWB similarity = {similarity:.4f}")
             best_similarity = similarity
             shutil.copytree(last_dir, best_dir, dirs_exist_ok=True)

-    print(f"The finetuned OV model with the best similarity={best_similarity} saved to: {best_dir}")
-    return best_similarity
+    print(f"The finetuned OV model with the best similarity={best_similarity:.4f} saved to: {best_dir}")
+    return best_similarity - initial_similarity


 if __name__ == "__main__":
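For orientation, here is a minimal sketch of how the new --lora_rank option reaches NNCF through AdvancedCompressionParameters, mirroring the compression_config built in main.py. The model id and the one-sample calibration set are illustrative placeholders, not part of this commit; the real example passes tokenized Wikitext-2 samples and the model given by --pretrained.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

import nncf
from nncf.parameters import CompressionFormat, CompressWeightsMode
from nncf.quantization.advanced_parameters import AdvancedCompressionParameters

model_id = "HuggingFaceTB/SmolLM-135M"  # placeholder; the example takes --pretrained
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Tiny illustrative calibration set; main.py feeds tokenized Wikitext-2 samples here.
calibration = nncf.Dataset([tokenizer("calibration text", return_tensors="pt")["input_ids"]])

# INT4 weight compression in the FQ_LORA format (FakeQuantize + absorbable LoRA adapters),
# with the adapter rank supplied through AdvancedCompressionParameters, as --lora_rank does.
model = nncf.compress_weights(
    model,
    mode=CompressWeightsMode.INT4_ASYM,
    group_size=64,
    compression_format=CompressionFormat.FQ_LORA,
    advanced_parameters=AdvancedCompressionParameters(lora_adapter_rank=16),
    dataset=calibration,
)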

examples/llm_compression/torch/qat_with_lora/requirements.txt (+2 -2)

@@ -1,5 +1,5 @@
-tqdm
-tensorboard
+tensorboard==2.13.0
+torch==2.6.0
 whowhatbench @ git+https://github.com/openvinotoolkit/openvino.genai#subdirectory=tools/who_what_benchmark
 numpy>=1.23.5,<2
 openvino==2025.0

nncf/quantization/advanced_parameters.py (+7 -10)

@@ -365,26 +365,23 @@ class AdvancedCompressionParameters:
     :type statistics_path: str
     :param awq_params: Advanced parameters for AWQ algorithm.
     :type awq_params: AdvancedAWQParameters
-    :param scale_estimation_params: Advanced parameters for scale estimation algorithm.
+    :param scale_estimation_params: Advanced parameters for Scale Estimation algorithm.
     :type scale_estimation_params: AdvancedScaleEstimationParameters
+    :param gptq_params: Advanced parameters for GPTQ algorithm.
+    :type gptq_params: AdvancedGPTQParameters
+    :param lora_correction_params: Advanced parameters for Lora Correction algorithm.
+    :type lora_correction_params: AdvancedLoraCorrectionParameters
+    :param lora_adapter_rank: Rank of lora adapters for FQ_LORA format. Defaults to 256.
+    :type lora_adapter_rank: int
     """

     statistics_path: Optional[str] = None
-    # Advanced AWQ algorithm parameters
     awq_params: AdvancedAWQParameters = field(default_factory=AdvancedAWQParameters)
-
-    # Advanced scale estimation algorithm parameters
     scale_estimation_params: AdvancedScaleEstimationParameters = field(
         default_factory=AdvancedScaleEstimationParameters
     )
-
-    # Advanced GPTQ algorithm parameters
     gptq_params: AdvancedGPTQParameters = field(default_factory=AdvancedGPTQParameters)
-
-    # Advanced Lora Correction algorithm parameters
     lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters)
-
-    # rank of lora adapters for FQ_LORA format. Defaults to 256.
     lora_adapter_rank: int = 256

nncf/torch/quantization/layers.py (+4 -1)

@@ -1079,7 +1079,10 @@ def init_lora(self, lspec: PTLoraSpec):
         out_features, in_features = lspec.orig_weight_shape
         rank = lspec.lora_rank
         if rank > out_features or rank > in_features:
-            msg = f"Specified LoRA rank={rank} cannot exceed any dimension of the weight tensor"
+            msg = (
+                f"Specified LoRA rank={rank} cannot exceed any dimension of the weight tensor: "
+                f"[{out_features}, {in_features}]"
+            )
             raise nncf.ValidationError(msg)
         self.lora_A = torch.nn.Parameter(torch.ones((rank, in_features), dtype=default_lora_dtype))
         self.lora_B = torch.nn.Parameter(torch.zeros((out_features, rank), dtype=default_lora_dtype))
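The extended error message reflects the shape constraint visible in init_lora: lora_A has shape [rank, in_features] and lora_B has shape [out_features, rank], so their product must match the original weight and the rank can exceed neither dimension. A small self-contained illustration:

import torch

# Illustrative shapes only: rank must satisfy rank <= min(out_features, in_features).
out_features, in_features, rank = 128, 64, 16
lora_A = torch.ones((rank, in_features))
lora_B = torch.zeros((out_features, rank))
# The low-rank product has the same shape as the original weight, so it can be absorbed into it.
assert (lora_B @ lora_A).shape == (out_features, in_features)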

tests/cross_fw/examples/.test_durations (+2 -1)

@@ -15,5 +15,6 @@
     "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_torch_resnet18]": 1251.144,
     "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_fx_resnet18]": 412.243,
     "tests/cross_fw/examples/test_examples.py::test_examples[fp8_llm_quantization]": 229.69,
-    "tests.cross_fw.examples.test_examples.test_examples[quantization_aware_training_tensorflow_mobilenet_v2]": 1500.00
+    "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_tensorflow_mobilenet_v2]": 1500.00,
+    "tests/cross_fw/examples/test_examples.py::test_examples[llm_compression_qat_with_lora]": 665
 }

tests/cross_fw/examples/example_scope.json (+10)

@@ -274,6 +274,16 @@
             ]
         }
     },
+    "llm_compression_qat_with_lora": {
+        "backend": "torch",
+        "device": "cuda",
+        "requirements": "examples/llm_compression/torch/qat_with_lora/requirements.txt",
+        "cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz",
+        "accuracy_tolerance": 0.002,
+        "accuracy_metrics": {
+            "similarity_diff": 0.027
+        }
+    },
     "quantization_aware_training_tensorflow_mobilenet_v2": {
         "backend": "tf",
         "requirements": "examples/quantization_aware_training/tensorflow/mobilenet_v2/requirements.txt",
