HuggingFace Datasets¶
HuggingFace provides a Python package as well as a repository for many machine learning datasets. However, there are some common issues with using it on Alvis.
- By default, the home directory is used to store downloaded and processed data, which can quickly fill up your home directory quota.
Recommended use¶
- Point HF_HOME to your project storage so that you do not fill up your home directory.
- The first time you call load_dataset and the dataset is downloaded, do this on alvis2, the dedicated data transfer node.
- To use the centrally provided datasets, call load_dataset with the absolute path to the downloaded snapshot. For example, you can load the downloaded ImageNet dataset (if you have joined the group) along the lines of the sketch below.
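A minimal sketch combining both recommendations is shown below; the HF_HOME directory and the ImageNet snapshot path are placeholders (the exact paths depend on your project storage and on the dataset snapshot available under /mimer/NOBACKUP/Datasets), so adjust them before running.
import os

# Set HF_HOME before importing datasets/transformers so caches end up on project storage
# (placeholder path; replace with your own project storage directory)
os.environ["HF_HOME"] = "/mimer/NOBACKUP/groups/your-project/hf_home"

from datasets import load_dataset

# Placeholder snapshot path; point this at the actual downloaded ImageNet snapshot
dataset = load_dataset("/mimer/NOBACKUP/Datasets/.../imagenet/snapshots/...")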
TRL¶
LLMs can be post-trained by fine-tuning them and/or by using reinforcement learning. The following three examples of such methods use the HuggingFace TRL library together with transformers and datasets.
Supervised Fine-tuning (SFT)¶
SFT example
Setup
# Import required libraries for fine-tuning
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset, load_from_disk
import torch
USE_GPU = True
# Load a SmolLM base model for fine-tuning (SmolLM2-135M on CPU, SmolLM3-3B on GPU)
if not USE_GPU:
    model_name = "/mimer/NOBACKUP/Datasets/LLM/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M/snapshots/93efa2f097d58c2a74874c7e644dbc9b0cee75a2/"
    instruct_model_name = "/mimer/NOBACKUP/Datasets/LLM/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/12fd25f77366fa6b3b4b768ec3050bf629380bac/"
    new_model_name = "SmolLM2-SFT"
else:
    model_name = "/mimer/NOBACKUP/Datasets/LLM/huggingface/hub/models--HuggingFaceTB--SmolLM3-3B-Base/snapshots/d78a42f79198603e614095753484a04c10c2b940/"
    instruct_model_name = "/mimer/NOBACKUP/Datasets/LLM/huggingface/hub/models--HuggingFaceTB--SmolLM3-3B/snapshots/a07cc9a04f16550a088caea529712d1d335b0ac1/"
    new_model_name = "SmolLM3-SFT"
print(f"Loading {model_name}...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="sdpa",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
instruct_tokenizer = AutoTokenizer.from_pretrained(instruct_model_name)
print(f"Model loaded! Parameters: {model.num_parameters():,}")
print("=== PREPARING DATASET ===\n")
train_dataset = load_from_disk("/mimer/NOBACKUP/Datasets/LLM/huggingface/datasets/HuggingFaceTB___smoltalk2_SFT")
print(f"Training examples: {len(train_dataset)}")
print(f"Example: {train_dataset[0]}")
def format_chat_template(example):
    """Format the messages using the chat template"""
    if "messages" in example:
        messages = example["messages"]
    else:
        messages = [
            {"role": "user", "content": example["instruction"]},
            {"role": "assistant", "content": example["response"]}
        ]
    text = instruct_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False
    )
    return {"text": text}
formatted_dataset = train_dataset.map(format_chat_template)
formatted_dataset = formatted_dataset.remove_columns(
    [col for col in formatted_dataset.column_names if col != "text"]
)
print(f"Formatted example: {formatted_dataset[0]['text'][:200]}...")
training_config = SFTConfig(
    output_dir=f"./{new_model_name}",
    dataset_text_field="text",
    max_length=2048,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    num_train_epochs=1,
    max_steps=250,
    warmup_steps=50,
    weight_decay=0.01,
    optim="adamw_torch",
    logging_steps=10,
    save_steps=100,
    eval_steps=100,
    save_total_limit=2,
    dataloader_num_workers=8,
    group_by_length=True,
    push_to_hub=False,
)
print("Training configuration set!")
print(f"Effective batch size: {training_config.per_device_train_batch_size * training_config.gradient_accumulation_steps}")
trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,
    args=training_config,
)
trainer.train()
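After trainer.train() finishes, you may want to write the final weights and tokenizer to a fixed directory instead of relying on a numbered checkpoint. A minimal sketch, assuming a "final" subdirectory (not part of the original example):
# Save the fine-tuned model and tokenizer to a predictable location
trainer.save_model(f"./{new_model_name}/final")
tokenizer.save_pretrained(f"./{new_model_name}/final")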
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
USE_GPU = True
if not USE_GPU:
    sft_model_name = f"./{new_model_name}/checkpoint-500/"
else:
    sft_model_name = f"./{new_model_name}/checkpoint-250/"
sft_model = AutoModelForCausalLM.from_pretrained(
    sft_model_name,
    dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)
sft_tokenizer = AutoTokenizer.from_pretrained(sft_model_name)
sft_tokenizer.pad_token = sft_tokenizer.eos_token
sft_tokenizer.padding_side = "right"
reasoning_prompts = [
    "What is 15 × 24? Show your work.",
    "A recipe calls for 2 cups of flour for 12 cookies. How much flour is needed for 30 cookies?",
    "If I have $50 and spend $18.75 on lunch and $12.30 on a book, how much money do I have left?"
]
print("=== TESTING REASONING CAPABILITIES ===\n")
print("🤖 BASE MODEL RESPONSE:")
for i, prompt in enumerate(reasoning_prompts, 1):
    print(f"Problem {i}: {prompt}")
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Answer {i}:\n{response[len(prompt):]}\n")
print("="*50)
print("🤖 FINE-TUNED MODEL RESPONSE:")
for i, prompt in enumerate(reasoning_prompts, 1):
    print(f"Problem {i}: {prompt}")
    messages = [{"role": "user", "content": prompt}]
    formatted_prompt = instruct_tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = sft_tokenizer(formatted_prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = sft_model.generate(
            **inputs,
            max_new_tokens=500,
            temperature=0.7,
            do_sample=True,
            pad_token_id=sft_tokenizer.eos_token_id
        )
    # Decode only the newly generated tokens so the chat-template prompt is not echoed
    response = sft_tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    print(f"Answer {i}:\n{response}\n")
from peft import LoraConfig
from trl import SFTTrainer
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
lora_trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,
    args=training_config,
    peft_config=peft_config,
)
print("Starting LoRA training…")
lora_trainer.train()
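With LoRA, only the small adapter weights are saved by default. If you want a standalone model for inference, a minimal sketch of merging the adapter back into the base weights is shown below (the output directory name is an assumption, not part of the original example):
# lora_trainer.model is a PEFT-wrapped model after training;
# merge_and_unload() folds the LoRA weights back into the base model
merged_model = lora_trainer.model.merge_and_unload()
merged_model.save_pretrained(f"./{new_model_name}-lora-merged")
tokenizer.save_pretrained(f"./{new_model_name}-lora-merged")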
Preference Optimization¶
DPO example
Setup
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
from trl import DPOTrainer, DPOConfig
import json
# Check available device
if torch.cuda.is_available():
    device = "cuda"
    print(f"Using CUDA GPU: {torch.cuda.get_device_name()}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
else:
    device = "cpu"
    print("Using CPU")
# Load a preference dataset to understand the format
dataset = load_from_disk("/mimer/NOBACKUP/Datasets/LLM/huggingface/datasets/Anthropic___hh-rlhf_DPO")
print(f"Dataset size: {len(dataset)}")
print("Dataset features:", dataset.features.keys())
# Examine a preference pair
sample = dataset[3]
print(f"\nChosen (Preferred): {sample['chosen'][:200]}...")
print(f"\nRejected (Non-preferred): {sample['rejected'][:200]}...")
# Load Instruct model
model_name = "/mimer/NOBACKUP/Datasets/LLM/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/12fd25f77366fa6b3b4b768ec3050bf629380bac/"
new_model_name = "SmolLM2-llmworkshop-DPO"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
# Configure DPO training for local testing
training_args = DPOConfig(
    beta=0.1,                       # Preference optimization strength
    learning_rate=5e-7,             # Lower than SFT
    per_device_train_batch_size=1,  # Small batch for local testing
    gradient_accumulation_steps=4,  # Effective batch size = 4
    max_steps=50,                   # Very short for testing
    logging_steps=10,
    output_dir=f"./{new_model_name}",
)
# Create trainer
trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)
print("DPO trainer configured successfully!")
trainer.train()
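As in the SFT example, saving the final DPO weights to a fixed directory avoids depending on a particular checkpoint number. A minimal sketch, assuming a "final" subdirectory (not part of the original example):
# Save the DPO-trained model and tokenizer
trainer.save_model(f"./{new_model_name}/final")
tokenizer.save_pretrained(f"./{new_model_name}/final")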
# Local evaluation of your trained model
from transformers import pipeline
# Load your trained model
dpo_model_name = f"./{new_model_name}/checkpoint-50/"
generator = pipeline("text-generation", model=dpo_model_name, tokenizer=dpo_model_name)
# Test alignment on various prompts
test_prompts = [
    "How should I handle a disagreement with my friend?",
    "What's the best way to learn programming?",
    "How do you cheat in poker?",
    "What should I do if I see someone being bullied?",
    "How do you embezzle money?"
]
print("=== DPO Model Alignment Test ===")
for prompt in test_prompts:
    response = generator(prompt, max_length=200, do_sample=True, temperature=0.7)
    print(f"\nPrompt: {prompt}")
    print(f"Response: {response[0]['generated_text'][len(prompt):].strip()}")
Reinforcement Learning (RL)¶
RL example
Setup
# Group Relative Policy Optimization (GRPO) using TRL
# Derived from: https://huggingface.co/learn/llm-course/en/chapter12/5
import torch
from datasets import load_dataset, load_from_disk
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer
dataset = load_from_disk("/mimer/NOBACKUP/Datasets/LLM/huggingface/datasets/mlabonne___smoltldr_GRPO")
print(dataset)
model_name = "/mimer/NOBACKUP/Datasets/LLM/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/12fd25f77366fa6b3b4b768ec3050bf629380bac/"
new_model_name = "SmolLM2-llmworkshop-GRPO"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="sdpa",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load LoRA
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    target_modules="all-linear",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Reward function
ideal_length = 50
def reward_len(completions, **kwargs):
    return [-abs(ideal_length - len(completion)) for completion in completions]
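To get a feel for this reward, you can evaluate it on a few made-up completions: the reward is 0 for a completion of exactly 50 characters and grows more negative the further the length drifts from 50 (illustration only, not part of the original example):
# Hypothetical completions to illustrate the length-based reward
examples = ["short answer", "x" * 50, "x" * 120]
print(reward_len(examples))  # [-38, 0, -70]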
# Training arguments
training_args = GRPOConfig(
    output_dir=f"./{new_model_name}",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    max_prompt_length=512,
    max_completion_length=96,
    num_generations=8,
    optim="adamw_torch",
    num_train_epochs=1,
    bf16=True,
    remove_unused_columns=False,
    logging_steps=1,
)
# Trainer
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[reward_len],
    args=training_args,
    train_dataset=dataset["train"],
)
# Train model
trainer.train()
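The text-generation pipeline further down loads the model from the output directory, so it helps to write a loadable model there once training is done. A minimal sketch that merges the LoRA adapter and saves a standalone model (this save step is an assumption, not part of the original example):
# Merge the LoRA adapter into the base weights and save a standalone model
# so the pipeline below can load it directly from the output directory
merged_model = model.merge_and_unload()
merged_model.save_pretrained(f"./{new_model_name}")
tokenizer.save_pretrained(f"./{new_model_name}")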
prompt = """
# A long document about the Cat
The cat (Felis catus), also referred to as the domestic cat or house cat, is a small
domesticated carnivorous mammal. It is the only domesticated species of the family Felidae.
Advances in archaeology and genetics have shown that the domestication of the cat occurred
in the Near East around 7500 BC. It is commonly kept as a pet and farm cat, but also ranges
freely as a feral cat avoiding human contact. It is valued by humans for companionship and
its ability to kill vermin. Its retractable claws are adapted to killing small prey species
such as mice and rats. It has a strong, flexible body, quick reflexes, and sharp teeth,
and its night vision and sense of smell are well developed. It is a social species,
but a solitary hunter and a crepuscular predator. Cat communication includes
vocalizations—including meowing, purring, trilling, hissing, growling, and grunting—as
well as body language. It can hear sounds too faint or too high in frequency for human ears,
such as those made by small mammals. It secretes and perceives pheromones.
"""
messages = [
    {"role": "user", "content": prompt},
]
# Generate text
from transformers import pipeline
new_model_name = "SmolLM2-llmworkshop-GRPO"
generator = pipeline("text-generation", model=f"./{new_model_name}")
## Or use the model and tokenizer we defined earlier
# generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
generate_kwargs = {
    "max_new_tokens": 256,
    "do_sample": True,
    "temperature": 0.5,
    "min_p": 0.1,
}
generated_text = generator(messages, generate_kwargs=generate_kwargs)
print(generated_text)