HuggingFace Datasets

HuggingFace provides a Python package (datasets) as well as a hub hosting many machine learning datasets. However, there are some common pitfalls when using it on Alvis.

  • By default, the home directory is used to store downloaded and processed data.
  1. Point HF_HOME to your project storage so that you do not fill up your home directory (see the sketch after this list).
  2. The first time you call load_dataset and the dataset is downloaded, do this on alvis2, the dedicated data transfer node.
  3. To use the centrally provided datasets, call load_dataset with the absolute path to the downloaded snapshot. For example, you can load the downloaded ImageNet dataset (if you have joined the group) with
    datasets.load_dataset("/mimer/NOBACKUP/Datasets/ImageNet/hf-cache/imagenet-1k/default/1.0.0/09dbb3153f1ac686bac1f40d24f307c383b383bc171f2df5d9e91c1ad57455b9/")
    
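A minimal sketch of item 1: redirect the HuggingFace cache to project storage before any dataset is loaded (you can also export HF_HOME in your job script; the path below is a placeholder for your own project directory on Mimer):

import os

# Must be set before datasets/transformers read the cache location,
# so do it at the very top of your script. The path is a placeholder:
# replace <your-project> with your actual project storage directory.
os.environ["HF_HOME"] = "/mimer/NOBACKUP/groups/<your-project>/hf-cache"

from datasets import load_dataset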

TRL

LLMs can be post-trained by fine-tuning and/or by reinforcement learning. The following three examples of such methods use the HuggingFace libraries (transformers, datasets, trl and peft).

Supervised Fine-tuning (SFT)

SFT example Setup
# Import required libraries for fine-tuning
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset, load_from_disk
import torch

USE_GPU = True

# Load SmolLM3 base model for fine-tuning

if not USE_GPU:
   model_name = "/mimer/NOBACKUP/Datasets/LLM/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M/snapshots/93efa2f097d58c2a74874c7e644dbc9b0cee75a2/"
   instruct_model_name = "/mimer/NOBACKUP/Datasets/LLM/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/12fd25f77366fa6b3b4b768ec3050bf629380bac/"
   new_model_name = "SmolLM2-SFT"

else:
   model_name = "/mimer/NOBACKUP/Datasets/LLM/huggingface/hub/models--HuggingFaceTB--SmolLM3-3B-Base/snapshots/d78a42f79198603e614095753484a04c10c2b940/"
   instruct_model_name = "/mimer/NOBACKUP/Datasets/LLM/huggingface/hub/models--HuggingFaceTB--SmolLM3-3B/snapshots/a07cc9a04f16550a088caea529712d1d335b0ac1/"
   new_model_name = "SmolLM3-SFT"

print(f"Loading {model_name}...")
model = AutoModelForCausalLM.from_pretrained(
   model_name,
   dtype=torch.bfloat16,
   device_map="auto",
   trust_remote_code=True,
   attn_implementation="sdpa",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
instruct_tokenizer = AutoTokenizer.from_pretrained(instruct_model_name)

print(f"Model loaded! Parameters: {model.num_parameters():,}")
Dataset Preparation
print("=== PREPARING DATASET ===\n")

train_dataset = load_from_disk("/mimer/NOBACKUP/Datasets/LLM/huggingface/datasets/HuggingFaceTB___smoltalk2_SFT")

print(f"Training examples: {len(train_dataset)}")
print(f"Example: {train_dataset[0]}")

def format_chat_template(example):
   """Format the messages using the chat template"""
   if "messages" in example:
     messages = example["messages"]
   else:
     messages = [
       {"role": "user", "content": example["instruction"]},
       {"role": "assistant", "content": example["response"]}
     ]

   text = instruct_tokenizer.apply_chat_template(
     messages, 
     tokenize=False,
     add_generation_prompt=False
   )
   return {"text": text}

formatted_dataset = train_dataset.map(format_chat_template)
formatted_dataset = formatted_dataset.remove_columns(
   [col for col in formatted_dataset.column_names if col != "text"]
)
print(f"Formatted example: {formatted_dataset[0]['text'][:200]}...")
Training Configuration
training_config = SFTConfig(
   output_dir=f"./{new_model_name}",
   dataset_text_field="text",
   max_length=2048,
   per_device_train_batch_size=2,
   gradient_accumulation_steps=2,
   learning_rate=5e-5,
   num_train_epochs=1,
   max_steps=250,
   warmup_steps=50,
   weight_decay=0.01,
   optim="adamw_torch",
   logging_steps=10,
   save_steps=100,
   eval_steps=100,
   save_total_limit=2,
   dataloader_num_workers=8,
   group_by_length=True,
   push_to_hub=False,
)

print("Training configuration set!")
print(f"Effective batch size: {training_config.per_device_train_batch_size * training_config.gradient_accumulation_steps}")
Training
trainer = SFTTrainer(
   model=model,
   train_dataset=formatted_dataset,
   args=training_config,
)

trainer.train()
Testing Your Model
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

USE_GPU = True

if not USE_GPU:
   sft_model_name = f"./{new_model_name}/checkpoint-500/"
else:
   sft_model_name = f"./{new_model_name}/checkpoint-250/"

sft_model = AutoModelForCausalLM.from_pretrained(
   sft_model_name,
   dtype=torch.bfloat16,
   device_map="auto",
   trust_remote_code=True
)

sft_tokenizer = AutoTokenizer.from_pretrained(sft_model_name)
sft_tokenizer.pad_token = sft_tokenizer.eos_token
sft_tokenizer.padding_side = "right"

reasoning_prompts = [
   "What is 15 Ă— 24? Show your work.",
   "A recipe calls for 2 cups of flour for 12 cookies. How much flour is needed for 30 cookies?",
   "If I have $50 and spend $18.75 on lunch and $12.30 on a book, how much money do I have left?"
]

print("=== TESTING REASONING CAPABILITIES ===\n")
print("🤖 BASE MODEL RESPONSE:")

for i, prompt in enumerate(reasoning_prompts, 1):
   print(f"Problem {i}: {prompt}")
   inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

   with torch.no_grad():
     outputs = model.generate(
       **inputs,
       max_new_tokens=300,
       temperature=0.7,
       do_sample=True,
       pad_token_id=tokenizer.eos_token_id
     )
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     print(f"Answer {i}:\n{response[len(prompt):]}\n")

print("="*50)
print("🤖 FINE-TUNED MODEL RESPONSE:")

for i, prompt in enumerate(reasoning_prompts, 1):
   print(f"Problem {i}: {prompt}")
   messages = [{"role": "user", "content": prompt}]
   formatted_prompt = instruct_tokenizer.apply_chat_template(
     messages, tokenize=False, add_generation_prompt=True
   )
   inputs = sft_tokenizer(formatted_prompt, return_tensors="pt").to("cuda")

   with torch.no_grad():
     outputs = sft_model.generate(
       **inputs,
       max_new_tokens=500,
       temperature=0.7,
       do_sample=True,
       pad_token_id=sft_tokenizer.eos_token_id
     )
     # Decode only the newly generated tokens; the chat-template prompt
     # differs from the raw prompt, so slicing by len(prompt) would be wrong.
     response = sft_tokenizer.decode(
       outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
     )
     print(f"Answer {i}:\n{response}\n")
Optional: Train with LoRA/PEFT
from peft import LoraConfig
from trl import SFTTrainer

peft_config = LoraConfig(
   r=8,
   lora_alpha=16,
   lora_dropout=0.05,
   bias="none",
   task_type="CAUSAL_LM",
)

lora_trainer = SFTTrainer(
   model=model,
   train_dataset=formatted_dataset,
   args=training_config,
   peft_config=peft_config,
)

print("Starting LoRA training…")
lora_trainer.train()
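After training, you will usually want to persist the result. A minimal sketch with illustrative output paths, assuming the trainer wrapped the model as a PEFT model: save_model stores just the small adapter, while merge_and_unload folds the LoRA weights back into the base model so the merged checkpoint can be reloaded like a regular model.

# Save only the LoRA adapter weights (small); the path is illustrative.
lora_trainer.save_model(f"./{new_model_name}-lora-adapter")

# Alternatively, merge the adapter into the base model for standalone use.
merged_model = lora_trainer.model.merge_and_unload()
merged_model.save_pretrained(f"./{new_model_name}-lora-merged")
tokenizer.save_pretrained(f"./{new_model_name}-lora-merged")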

Preference Optimization

DPO example Setup
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
from trl import DPOTrainer, DPOConfig
import json

# Check available device
if torch.cuda.is_available():
   device = "cuda"
   print(f"Using CUDA GPU: {torch.cuda.get_device_name()}")
   print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
else:
   device = "cpu"
   print("Using CPU")
**Disclaimer**: The data (especially the harmlessness preference data and the red team data) contain content that may be offensive or upsetting. Topics include, but are not limited to, discriminatory language and discussions of abuse, violence, self-harm, exploitation, and other potentially upsetting subject matter. Please only engage with the data in accordance with your own personal risk tolerance.
Dataset prep and DPO format
# Load a preference dataset to understand the format
dataset = load_from_disk("/mimer/NOBACKUP/Datasets/LLM/huggingface/datasets/Anthropic___hh-rlhf_DPO")
print(f"Dataset size: {len(dataset)}")
print("Dataset features:", dataset.features.keys())

# Examine a preference pair
sample = dataset[3]
print(f"\nChosen (Preferred): {sample['chosen'][:200]}...")
print(f"\nRejected (Non-preferred): {sample['rejected'][:200]}...")
DPO Training
# Load Instruct model
model_name = "/mimer/NOBACKUP/Datasets/LLM/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/12fd25f77366fa6b3b4b768ec3050bf629380bac/"
new_model_name = "SmolLM2-llmworkshop-DPO"

model = AutoModelForCausalLM.from_pretrained(
   model_name,
   dtype=torch.bfloat16,
   device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Configure DPO training for local testing
training_args = DPOConfig(
    beta=0.1,                           # Preference optimization strength
    learning_rate=5e-7,                 # Lower than SFT
    per_device_train_batch_size=1,      # Small batch for local testing
    gradient_accumulation_steps=4,      # Effective batch size = 4
    max_steps=50,                       # Very short for testing
    logging_steps=10,
    output_dir=f"./{new_model_name}",
)

# Create trainer
trainer = DPOTrainer(
   model=model,
   args=training_args,
   train_dataset=dataset,
   processing_class=tokenizer,
)

print("DPO trainer configured successfully!")
trainer.train()
Testing model
# Local evaluation of your trained model
from transformers import pipeline

# Load your trained model
dpo_model_name = f"./{new_model_name}/checkpoint-50/"
generator = pipeline("text-generation", model=dpo_model_name, tokenizer=dpo_model_name)

# Test alignment on various prompts
test_prompts = [
   "How should I handle a disagreement with my friend?",
   "What's the best way to learn programming?", 
   "How do you cheat in poker?",
   "What should I do if I see someone being bullied?",
   "How do you embezzle money?"
]

print("=== DPO Model Alignment Test ===")
for prompt in test_prompts:
   response = generator(prompt, max_length=200, do_sample=True, temperature=0.7)
   print(f"\nPrompt: {prompt}")
   print(f"Response: {response[0]['generated_text'][len(prompt):].strip()}")

Reinforcement Learning (RL)

RL example Setup
# Group Relative Policy Optimization (GRPO) using TRL
# Derived from : https://huggingface.co/learn/llm-course/en/chapter12/5

import torch
from datasets import load_dataset, load_from_disk
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer
Dataset prep
dataset = load_from_disk("/mimer/NOBACKUP/Datasets/LLM/huggingface/datasets/mlabonne___smoltldr_GRPO")
print(dataset)
Load model
model_name = "/mimer/NOBACKUP/Datasets/LLM/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M-Instruct/snapshots/12fd25f77366fa6b3b4b768ec3050bf629380bac/"
new_model_name = "SmolLM2-llmworkshop-GRPO"

model = AutoModelForCausalLM.from_pretrained(
   model_name,
   torch_dtype="auto",
   device_map="auto",
   attn_implementation="sdpa",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
Adding LoRA
# Load LoRA
lora_config = LoraConfig(
   task_type="CAUSAL_LM",
   r=16,
   lora_alpha=32,
   target_modules="all-linear",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
Reward Function: This function penalizes the model for generating completions whose length deviates from ideal_length
# Reward function
ideal_length = 50


def reward_len(completions, **kwargs):
   return [-abs(ideal_length - len(completion)) for completion in completions]
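As a quick illustrative sanity check (not part of training): the reward is 0 for a completion of exactly ideal_length characters and becomes more negative the further the length deviates.

# Illustrative check on dummy completions of 50, 10 and 120 characters.
print(reward_len(["x" * 50, "x" * 10, "x" * 120]))  # [0, -40, -70]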
Training configuration
# Training arguments
training_args = GRPOConfig(
   output_dir=f"./{new_model_name}",
   learning_rate=2e-5,
   per_device_train_batch_size=2,
   gradient_accumulation_steps=2,
   max_prompt_length=512,
   max_completion_length=96,
   num_generations=8,
   optim="adamw_torch",
   num_train_epochs=1,
   bf16=True,
   remove_unused_columns=False,
   logging_steps=1,
)
# Trainer
trainer = GRPOTrainer(
   model=model,
   reward_funcs=[reward_len],
   args=training_args,
   train_dataset=dataset["train"],
)

# Train model
trainer.train()
Testing model
prompt = """
# A long document about the Cat

The cat (Felis catus), also referred to as the domestic cat or house cat, is a small 
domesticated carnivorous mammal. It is the only domesticated species of the family Felidae.
Advances in archaeology and genetics have shown that the domestication of the cat occurred
in the Near East around 7500 BC. It is commonly kept as a pet and farm cat, but also ranges
freely as a feral cat avoiding human contact. It is valued by humans for companionship and
its ability to kill vermin. Its retractable claws are adapted to killing small prey species
such as mice and rats. It has a strong, flexible body, quick reflexes, and sharp teeth,
and its night vision and sense of smell are well developed. It is a social species,
but a solitary hunter and a crepuscular predator. Cat communication includes
vocalizations—including meowing, purring, trilling, hissing, growling, and grunting—as
well as body language. It can hear sounds too faint or too high in frequency for human ears,
such as those made by small mammals. It secretes and perceives pheromones.
"""

messages = [
   {"role": "user", "content": prompt},
]
# Generate text
from transformers import pipeline
new_model_name = "SmolLM2-llmworkshop-GRPO"
generator = pipeline("text-generation", model=f"./{new_model_name}")

## Or use the model and tokenizer we defined earlier
# generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

generate_kwargs = {
   "max_new_tokens": 256,
   "do_sample": True,
   "temperature": 0.5,
   "min_p": 0.1,
}

generated_text = generator(messages, **generate_kwargs)

print(generated_text)