In [4]:
!pip install transformers datasets torch accelerate



In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW
from tqdm import tqdm
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training

# Step 1: Load Your Dataset
file_path = '/content/drive/MyDrive/Ai/QnA Dataset.csv'  # Update the path
df = pd.read_csv(file_path)

# Step 2: Split the Dataset into Train and Test
train_df = df.sample(frac=0.8, random_state=42)  # 80% for training
test_df = df.drop(train_df.index)  # Remaining 20% for testing

# Step 3: Create a Custom Dataset
class QnADataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        question = row['Question']
        answer = row['Answers']
        prompt = f"Q: {question} A: {answer}"

        inputs = self.tokenizer(
            prompt,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
        }

# Step 4: Initialize Tokenizer and Model
model_checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
model.resize_token_embeddings(len(tokenizer))

# Step 5: Define Dataset and DataLoader
max_length = 512  # Example max length
batch_size = 4  # Example batch size

train_dataset = QnADataset(train_df, tokenizer, max_length)
test_dataset = QnADataset(test_df, tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Step 6: Define Optimizer and Mixed Precision
optimizer = AdamW(model.parameters(), lr=2e-5)
scaler = GradScaler()

# Checkpoint paths
checkpoint_dir = '/content/drive/MyDrive/Ai/model/'
model_file = os.path.join(checkpoint_dir, 'model.pt')

# Helper function to save checkpoint
def save_checkpoint(model, tokenizer, optimizer, epoch, global_step):
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "epoch": epoch,
        "global_step": global_step,
    }
    torch.save(checkpoint, model_file)
    tokenizer.save_pretrained(checkpoint_dir)
    print(f"Checkpoint saved at global step {global_step}, epoch {epoch} as {model_file}")

# Helper function to load checkpoint
def load_checkpoint(model, tokenizer, optimizer):
    if os.path.exists(model_file):
        # Load the checkpoint
        checkpoint = torch.load(model_file)
        # Load the model state dict
        model.load_state_dict(checkpoint["model_state_dict"])
        # Load the optimizer state dict
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        # Load the epoch and global step
        epoch = checkpoint["epoch"]
        global_step = checkpoint["global_step"]
        print(f"Resumed from checkpoint at {model_file}")
        return model, tokenizer, epoch, global_step
    else:
        print("No checkpoint found. Starting training from scratch.")
        return model, tokenizer, 0, 0

# Load model, tokenizer, and optimizer state if checkpoint exists
model, tokenizer, start_epoch, start_global_step = load_checkpoint(model, tokenizer, optimizer)

# Step 7: Training Loop
num_epochs = 50
global_step = start_global_step

model.train()
for epoch in range(start_epoch, num_epochs):
    loop = tqdm(train_loader, leave=True, dynamic_ncols=True, initial=global_step)
    epoch_loss = 0

    for step, batch in enumerate(loop):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with autocast():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=input_ids,
            )
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()
        global_step += 1

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

        # Save checkpoint every 5 global steps
        if global_step % 50 == 0:
            save_checkpoint(model, tokenizer, optimizer, epoch, global_step)

        # Clear memory after each batch
        del input_ids, attention_mask, outputs, loss
        torch.cuda.empty_cache()

    print(f"Epoch {epoch} Loss: {epoch_loss / len(train_loader)}")


    save_checkpoint(model, tokenizer, optimizer, epoch, global_step)


# Step 8: Evaluate the Model
model.eval()
test_loss = 0

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with autocast():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=input_ids,
            )
            loss = outputs.loss

        test_loss += loss.item()

        # Clear memory after each batch
        del input_ids, attention_mask, outputs, loss
        torch.cuda.empty_cache()

print(f"Test Loss: {test_loss / len(test_loader)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  scaler = GradScaler()
  checkpoint = torch.load(model_file)


Resumed from checkpoint at /content/drive/MyDrive/Ai/model/model.pt


  with autocast():
Epoch 0:   6%|▌         | 100/1668 [1:15:06<55:31:29, 127.48s/it, loss=0.12]

Checkpoint saved at global step 100, epoch 0 as /content/drive/MyDrive/Ai/model/model.pt


Epoch 0:   9%|▉         | 150/1668 [2:30:33<49:27:42, 117.30s/it, loss=0.0683]

Checkpoint saved at global step 150, epoch 0 as /content/drive/MyDrive/Ai/model/model.pt


Epoch 0:  10%|█         | 168/1668 [2:57:03<35:27:19, 85.09s/it, loss=0.0812]

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader
from tqdm import tqdm

# Paths for model and tokenizer
checkpoint_dir = '/content/drive/MyDrive/Ai/model/'
model_file = checkpoint_dir + 'model.pt'

# Load the trained model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
# Load the model using the checkpoint file, not the directory
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") # Load the model architecture
checkpoint = torch.load(model_file) # Load the saved checkpoint
model.load_state_dict(checkpoint["model_state_dict"]) # Load the saved weights
model.to(device)

# Set model to evaluation mode
model.eval()

# Step 1: Function for Inference
def generate_answer(question, model, tokenizer, max_length=512):
    """Generate an answer for a given question."""
    input_text = f"Q: {question} A:"
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
        )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Step 2: Test the Model on a Custom Input
while True:
    print("\nType your question below (or type 'exit' to stop):")
    question = input("Q: ").strip()
    if question.lower() == "exit":
        print("Exiting...")
        break

    answer = generate_answer(question, model, tokenizer)
    print(f"A: {answer}")
