Close Menu
    Main Menu
    • Home
    • News
    • Tech
    • Robotics
    • ML & Research
    • AI
    • Digital Transformation
    • AI Ethics & Regulation
    • Thought Leadership in AI

    Subscribe to Updates

    Get the latest creative news from FooBar about art, design and business.

    What's Hot

    Google Begins Rolling Out Long-Awaited @gmail.com Email Feature to Users

    January 17, 2026

    Black Forest Labs launches open source Flux.2 [klein] to generate AI images in less than a second

    January 17, 2026

    Enterprise AI’s New Architectural Control Point – O’Reilly

    January 17, 2026
    Facebook X (Twitter) Instagram
    UK Tech InsiderUK Tech Insider
    Facebook X (Twitter) Instagram
    UK Tech InsiderUK Tech Insider
    Home»Machine Learning & Research»Fine-Tuning a BERT Model – MachineLearningMastery.com
    Machine Learning & Research

    Fine-Tuning a BERT Model – MachineLearningMastery.com

    Oliver ChambersBy Oliver ChambersDecember 7, 2025No Comments4 Mins Read
    Facebook Twitter Pinterest Telegram LinkedIn Tumblr Email Reddit
    Fine-Tuning a BERT Model – MachineLearningMastery.com
    Share
    Facebook Twitter LinkedIn Pinterest Email Copy Link


    import collections

    import dataclasses

    import functools

     

    import torch

    import torch.nn as nn

    import torch.optim as optim

    import tqdm

    from datasets import load_dataset

    from tokenizers import Tokenizer

    from torch import Tensor

     

     

    # BERT config and mannequin outlined beforehand

    @dataclasses.dataclass

    class BertConfig:

        “”“Configuration for BERT mannequin.”“”

        vocab_size: int = 30522

        num_layers: int = 12

        hidden_size: int = 768

        num_heads: int = 12

        dropout_prob: float = 0.1

        pad_id: int = 0

        max_seq_len: int = 512

        num_types: int = 2

     

    class BertBlock(nn.Module):

        “”“One transformer block in BERT.”“”

        def __init__(self, hidden_size: int, num_heads: int, dropout_prob: float):

            tremendous().__init__()

            self.consideration = nn.MultiheadAttention(hidden_size, num_heads,

                                                   dropout=dropout_prob, batch_first=True)

            self.attn_norm = nn.LayerNorm(hidden_size)

            self.ff_norm = nn.LayerNorm(hidden_size)

            self.dropout = nn.Dropout(dropout_prob)

            self.feed_forward = nn.Sequential(

                nn.Linear(hidden_size, 4 * hidden_size),

                nn.GELU(),

                nn.Linear(4 * hidden_size, hidden_size),

            )

     

        def ahead(self, x: Tensor, pad_mask: Tensor) -> Tensor:

            # self-attention with padding masks and post-norm

            attn_output, _ = self.consideration(x, x, x, key_padding_mask=pad_mask)

            x = self.attn_norm(x + attn_output)

            # feed-forward with GeLU activation and post-norm

            ff_output = self.feed_forward(x)

            x = self.ff_norm(x + self.dropout(ff_output))

            return x

     

    class BertPooler(nn.Module):

        “”“Pooler layer for BERT to course of the [CLS] token output.”“”

        def __init__(self, hidden_size: int):

            tremendous().__init__()

            self.dense = nn.Linear(hidden_size, hidden_size)

            self.activation = nn.Tanh()

     

        def ahead(self, x: Tensor) -> Tensor:

            x = self.dense(x)

            x = self.activation(x)

            return x

     

    class BertModel(nn.Module):

        “”“Spine of BERT mannequin.”“”

        def __init__(self, config: BertConfig):

            tremendous().__init__()

            # embedding layers

            self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size,

                                                padding_idx=config.pad_id)

            self.type_embeddings = nn.Embedding(config.num_types, config.hidden_size)

            self.position_embeddings = nn.Embedding(config.max_seq_len, config.hidden_size)

            self.embeddings_norm = nn.LayerNorm(config.hidden_size)

            self.embeddings_dropout = nn.Dropout(config.dropout_prob)

            # transformer blocks

            self.blocks = nn.ModuleList([

                BertBlock(config.hidden_size, config.num_heads, config.dropout_prob)

                for _ in range(config.num_layers)

            ])

            # [CLS] pooler layer

            self.pooler = BertPooler(config.hidden_size)

     

        def ahead(self, input_ids: Tensor, token_type_ids: Tensor, pad_id: int = 0,

                    ) -> tuple[Tensor, Tensor]:

            # create consideration masks for padding tokens

            pad_mask = input_ids == pad_id

            # convert integer tokens to embedding vectors

            batch_size, seq_len = input_ids.form

            position_ids = torch.arange(seq_len, system=input_ids.system).unsqueeze(0)

            position_embeddings = self.position_embeddings(position_ids)

            type_embeddings = self.type_embeddings(token_type_ids)

            token_embeddings = self.word_embeddings(input_ids)

            x = token_embeddings + type_embeddings + place_embeddings

            x = self.embeddings_norm(x)

            x = self.embeddings_dropout(x)

            # course of the sequence with transformer blocks

            for block in self.blocks:

                x = block(x, pad_mask)

            # pool the hidden state of the `[CLS]` token

            pooled_output = self.pooler(x[:, 0, :])

            return x, pooled_output

     

    # Outline new BERT mannequin for query answering

    class BertForQuestionAnswering(nn.Module):

        “”“BERT mannequin for SQuAD query answering.”“”

        def __init__(self, config: BertConfig):

            tremendous().__init__()

            self.bert = BertModel(config)

            # Two outputs: begin and finish place logits

            self.qa_outputs = nn.Linear(config.hidden_size, 2)

     

        def ahead(self,

            input_ids: Tensor,

            token_type_ids: Tensor,

            pad_id: int = 0,

        ) -> tuple[Tensor, Tensor]:

            # Get sequence output from BERT (batch_size, seq_len, hidden_size)

            seq_output, pooled_output = self.bert(input_ids, token_type_ids, pad_id=pad_id)

            # Venture to start out and finish logits

            logits = self.qa_outputs(seq_output)  # (batch_size, seq_len, 2)

            start_logits = logits[:, :, 0]  # (batch_size, seq_len)

            end_logits = logits[:, :, 1]    # (batch_size, seq_len)

            return start_logits, finish_logits

     

    # Load SQuAD dataset for query answering

    dataset = load_dataset(“squad”)

     

    # Load the pretrained BERT tokenizer

    TOKENIZER_PATH = “wikitext-2_wordpiece.json”

    tokenizer = Tokenizer.from_file(TOKENIZER_PATH)

     

    # Setup collate perform to tokenize question-context pairs for the mannequin

    def collate(batch: checklist[dict], tokenizer: Tokenizer, max_len: int,

                ) -> tuple[Tensor, Tensor, Tensor, Tensor]:

        “”“Collate question-context pairs for the mannequin.”“”

        cls_id = tokenizer.token_to_id(“[CLS]”)

        sep_id = tokenizer.token_to_id(“[SEP]”)

        pad_id = tokenizer.token_to_id(“[PAD]”)

     

        input_ids_list = []

        token_type_ids_list = []

        start_positions = []

        end_positions = []

     

        for merchandise in batch:

            # Tokenize query and context

            query, context = merchandise[“question”], merchandise[“context”]

            question_ids = tokenizer.encode(query).ids

            context_ids = tokenizer.encode(context).ids

     

            # Construct enter: [CLS] query [SEP] context [SEP]

            input_ids = [cls_id, *question_ids, sep_id, *context_ids, sep_id]

            token_type_ids = [0] * (len(question_ids)+2) + [1] * (len(context_ids)+1)

     

            # Truncate or pad to max size

            if len(input_ids) > max_len:

                input_ids = input_ids[:max_len]

                token_type_ids = token_type_ids[:max_len]

            else:

                input_ids.prolong([pad_id] * (max_len – len(input_ids)))

                token_type_ids.prolong([1] * (max_len – len(token_type_ids)))

     

            # Discover reply place in tokens: Reply will not be within the context

            start_pos = end_pos = 0

            if len(merchandise[“answers”][“text”]) > 0:

                solutions = tokenizer.encode(merchandise[“answers”][“text”][0]).ids

                # discover the context offset of the reply in context_ids

                for i in vary(len(context_ids) – len(solutions) + 1):

                    if context_ids[i:i+len(answers)] == solutions:

                        start_pos = i + len(question_ids) + 2

                        end_pos = start_pos + len(solutions) – 1

                        break

                if end_pos >= max_len:

                    start_pos = end_pos = 0  # reply is clipped, therefore no reply

     

            input_ids_list.append(input_ids)

            token_type_ids_list.append(token_type_ids)

            start_positions.append(start_pos)

            end_positions.append(end_pos)

     

        input_ids_list = torch.tensor(input_ids_list)

        token_type_ids_list = torch.tensor(token_type_ids_list)

        start_positions = torch.tensor(start_positions)

        end_positions = torch.tensor(end_positions)

        return (input_ids_list, token_type_ids_list, start_positions, end_positions)

     

    batch_size = 16

    max_len = 384  # Longer for Q&A to accommodate context

    collate_fn = functools.partial(collate, tokenizer=tokenizer, max_len=max_len)

    train_loader = torch.utils.knowledge.DataLoader(dataset[“train”], batch_size=batch_size,

                                               shuffle=True, collate_fn=collate_fn)

    val_loader = torch.utils.knowledge.DataLoader(dataset[“validation”], batch_size=batch_size,

                                             shuffle=False, collate_fn=collate_fn)

     

    # Create Q&A mannequin with a pretrained basis BERT mannequin

    system = torch.system(“cuda” if torch.cuda.is_available() else “cpu”)

    config = BertConfig()

    mannequin = BertForQuestionAnswering(config)

    mannequin.to(system)

    mannequin.bert.load_state_dict(torch.load(“bert_model.pth”, map_location=system))

     

    # Coaching setup

    loss_fn = nn.CrossEntropyLoss()

    optimizer = optim.AdamW(mannequin.parameters(), lr=2e–5)

    num_epochs = 3

     

    for epoch in vary(num_epochs):

        mannequin.prepare()

        # Coaching

        with tqdm.tqdm(train_loader, desc=f“Epoch {epoch+1}/{num_epochs}”) as pbar:

            for batch in pbar:

                # get batched knowledge

                input_ids, token_type_ids, start_positions, end_positions = batch

                input_ids = input_ids.to(system)

                token_type_ids = token_type_ids.to(system)

                start_positions = start_positions.to(system)

                end_positions = end_positions.to(system)

                # ahead go

                start_logits, end_logits = mannequin(input_ids, token_type_ids)

                # backward go

                optimizer.zero_grad()

                start_loss = loss_fn(start_logits, start_positions)

                end_loss = loss_fn(end_logits, end_positions)

                loss = start_loss + end_loss

                loss.backward()

                optimizer.step()

                # replace progress bar

                pbar.set_postfix(loss=float(loss))

                pbar.replace(1)

     

        # Validation: Hold monitor of the typical loss and accuracy

        mannequin.eval()

        val_loss, num_matches, num_batches, num_samples = 0, 0, 0, 0

        with torch.no_grad():

            for batch in val_loader:

                # get batched knowledge

                input_ids, token_type_ids, start_positions, end_positions = batch

                input_ids = input_ids.to(system)

                token_type_ids = token_type_ids.to(system)

                start_positions = start_positions.to(system)

                end_positions = end_positions.to(system)

                # ahead go on validation knowledge

                start_logits, end_logits = mannequin(input_ids, token_type_ids)

                # compute loss

                start_loss = loss_fn(start_logits, start_positions)

                end_loss = loss_fn(end_logits, end_positions)

                loss = start_loss + end_loss

                val_loss += loss.merchandise()

                num_batches += 1

                # compute accuracy

                pred_start = start_logits.argmax(dim=–1)

                pred_end = end_logits.argmax(dim=–1)

                match = (pred_start == start_positions) & (pred_end == end_positions)

                num_matches += match.sum().merchandise()

                num_samples += len(start_positions)

     

        avg_loss = val_loss / num_batches

        acc = num_matches / num_samples

        print(f“Validation {epoch+1}/{num_epochs}: acc {acc:.4f}, avg loss {avg_loss:.4f}”)

     

    # Save the fine-tuned mannequin

    torch.save(mannequin.state_dict(), f“bert_model_squad.pth”)

    Share. Facebook Twitter Pinterest LinkedIn Tumblr Email
    Oliver Chambers
    • Website

    Related Posts

    Enterprise AI’s New Architectural Management Level – O’Reilly

    January 17, 2026

    The Knowledge-High quality Phantasm: Rethinking Classifier-Primarily based High quality Filtering for LLM Pretraining

    January 16, 2026

    How the Amazon AMET Funds crew accelerates check case technology with Strands Brokers

    January 16, 2026
    Top Posts

    Evaluating the Finest AI Video Mills for Social Media

    April 18, 2025

    Utilizing AI To Repair The Innovation Drawback: The Three Step Resolution

    April 18, 2025

    Midjourney V7: Quicker, smarter, extra reasonable

    April 18, 2025

    Meta resumes AI coaching utilizing EU person knowledge

    April 18, 2025
    Don't Miss

    Google Begins Rolling Out Long-Awaited @gmail.com Email Feature to Users

    By Declan MurphyJanuary 17, 2026

    Google has initiated a gradual rollout of a extremely requested function that permits customers to vary their…

    Black Forest Labs launches open supply Flux.2 [klein] to generate AI photos in lower than a second

    January 17, 2026

    Enterprise AI’s New Architectural Management Level – O’Reilly

    January 17, 2026

    Simplify cloud networking with Lumen® Multi-Cloud Gateway

    January 17, 2026
    Stay In Touch
    • Facebook
    • Twitter
    • Pinterest
    • Instagram
    • YouTube
    • Vimeo

    Subscribe to Updates

    Get the latest creative news from SmartMag about art & design.

    UK Tech Insider
    Facebook X (Twitter) Instagram
    • About Us
    • Contact Us
    • Privacy Policy
    • Terms Of Service
    • Our Authors
    © 2026 UK Tech Insider. All rights reserved by UK Tech Insider.

    Type above and press Enter to search. Press Esc to cancel.