Close Menu
    Main Menu
    • Home
    • News
    • Tech
    • Robotics
    • ML & Research
    • AI
    • Digital Transformation
    • AI Ethics & Regulation
    • Thought Leadership in AI

    Subscribe to Updates

    Get the latest creative news from FooBar about art, design and business.

    What's Hot

    How to Prevent Prior Authorization Delays

    March 3, 2026

    Well-liked Iranian App BadeSaba was Hacked to Ship “Assist Is on the Means” Alerts

    March 3, 2026

    MWC 2026 Updates: Information, Updates and Product Bulletins

    March 3, 2026
    Facebook X (Twitter) Instagram
    UK Tech InsiderUK Tech Insider
    Facebook X (Twitter) Instagram
    UK Tech InsiderUK Tech Insider
    Home»Machine Learning & Research»Fine-Tuning a BERT Model – MachineLearningMastery.com
    Machine Learning & Research

    Fine-Tuning a BERT Model – MachineLearningMastery.com

    Oliver ChambersBy Oliver ChambersDecember 7, 2025No Comments4 Mins Read
    Facebook Twitter Pinterest Telegram LinkedIn Tumblr Email Reddit
    Fine-Tuning a BERT Model – MachineLearningMastery.com
    Share
    Facebook Twitter LinkedIn Pinterest Email Copy Link


    import collections

    import dataclasses

    import functools

     

    import torch

    import torch.nn as nn

    import torch.optim as optim

    import tqdm

    from datasets import load_dataset

    from tokenizers import Tokenizer

    from torch import Tensor

     

     

    # BERT config and model defined previously
    @dataclasses.dataclass
    class BertConfig:
        """Configuration for the BERT model (BERT-base defaults)."""
        vocab_size: int = 30522      # WordPiece vocabulary size
        num_layers: int = 12         # number of transformer blocks
        hidden_size: int = 768       # embedding / hidden dimension
        num_heads: int = 12          # attention heads per block
        dropout_prob: float = 0.1    # dropout probability used throughout
        pad_id: int = 0              # token id used for padding
        max_seq_len: int = 512       # maximum supported sequence length
        num_types: int = 2           # token-type (segment) vocabulary size

     

    class BertBlock(nn.Module):
        """One post-norm transformer encoder block: self-attention + feed-forward."""

        def __init__(self, hidden_size: int, num_heads: int, dropout_prob: float):
            super().__init__()
            self.attention = nn.MultiheadAttention(hidden_size, num_heads,
                                                   dropout=dropout_prob, batch_first=True)
            self.attn_norm = nn.LayerNorm(hidden_size)
            self.ff_norm = nn.LayerNorm(hidden_size)
            self.dropout = nn.Dropout(dropout_prob)
            # position-wise feed-forward with the standard 4x expansion
            self.feed_forward = nn.Sequential(
                nn.Linear(hidden_size, 4 * hidden_size),
                nn.GELU(),
                nn.Linear(4 * hidden_size, hidden_size),
            )

        def forward(self, x: Tensor, pad_mask: Tensor) -> Tensor:
            """Run one block.

            Args:
                x: Input of shape (batch, seq_len, hidden_size).
                pad_mask: Boolean key-padding mask (batch, seq_len); True = pad.

            Returns:
                Tensor of the same shape as `x`.
            """
            # self-attention with padding mask and post-norm residual
            attn_output, _ = self.attention(x, x, x, key_padding_mask=pad_mask)
            x = self.attn_norm(x + attn_output)
            # feed-forward with GELU activation, dropout, and post-norm residual
            ff_output = self.feed_forward(x)
            x = self.ff_norm(x + self.dropout(ff_output))
            return x

     

    class BertPooler(nn.Module):
        """Pooler layer for BERT: projects the [CLS] token output through tanh."""

        def __init__(self, hidden_size: int):
            super().__init__()
            self.dense = nn.Linear(hidden_size, hidden_size)
            self.activation = nn.Tanh()

        def forward(self, x: Tensor) -> Tensor:
            """Return tanh(dense(x)); same shape as input."""
            return self.activation(self.dense(x))

     

    class BertModel(nn.Module):
        """Backbone of the BERT model: embeddings, transformer blocks, pooler."""

        def __init__(self, config: BertConfig):
            super().__init__()
            # embedding layers (word + token type + position)
            self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size,
                                                padding_idx=config.pad_id)
            self.type_embeddings = nn.Embedding(config.num_types, config.hidden_size)
            self.position_embeddings = nn.Embedding(config.max_seq_len, config.hidden_size)
            self.embeddings_norm = nn.LayerNorm(config.hidden_size)
            self.embeddings_dropout = nn.Dropout(config.dropout_prob)
            # stack of transformer blocks
            self.blocks = nn.ModuleList([
                BertBlock(config.hidden_size, config.num_heads, config.dropout_prob)
                for _ in range(config.num_layers)
            ])
            # [CLS] pooler layer
            self.pooler = BertPooler(config.hidden_size)

        def forward(self, input_ids: Tensor, token_type_ids: Tensor, pad_id: int = 0,
                    ) -> tuple[Tensor, Tensor]:
            """Encode a batch of token sequences.

            Args:
                input_ids: Token ids of shape (batch, seq_len).
                token_type_ids: Segment ids of shape (batch, seq_len).
                pad_id: Token id treated as padding for the attention mask.

            Returns:
                Tuple of (sequence output (batch, seq_len, hidden),
                pooled [CLS] output (batch, hidden)).
            """
            # create attention mask for padding tokens (True = ignore)
            pad_mask = input_ids == pad_id
            # convert integer tokens to embedding vectors
            batch_size, seq_len = input_ids.shape
            position_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
            position_embeddings = self.position_embeddings(position_ids)
            type_embeddings = self.type_embeddings(token_type_ids)
            token_embeddings = self.word_embeddings(input_ids)
            x = token_embeddings + type_embeddings + position_embeddings
            x = self.embeddings_norm(x)
            x = self.embeddings_dropout(x)
            # process the sequence with the transformer blocks
            for block in self.blocks:
                x = block(x, pad_mask)
            # pool the hidden state of the `[CLS]` token (position 0)
            pooled_output = self.pooler(x[:, 0, :])
            return x, pooled_output

     

    # Define a new BERT model for question answering
    class BertForQuestionAnswering(nn.Module):
        """BERT model for SQuAD-style extractive question answering."""

        def __init__(self, config: BertConfig):
            super().__init__()
            self.bert = BertModel(config)
            # Two outputs per token: start and end position logits
            self.qa_outputs = nn.Linear(config.hidden_size, 2)

        def forward(self,
            input_ids: Tensor,
            token_type_ids: Tensor,
            pad_id: int = 0,
        ) -> tuple[Tensor, Tensor]:
            """Return (start_logits, end_logits), each (batch, seq_len)."""
            # Get sequence output from BERT (batch_size, seq_len, hidden_size)
            seq_output, pooled_output = self.bert(input_ids, token_type_ids, pad_id=pad_id)
            # Project to start and end logits
            logits = self.qa_outputs(seq_output)  # (batch_size, seq_len, 2)
            start_logits = logits[:, :, 0]  # (batch_size, seq_len)
            end_logits = logits[:, :, 1]    # (batch_size, seq_len)
            return start_logits, end_logits

     

    # Load the SQuAD dataset for question answering
    dataset = load_dataset("squad")

    # Load the pretrained BERT tokenizer
    TOKENIZER_PATH = "wikitext-2_wordpiece.json"
    tokenizer = Tokenizer.from_file(TOKENIZER_PATH)

     

    # Collate function: tokenize question-context pairs for the model
    def collate(batch: list[dict], tokenizer: "Tokenizer", max_len: int,
                ) -> tuple[Tensor, Tensor, Tensor, Tensor]:
        """Collate question-context pairs for the model.

        Args:
            batch: SQuAD-style examples with "question", "context", "answers".
            tokenizer: Object providing `encode(text).ids` and `token_to_id`.
            max_len: Fixed sequence length; inputs are truncated or padded to it.

        Returns:
            Tuple of (input_ids, token_type_ids, start_positions, end_positions)
            tensors. Start/end positions are 0 when no answer survives in the
            (possibly truncated) input.
        """
        cls_id = tokenizer.token_to_id("[CLS]")
        sep_id = tokenizer.token_to_id("[SEP]")
        pad_id = tokenizer.token_to_id("[PAD]")

        input_ids_list = []
        token_type_ids_list = []
        start_positions = []
        end_positions = []

        for item in batch:
            # Tokenize question and context
            question, context = item["question"], item["context"]
            question_ids = tokenizer.encode(question).ids
            context_ids = tokenizer.encode(context).ids

            # Build input: [CLS] question [SEP] context [SEP]
            input_ids = [cls_id, *question_ids, sep_id, *context_ids, sep_id]
            token_type_ids = [0] * (len(question_ids)+2) + [1] * (len(context_ids)+1)

            # Truncate or pad to the fixed max length
            if len(input_ids) > max_len:
                input_ids = input_ids[:max_len]
                token_type_ids = token_type_ids[:max_len]
            else:
                input_ids.extend([pad_id] * (max_len - len(input_ids)))
                token_type_ids.extend([1] * (max_len - len(token_type_ids)))

            # Find the answer span in token space; 0 means "no answer"
            start_pos = end_pos = 0
            if len(item["answers"]["text"]) > 0:
                answers = tokenizer.encode(item["answers"]["text"][0]).ids
                # find the offset of the answer tokens within context_ids
                for i in range(len(context_ids) - len(answers) + 1):
                    if context_ids[i:i+len(answers)] == answers:
                        # shift past [CLS] + question + [SEP]
                        start_pos = i + len(question_ids) + 2
                        end_pos = start_pos + len(answers) - 1
                        break
                if end_pos >= max_len:
                    start_pos = end_pos = 0  # answer was clipped, hence no answer

            input_ids_list.append(input_ids)
            token_type_ids_list.append(token_type_ids)
            start_positions.append(start_pos)
            end_positions.append(end_pos)

        return (torch.tensor(input_ids_list),
                torch.tensor(token_type_ids_list),
                torch.tensor(start_positions),
                torch.tensor(end_positions))

     

    batch_size = 16
    max_len = 384  # longer than typical for Q&A, to accommodate the context passage
    collate_fn = functools.partial(collate, tokenizer=tokenizer, max_len=max_len)
    train_loader = torch.utils.data.DataLoader(dataset["train"], batch_size=batch_size,
                                               shuffle=True, collate_fn=collate_fn)
    val_loader = torch.utils.data.DataLoader(dataset["validation"], batch_size=batch_size,
                                             shuffle=False, collate_fn=collate_fn)

     

    # Create the Q&A model on top of a pretrained foundation BERT model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config = BertConfig()
    model = BertForQuestionAnswering(config)
    model.to(device)
    # load pretrained backbone weights into the BERT submodule only
    model.bert.load_state_dict(torch.load("bert_model.pth", map_location=device))

     

    # Training setup
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=2e-5)
    num_epochs = 3

     

    for epoch in range(num_epochs):
        # Training
        model.train()
        with tqdm.tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}") as pbar:
            for batch in pbar:
                # move the batched data to the target device
                input_ids, token_type_ids, start_positions, end_positions = batch
                input_ids = input_ids.to(device)
                token_type_ids = token_type_ids.to(device)
                start_positions = start_positions.to(device)
                end_positions = end_positions.to(device)
                # forward pass
                start_logits, end_logits = model(input_ids, token_type_ids)
                # backward pass: loss is the sum of start- and end-position losses
                optimizer.zero_grad()
                start_loss = loss_fn(start_logits, start_positions)
                end_loss = loss_fn(end_logits, end_positions)
                loss = start_loss + end_loss
                loss.backward()
                optimizer.step()
                # update progress bar; iterating `pbar` already advances it,
                # so no explicit pbar.update(1) (that would double-count)
                pbar.set_postfix(loss=float(loss))

        # Validation: keep track of the average loss and exact-match accuracy
        model.eval()
        val_loss, num_matches, num_batches, num_samples = 0, 0, 0, 0
        with torch.no_grad():
            for batch in val_loader:
                # move the batched data to the target device
                input_ids, token_type_ids, start_positions, end_positions = batch
                input_ids = input_ids.to(device)
                token_type_ids = token_type_ids.to(device)
                start_positions = start_positions.to(device)
                end_positions = end_positions.to(device)
                # forward pass on validation data
                start_logits, end_logits = model(input_ids, token_type_ids)
                # compute loss
                start_loss = loss_fn(start_logits, start_positions)
                end_loss = loss_fn(end_logits, end_positions)
                loss = start_loss + end_loss
                val_loss += loss.item()
                num_batches += 1
                # exact match: both start and end positions predicted correctly
                pred_start = start_logits.argmax(dim=-1)
                pred_end = end_logits.argmax(dim=-1)
                match = (pred_start == start_positions) & (pred_end == end_positions)
                num_matches += match.sum().item()
                num_samples += len(start_positions)

        avg_loss = val_loss / num_batches
        acc = num_matches / num_samples
        print(f"Validation {epoch+1}/{num_epochs}: acc {acc:.4f}, avg loss {avg_loss:.4f}")

     

    # Save the fine-tuned model (full Q&A head + backbone state dict)
    torch.save(model.state_dict(), "bert_model_squad.pth")

    Share. Facebook Twitter Pinterest LinkedIn Tumblr Email
    Oliver Chambers
    • Website

    Related Posts

    Reduce Doc AI Prices 90%

    March 3, 2026

    Why Capability Planning Is Again – O’Reilly

    March 2, 2026

    The Potential of CoT for Reasoning: A Nearer Have a look at Hint Dynamics

    March 2, 2026
    Top Posts

    Evaluating the Finest AI Video Mills for Social Media

    April 18, 2025

    Utilizing AI To Repair The Innovation Drawback: The Three Step Resolution

    April 18, 2025

    Midjourney V7: Quicker, smarter, extra reasonable

    April 18, 2025

    Meta resumes AI coaching utilizing EU person knowledge

    April 18, 2025
    Don't Miss

    ​​Methods to Stop Prior Authorization Delays

    By Hannah O’SullivanMarch 3, 2026

    Prior authorization was designed to make sure medical necessity and…

    Well-liked Iranian App BadeSaba was Hacked to Ship “Assist Is on the Means” Alerts

    March 3, 2026

    MWC 2026 Updates: Information, Updates and Product Bulletins

    March 3, 2026

    Fixing the Pupil Debt Disaster with U.S. Information CEO Eric Gertler

    March 3, 2026
    Stay In Touch
    • Facebook
    • Twitter
    • Pinterest
    • Instagram
    • YouTube
    • Vimeo

    Subscribe to Updates

    Get the latest creative news from SmartMag about art & design.

    UK Tech Insider
    Facebook X (Twitter) Instagram
    • About Us
    • Contact Us
    • Privacy Policy
    • Terms Of Service
    • Our Authors
    © 2026 UK Tech Insider. All rights reserved by UK Tech Insider.

    Type above and press Enter to search. Press Esc to cancel.