Pretrain a BERT Model from Scratch

By Yasmin Bhatti | December 4, 2025


import dataclasses

import datasets
import torch
import torch.nn as nn
import tqdm


@dataclasses.dataclass
class BertConfig:
    """Configuration for the BERT model."""
    vocab_size: int = 30522
    num_layers: int = 12
    hidden_size: int = 768
    num_heads: int = 12
    dropout_prob: float = 0.1
    pad_id: int = 0
    max_seq_len: int = 512
    num_types: int = 2
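
The defaults above correspond to BERT-base (12 layers, 768 hidden units, 12 heads, a 30,522-token vocabulary). Because the configuration is a plain dataclass, a smaller model for quick experiments can be obtained simply by overriding fields; the values below are arbitrary example values, not from the original listing:

# Hypothetical smaller configuration for quick experiments (example values only)
tiny_config = BertConfig(num_layers=4, hidden_size=256, num_heads=4, max_seq_len=128)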

     

     

     

class BertBlock(nn.Module):
    """One transformer block in BERT."""

    def __init__(self, hidden_size: int, num_heads: int, dropout_prob: float):
        super().__init__()
        self.attention = nn.MultiheadAttention(hidden_size, num_heads,
                                               dropout=dropout_prob, batch_first=True)
        self.attn_norm = nn.LayerNorm(hidden_size)
        self.ff_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, 4 * hidden_size),
            nn.GELU(),
            nn.Linear(4 * hidden_size, hidden_size),
        )

    def forward(self, x: torch.Tensor, pad_mask: torch.Tensor) -> torch.Tensor:
        # self-attention with padding mask and post-norm
        attn_output, _ = self.attention(x, x, x, key_padding_mask=pad_mask)
        x = self.attn_norm(x + attn_output)
        # feed-forward with GELU activation and post-norm
        ff_output = self.feed_forward(x)
        x = self.ff_norm(x + self.dropout(ff_output))
        return x
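
As a quick illustrative check (not part of the original listing), a single block can be exercised with random tensors; the batch size of 2 and sequence length of 8 below are arbitrary assumptions:

# Shape check for a single block (illustrative only)
block = BertBlock(hidden_size=768, num_heads=12, dropout_prob=0.1)
x = torch.randn(2, 8, 768)                      # (batch, seq_len, hidden)
pad_mask = torch.zeros(2, 8, dtype=torch.bool)  # True marks padding positions
print(block(x, pad_mask).shape)                 # torch.Size([2, 8, 768])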

     

     

class BertPooler(nn.Module):
    """Pooler layer for BERT to process the [CLS] token output."""

    def __init__(self, hidden_size: int):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.dense(x)
        x = self.activation(x)
        return x

     

     

class BertModel(nn.Module):
    """Backbone of the BERT model."""

    def __init__(self, config: BertConfig):
        super().__init__()
        # embedding layers
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size,
                                            padding_idx=config.pad_id)
        self.type_embeddings = nn.Embedding(config.num_types, config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_seq_len, config.hidden_size)
        self.embeddings_norm = nn.LayerNorm(config.hidden_size)
        self.embeddings_dropout = nn.Dropout(config.dropout_prob)
        # transformer blocks
        self.blocks = nn.ModuleList([
            BertBlock(config.hidden_size, config.num_heads, config.dropout_prob)
            for _ in range(config.num_layers)
        ])
        # [CLS] pooler layer
        self.pooler = BertPooler(config.hidden_size)

    def forward(self, input_ids: torch.Tensor, token_type_ids: torch.Tensor, pad_id: int = 0
                ) -> tuple[torch.Tensor, torch.Tensor]:
        # create the attention mask for padding tokens
        pad_mask = input_ids == pad_id
        # convert integer tokens to embedding vectors
        batch_size, seq_len = input_ids.shape
        position_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
        position_embeddings = self.position_embeddings(position_ids)
        type_embeddings = self.type_embeddings(token_type_ids)
        token_embeddings = self.word_embeddings(input_ids)
        x = token_embeddings + type_embeddings + position_embeddings
        x = self.embeddings_norm(x)
        x = self.embeddings_dropout(x)
        # process the sequence with the transformer blocks
        for block in self.blocks:
            x = block(x, pad_mask)
        # pool the hidden state of the `[CLS]` token
        pooled_output = self.pooler(x[:, 0, :])
        return x, pooled_output

     

     

class BertPretrainingModel(nn.Module):
    """BERT backbone with the MLM and NSP pretraining heads."""

    def __init__(self, config: BertConfig):
        super().__init__()
        self.bert = BertModel(config)
        self.mlm_head = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.GELU(),
            nn.LayerNorm(config.hidden_size),
            nn.Linear(config.hidden_size, config.vocab_size),
        )
        self.nsp_head = nn.Linear(config.hidden_size, 2)

    def forward(self, input_ids: torch.Tensor, token_type_ids: torch.Tensor, pad_id: int = 0
                ) -> tuple[torch.Tensor, torch.Tensor]:
        # process the sequence with the BERT backbone
        x, pooled_output = self.bert(input_ids, token_type_ids, pad_id)
        # predict the masked tokens for the MLM task and the classification for the NSP task
        mlm_logits = self.mlm_head(x)
        nsp_logits = self.nsp_head(pooled_output)
        return mlm_logits, nsp_logits
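
The MLM head produces one logit vector over the vocabulary per token, while the NSP head produces a pair of logits per sequence. A minimal sketch of the expected output shapes, using assumed toy inputs that are not part of the original listing:

# Illustrative check of the two pretraining heads (toy inputs, assumed values)
pretrainer = BertPretrainingModel(BertConfig())
input_ids = torch.randint(1, 30522, (2, 16))           # random non-padding token IDs
token_type_ids = torch.zeros(2, 16, dtype=torch.long)  # single-segment inputs
mlm_logits, nsp_logits = pretrainer(input_ids, token_type_ids)
print(mlm_logits.shape, nsp_logits.shape)  # torch.Size([2, 16, 30522]) torch.Size([2, 2])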

     

     

# Training parameters
epochs = 10
learning_rate = 1e-4
batch_size = 32

# Load the dataset and set up the dataloader
dataset = datasets.Dataset.from_parquet("wikitext-2_train_data.parquet")

     

def collate_fn(batch: list[dict]):
    """Custom collate function to handle variable-length sequences in the dataset."""
    # always at max length: tokens, segment_ids; always singleton: is_random_next
    input_ids = torch.tensor([item["tokens"] for item in batch])
    token_type_ids = torch.tensor([item["segment_ids"] for item in batch]).abs()
    is_random_next = torch.tensor([item["is_random_next"] for item in batch]).to(int)
    # variable length: masked_positions, masked_labels
    masked_pos = [(idx, pos) for idx, item in enumerate(batch) for pos in item["masked_positions"]]
    masked_labels = torch.tensor([label for item in batch for label in item["masked_labels"]])
    return input_ids, token_type_ids, is_random_next, masked_pos, masked_labels
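
The collate function implies the schema each record in the parquet file must follow: fixed-length tokens and segment_ids, an is_random_next flag, and variable-length masked_positions / masked_labels. A hedged toy example is shown below; the token IDs and the sequence length of 8 are invented for illustration and do not come from the original data:

# Toy records illustrating the assumed schema (IDs and lengths are made up)
toy_batch = [
    {"tokens": [101, 7592, 103, 2088, 102, 0, 0, 0],
     "segment_ids": [0, 0, 0, 0, 0, 0, 0, 0],
     "is_random_next": False,
     "masked_positions": [2],
     "masked_labels": [2088]},
    {"tokens": [101, 2023, 2003, 103, 102, 0, 0, 0],
     "segment_ids": [0, 0, 0, 0, 0, 0, 0, 0],
     "is_random_next": True,
     "masked_positions": [3],
     "masked_labels": [1037]},
]
ids, types, nsp, pos, labels = collate_fn(toy_batch)
print(ids.shape, nsp, pos, labels)  # torch.Size([2, 8]) tensor([0, 1]) [(0, 2), (1, 3)] tensor([2088, 1037])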

     

dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                         collate_fn=collate_fn, num_workers=8)

     

# train the model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertPretrainingModel(BertConfig()).to(device)
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
loss_fn = nn.CrossEntropyLoss()

     

for epoch in range(epochs):
    pbar = tqdm.tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
    for batch in pbar:
        # get the batched data
        input_ids, token_type_ids, is_random_next, masked_pos, masked_labels = batch
        input_ids = input_ids.to(device)
        token_type_ids = token_type_ids.to(device)
        is_random_next = is_random_next.to(device)
        masked_labels = masked_labels.to(device)
        # run the model
        mlm_logits, nsp_logits = model(input_ids, token_type_ids)
        # MLM loss: masked_pos is a list of (batch_index, position) tuples; use it to
        # pick the corresponding logits out of mlm_logits of shape (B, S, V)
        batch_indices, token_positions = zip(*masked_pos)
        mlm_logits = mlm_logits[batch_indices, token_positions]
        mlm_loss = loss_fn(mlm_logits, masked_labels)
        # compute the loss for the NSP task
        nsp_loss = loss_fn(nsp_logits, is_random_next)
        # backpropagate the total loss
        total_loss = mlm_loss + nsp_loss
        pbar.set_postfix(MLM=mlm_loss.item(), NSP=nsp_loss.item(), Total=total_loss.item())
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()
    # decay the learning rate once per epoch
    scheduler.step()
    pbar.close()

     

# Save the full pretraining model and the backbone separately
torch.save(model.state_dict(), "bert_pretraining_model.pth")
torch.save(model.bert.state_dict(), "bert_model.pth")
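
The second checkpoint contains only the backbone weights, so it can later be reloaded into a bare BertModel, for example for fine-tuning. A minimal sketch, assuming the same default BertConfig used during pretraining:

# Reload the saved backbone weights (sketch; assumes the default BertConfig)
backbone = BertModel(BertConfig())
backbone.load_state_dict(torch.load("bert_model.pth", map_location="cpu"))
backbone.eval()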
