Training a Model on Multiple GPUs with Data Parallelism

By Yasmin Bhatti, December 27, 2025


import dataclasses
import os

import datasets
import tqdm
import tokenizers
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim.lr_scheduler as lr_scheduler
from torch import Tensor
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler


# Build the model
@dataclasses.dataclass
class LlamaConfig:
    """Define the Llama model hyperparameters."""
    vocab_size: int = 50000  # Size of the tokenizer vocabulary
    max_position_embeddings: int = 2048  # Maximum sequence length
    hidden_size: int = 768  # Dimension of hidden layers
    intermediate_size: int = 4 * 768  # Dimension of the MLP's hidden layer
    num_hidden_layers: int = 12  # Number of transformer layers
    num_attention_heads: int = 12  # Number of attention heads
    num_key_value_heads: int = 3  # Number of key-value heads for GQA

     

     

class RotaryPositionEncoding(nn.Module):
    """Rotary position encoding."""

    def __init__(self, dim: int, max_position_embeddings: int) -> None:
        """Initialize the RotaryPositionEncoding module.

        Args:
            dim: The hidden dimension of the input tensor to which RoPE is applied
            max_position_embeddings: The maximum sequence length of the input tensor
        """
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        # compute a matrix of n*theta_i (position times inverse frequency)
        N = 10_000.0
        inv_freq = 1.0 / (N ** (torch.arange(0, dim, 2) / dim))
        inv_freq = torch.cat((inv_freq, inv_freq), dim=-1)
        position = torch.arange(max_position_embeddings)
        sinusoid_inp = torch.outer(position, inv_freq)
        # save the cosine and sine matrices as buffers, not parameters
        self.register_buffer("cos", sinusoid_inp.cos())
        self.register_buffer("sin", sinusoid_inp.sin())

    def forward(self, x: Tensor) -> Tensor:
        """Apply RoPE to tensor x.

        Args:
            x: Input tensor of shape (batch_size, seq_length, num_heads, head_dim)

        Returns:
            Output tensor of shape (batch_size, seq_length, num_heads, head_dim)
        """
        batch_size, seq_len, num_heads, head_dim = x.shape
        dtype = x.dtype
        # reshape the cosine and sine matrices to 4D tensors with the same dtype as x
        cos = self.cos.to(dtype)[:seq_len].view(1, seq_len, 1, -1)
        sin = self.sin.to(dtype)[:seq_len].view(1, seq_len, 1, -1)
        # apply RoPE to x
        x1, x2 = x.chunk(2, dim=-1)
        rotated = torch.cat((-x2, x1), dim=-1)
        output = (x * cos) + (rotated * sin)
        return output
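As a quick sanity check (not part of the original listing), the module can be exercised on a random tensor: RoPE only rotates pairs of channels, so it should preserve both the input shape and each head vector's norm.

rope = RotaryPositionEncoding(dim=64, max_position_embeddings=128)
x = torch.randn(2, 16, 12, 64)  # (batch_size, seq_length, num_heads, head_dim)
y = rope(x)
print(y.shape)                                                    # torch.Size([2, 16, 12, 64])
print(torch.allclose(x.norm(dim=-1), y.norm(dim=-1), atol=1e-4))  # True: rotations preserve norms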

     

     

class LlamaAttention(nn.Module):
    """Grouped-query attention with rotary embeddings."""

    def __init__(self, config: LlamaConfig) -> None:
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_kv_heads = config.num_key_value_heads  # GQA: H_kv < H_q

        # hidden_size must be divisible by num_heads
        assert (self.head_dim * self.num_heads) == self.hidden_size

        # Linear layers for Q, K, V projections
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

    def forward(self, hidden_states: Tensor, rope: RotaryPositionEncoding, attn_mask: Tensor) -> Tensor:
        bs, seq_len, dim = hidden_states.size()

        # Project inputs to Q, K, V
        query_states = self.q_proj(hidden_states).view(bs, seq_len, self.num_heads, self.head_dim)
        key_states = self.k_proj(hidden_states).view(bs, seq_len, self.num_kv_heads, self.head_dim)
        value_states = self.v_proj(hidden_states).view(bs, seq_len, self.num_kv_heads, self.head_dim)

        # Apply rotary position embeddings
        query_states = rope(query_states)
        key_states = rope(key_states)

        # Transpose tensors from BSHD to BHSD layout for scaled_dot_product_attention
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # Use PyTorch's optimized attention implementation
        # setting is_causal=True is incompatible with passing an explicit attention mask
        attn_output = F.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attn_mask,
            dropout_p=0.0,
            enable_gqa=True,
        )

        # Transpose the output from BHSD back to BSHD, reshape to 3D, and then project the output
        attn_output = attn_output.transpose(1, 2).reshape(bs, seq_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)
        return attn_output


class LlamaMLP(nn.Module):
    """Feed-forward network with SwiGLU activation."""

    def __init__(self, config: LlamaConfig) -> None:
        super().__init__()
        # Two parallel projections for SwiGLU
        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.act_fn = F.silu  # SwiGLU activation function
        # Project back to the hidden dimension
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)

    def forward(self, x: Tensor) -> Tensor:
        # SwiGLU activation: multiply the gated and up-projected inputs
        gate = self.act_fn(self.gate_proj(x))
        up = self.up_proj(x)
        return self.down_proj(gate * up)


class LlamaDecoderLayer(nn.Module):
    """Single transformer layer of a Llama model."""

    def __init__(self, config: LlamaConfig) -> None:
        super().__init__()
        self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=1e-5)
        self.self_attn = LlamaAttention(config)
        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, eps=1e-5)
        self.mlp = LlamaMLP(config)

    def forward(self, hidden_states: Tensor, rope: RotaryPositionEncoding, attn_mask: Tensor) -> Tensor:
        # First residual block: self-attention
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        attn_outputs = self.self_attn(hidden_states, rope=rope, attn_mask=attn_mask)
        hidden_states = attn_outputs + residual

        # Second residual block: MLP
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states) + residual
        return hidden_states


class LlamaModel(nn.Module):
    """The full Llama model without any pretraining heads."""

    def __init__(self, config: LlamaConfig) -> None:
        super().__init__()
        self.rotary_emb = RotaryPositionEncoding(
            config.hidden_size // config.num_attention_heads,
            config.max_position_embeddings,
        )

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.norm = nn.RMSNorm(config.hidden_size, eps=1e-5)

    def forward(self, input_ids: Tensor, attn_mask: Tensor) -> Tensor:
        # Convert input token IDs to embeddings
        hidden_states = self.embed_tokens(input_ids)
        # Process through all transformer layers, then the final norm layer
        for layer in self.layers:
            hidden_states = layer(hidden_states, rope=self.rotary_emb, attn_mask=attn_mask)
        hidden_states = self.norm(hidden_states)
        # Return the final hidden states
        return hidden_states


class LlamaForPretraining(nn.Module):
    def __init__(self, config: LlamaConfig) -> None:
        super().__init__()
        self.base_model = LlamaModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, input_ids: Tensor, attn_mask: Tensor) -> Tensor:
        hidden_states = self.base_model(input_ids, attn_mask)
        return self.lm_head(hidden_states)

     

     

def create_causal_mask(batch: Tensor, dtype: torch.dtype = torch.float32) -> Tensor:
    """Create a causal mask for self-attention.

    Args:
        batch: Batch of sequences, shape (batch_size, seq_len)
        dtype: Data type of the mask

    Returns:
        Causal mask of shape (seq_len, seq_len)
    """
    batch_size, seq_len = batch.shape
    mask = torch.full(
        (seq_len, seq_len), float("-inf"), device=batch.device, dtype=dtype
    ).triu(diagonal=1)
    return mask


def create_padding_mask(batch: Tensor, padding_token_id: int, dtype: torch.dtype = torch.float32) -> Tensor:
    """Create a padding mask for a batch of sequences for self-attention.

    Args:
        batch: Batch of sequences, shape (batch_size, seq_len)
        padding_token_id: ID of the padding token
        dtype: Data type of the mask

    Returns:
        Padding mask of shape (batch_size, 1, seq_len, seq_len)
    """
    padded = torch.zeros_like(batch, device=batch.device, dtype=dtype).masked_fill(
        batch == padding_token_id, float("-inf")
    )
    mask = padded[:, :, None] + padded[:, None, :]
    return mask[:, None, :, :]
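The training loop below simply adds these two masks together before calling the model. As an illustration (not in the original listing), on a toy batch where token ID 0 is the padding token:

toy_batch = torch.tensor([[5, 7, 9, 0],
                          [3, 6, 0, 0]])
combined = create_causal_mask(toy_batch) + create_padding_mask(toy_batch, padding_token_id=0)
print(combined.shape)  # torch.Size([2, 1, 4, 4]), broadcastable over attention heads
print(combined[0, 0])  # -inf above the diagonal and wherever padding is involved, 0 elsewhere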

     

     

# Dataset that produces padded token sequences of fixed length
class PretrainingDataset(torch.utils.data.Dataset):
    def __init__(self, dataset: datasets.Dataset, tokenizer: tokenizers.Tokenizer,
                 seq_length: int):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.bot = tokenizer.token_to_id("[BOT]")
        self.eot = tokenizer.token_to_id("[EOT]")
        self.pad = tokenizer.token_to_id("[PAD]")

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        """Get a sequence of token ids from the dataset. [BOT] and [EOT] tokens
        are added. Clipped and padded to the sequence length.
        """
        seq = self.dataset[index]["text"]
        tokens: list[int] = [self.bot] + self.tokenizer.encode(seq).ids + [self.eot]
        # pad to the target sequence length
        toklen = len(tokens)
        if toklen < self.seq_length + 1:
            pad_length = self.seq_length + 1 - toklen
            tokens += [self.pad] * pad_length
        # return the input sequence and the target sequence
        x = torch.tensor(tokens[:self.seq_length], dtype=torch.int64)
        y = torch.tensor(tokens[1:self.seq_length + 1], dtype=torch.int64)
        return x, y
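Each item is an (input, target) pair over the same token stream, offset by one position, which is exactly what next-token prediction requires. With made-up token IDs and seq_length = 4, the shift looks like this:

tokens = [1, 11, 12, 2, 0]  # hypothetical [BOT], two word IDs, [EOT], [PAD]
x, y = tokens[:4], tokens[1:5]
print(x)                    # [1, 11, 12, 2]
print(y)                    # [11, 12, 2, 0]: y[t] is the token to predict after seeing x[:t+1]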

     

# Load the tokenizer
tokenizer = tokenizers.Tokenizer.from_file("bpe_50K.json")

# Load the dataset
dataset = datasets.load_dataset("HuggingFaceFW/fineweb", "sample-10BT", split="train")

# Initialize the distributed environment
dist.init_process_group(backend="nccl")
rank = dist.get_rank()
local_rank = int(os.environ["LOCAL_RANK"])
world_size = dist.get_world_size()
device = torch.device(f"cuda:{local_rank}")
print(f"World size: {world_size}, Rank: {rank}, Local rank: {local_rank}. Using device: {device}")
#torch.cuda.set_device(local_rank)
#torch.set_default_device(device)

# Create the pretraining model with the default config, then wrap it in DDP
model_config = LlamaConfig()
model = LlamaForPretraining(model_config).to(device)
model = DDP(model, device_ids=[local_rank])  # , output_device=local_rank)
model.train()

# print the model size
print(f"Model parameters size: {sum(p.numel() for p in model.parameters()) / 1024**2:.2f} M")
print(f"Model buffers size: {sum(p.numel() for p in model.buffers()) / 1024**2:.2f} M")
print(f"Model precision(s): {set(x.dtype for x in model.state_dict().values())}")

# Training parameters
epochs = 3
learning_rate = 1e-3
batch_size = 64
seq_length = 512
num_warmup_steps = 1000
PAD_TOKEN_ID = tokenizer.token_to_id("[PAD]")

# DataLoader, optimizer, scheduler, and loss function
dataset = PretrainingDataset(dataset, tokenizer, seq_length)
sampler = DistributedSampler(dataset, shuffle=False)
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size,
    sampler=sampler,
    pin_memory=True,  # optional
    shuffle=False,
    num_workers=world_size,
)
optimizer = torch.optim.AdamW(
    model.parameters(), lr=learning_rate, betas=(0.9, 0.99), eps=1e-8, weight_decay=0.1
)
num_training_steps = len(dataloader) * epochs
print(f"Number of training steps: {num_training_steps} = {len(dataloader)} * {epochs}")
warmup_scheduler = lr_scheduler.LinearLR(
    optimizer,
    start_factor=0.1, end_factor=1.0, total_iters=num_warmup_steps
)
cosine_scheduler = lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=num_training_steps - num_warmup_steps,
    eta_min=0
)
scheduler = lr_scheduler.SequentialLR(
    optimizer,
    schedulers=[warmup_scheduler, cosine_scheduler],
    milestones=[num_warmup_steps]
)
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_ID)

     

# start training
for epoch in range(epochs):
    pbar = tqdm.tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
    sampler.set_epoch(epoch)   # required only when shuffling
    for batch_id, batch in enumerate(pbar):
        if batch_id % 1000 == 0 and rank == 0:
            # checkpoint the model and optimizer state, only on the rank 0 process
            torch.save({
                "model": model.module.state_dict() if isinstance(model, DDP) else model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
                "epoch": epoch,
                "batch": batch_id,
            }, "llama_pretraining_checkpoint.pth")
        # get the batched data and move it from CPU to GPU
        input_ids, target_ids = batch
        input_ids = input_ids.to(device)
        target_ids = target_ids.to(device)
        # create the attention mask: causal mask + padding mask
        attn_mask = (create_causal_mask(input_ids)
                     + create_padding_mask(input_ids, PAD_TOKEN_ID))
        # extract the output from the model
        logits = model(input_ids, attn_mask)
        # compute the loss: cross-entropy between logits and targets, ignoring padding tokens
        loss = loss_fn(logits.view(-1, logits.size(-1)), target_ids.view(-1))
        # backward pass, then clip gradients to an L2 norm of 1.0
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        pbar.set_postfix(loss=loss.item())
        pbar.update(1)
    pbar.close()

# Save the model
if rank == 0:
    torch.save(model.module.state_dict(), "llama_pretraining_model.pth")
    torch.save(model.module.base_model.state_dict(), "llama_model.pth")

# Clean up the distributed environment
dist.destroy_process_group()
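The script expects to be launched with torchrun, which sets LOCAL_RANK and starts one process per GPU, for example: torchrun --nproc_per_node=4 pretrain.py (the filename here is just whatever you saved the listing as). Because rank 0 saves the unwrapped model.module state dicts, the weights can later be reloaded outside the distributed setup; a minimal sketch, assuming the classes above are importable:

# Reload the saved weights for single-device evaluation
config = LlamaConfig()
model = LlamaForPretraining(config)
model.load_state_dict(torch.load("llama_pretraining_model.pth", map_location="cpu"))
model.eval()

# Or reload only the backbone, without the language-model head
backbone = LlamaModel(config)
backbone.load_state_dict(torch.load("llama_model.pth", map_location="cpu"))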
