Thought Leadership in AI

Preparing Data for BERT Training

By Yasmin Bhatti | December 2, 2025


"""Process the WikiText dataset for training the BERT model, using the Hugging Face
datasets library.
"""

import time
import random
from typing import Iterator

import tokenizers
from datasets import load_dataset, Dataset

# path and name of each dataset
DATASETS = {
    "wikitext-2": ("wikitext", "wikitext-2-raw-v1"),
    "wikitext-103": ("wikitext", "wikitext-103-raw-v1"),
}
PATH, NAME = DATASETS["wikitext-103"]
TOKENIZER_PATH = "wikitext-103_wordpiece.json"


def create_docs(path: str, name: str, tokenizer: tokenizers.Tokenizer) -> list[list[list[int]]]:
    """Load the WikiText dataset and extract the text as documents"""
    dataset = load_dataset(path, name, split="train")
    docs: list[list[list[int]]] = []
    for line in dataset["text"]:
        line = line.strip()
        if not line or line.startswith("="):
            docs.append([])   # new document encountered
        else:
            tokens = tokenizer.encode(line).ids
            docs[-1].append(tokens)
    docs = [doc for doc in docs if doc]  # remove empty documents
    return docs


def create_dataset(
    docs: list[list[list[int]]],
    tokenizer: tokenizers.Tokenizer,
    max_seq_length: int = 512,
    doc_repeat: int = 10,
    mask_prob: float = 0.15,
    short_seq_prob: float = 0.1,
    max_predictions_per_seq: int = 20,
) -> Iterator[dict]:
    """Generate samples from all documents"""
    doc_indices = list(range(len(docs))) * doc_repeat
    for doc_idx in doc_indices:
        yield from generate_samples(doc_idx, docs, tokenizer, max_seq_length, mask_prob, short_seq_prob, max_predictions_per_seq)


def generate_samples(
    doc_idx: int,
    all_docs: list[list[list[int]]],
    tokenizer: tokenizers.Tokenizer,
    max_seq_length: int = 512,
    mask_prob: float = 0.15,
    short_seq_prob: float = 0.1,
    max_predictions_per_seq: int = 20,
) -> Iterator[dict]:
    """Generate samples from a given document"""
    # number of tokens to extract from this document, excluding [CLS], [SEP], [SEP]
    target_length = max_seq_length - 3
    if random.random() < short_seq_prob:
        # a shorter sequence is used 10% of the time
        target_length = random.randint(2, target_length)

    # copy the document
    chunks = []
    for chunk in all_docs[doc_idx]:
        chunks.append(chunk)

    # exhaust the chunks and create samples
    while chunks:
        # scan until the target token length is reached
        running_length = 0
        end = 1
        while end < len(chunks) and running_length < target_length:
            running_length += len(chunks[end-1])
            end += 1
        # randomly separate the chunk into two segments
        sep = random.randint(1, end-1) if end > 1 else 1
        sentence_a = [tok for chunk in chunks[:sep] for tok in chunk]
        sentence_b = [tok for chunk in chunks[sep:end] for tok in chunk]
        # sentence B: may be taken from another document
        if not sentence_b or random.random() < 0.5:
            # find another document (must not be the same as doc_idx)
            b_idx = random.randint(0, len(all_docs)-2)
            if b_idx >= doc_idx:
                b_idx += 1
            # sentence B starts from a random position in the new document
            sentence_b = []
            running_length = len(sentence_a)
            i = random.randint(0, len(all_docs[b_idx])-1)
            while i < len(all_docs[b_idx]) and running_length < target_length:
                sentence_b.extend(all_docs[b_idx][i])
                running_length += len(all_docs[b_idx][i])
                i += 1
            is_random_next = True
            chunks = chunks[sep:]
        else:
            is_random_next = False
            chunks = chunks[end:]
        # create a sample from the pair
        yield create_sample(sentence_a, sentence_b, is_random_next, tokenizer, max_seq_length, mask_prob, max_predictions_per_seq)


def create_sample(
    sentence_a: list[int],
    sentence_b: list[int],
    is_random_next: bool,
    tokenizer: tokenizers.Tokenizer,
    max_seq_length: int = 512,
    mask_prob: float = 0.15,
    max_predictions_per_seq: int = 20,
) -> dict:
    """Create a sample from a pair of sentences"""
    # collect the ids of the special tokens
    cls_id = tokenizer.token_to_id("[CLS]")
    sep_id = tokenizer.token_to_id("[SEP]")
    mask_id = tokenizer.token_to_id("[MASK]")
    pad_id = tokenizer.padding["pad_id"]
    # adjust the lengths to fit the max sequence length
    truncate_seq_pair(sentence_a, sentence_b, max_seq_length-3)
    num_pad = max_seq_length - len(sentence_a) - len(sentence_b) - 3
    # create the unmodified token sequence
    tokens = [cls_id] + sentence_a + [sep_id] + sentence_b + [sep_id] + ([pad_id] * num_pad)
    seg_id = [0] * (len(sentence_a) + 2) + [1] * (len(sentence_b) + 1) + [-1] * num_pad
    assert len(tokens) == len(seg_id) == max_seq_length
    # create the prediction targets
    cand_indices = [i for i, tok in enumerate(tokens) if tok not in [cls_id, sep_id, pad_id]]
    random.shuffle(cand_indices)
    num_predictions = int(round((len(sentence_a) + len(sentence_b)) * mask_prob))
    num_predictions = min(max_predictions_per_seq, max(1, num_predictions))
    mlm_positions = sorted(cand_indices[:num_predictions])
    mlm_labels = []
    # randomly mask some tokens
    for i in mlm_positions:
        mlm_labels.append(tokens[i])
        # prob 0.8 replace with [MASK], prob 0.1 replace with a random word, prob 0.1 keep the original
        if random.random() < 0.8:
            tokens[i] = mask_id
        elif random.random() < 0.5:
            tokens[i] = random.randint(4, tokenizer.get_vocab_size()-1)
    ret = {
        "tokens": tokens,
        "segment_ids": seg_id,
        "is_random_next": is_random_next,
        "masked_positions": mlm_positions,
        "masked_labels": mlm_labels,
    }
    return ret


def truncate_seq_pair(sentence_a: list[int], sentence_b: list[int], max_num_tokens: int) -> None:
    """Truncate a pair of sequences until below a maximum sequence length."""
    while len(sentence_a) + len(sentence_b) > max_num_tokens:
        # pick the longer sentence to remove tokens from
        candidate = sentence_a if len(sentence_a) > len(sentence_b) else sentence_b
        # remove one token from either end with equal probability
        if random.random() < 0.5:
            candidate.pop(0)
        else:
            candidate.pop()


if __name__ == "__main__":
    print(time.time(), "started")
    tokenizer = tokenizers.Tokenizer.from_file(TOKENIZER_PATH)
    print(time.time(), "loaded tokenizer")
    docs = create_docs(PATH, NAME, tokenizer)
    print(time.time(), "created docs with %d documents" % len(docs))
    dataset = Dataset.from_generator(create_dataset, gen_kwargs={"docs": docs, "tokenizer": tokenizer})
    print(time.time(), "created dataset from generator")
    # Save the dataset to a parquet file
    dataset.to_parquet("wikitext-103_train_data.parquet")
    print(time.time(), "saved dataset to parquet file")
    # Load the dataset back from the parquet file
    dataset = Dataset.from_parquet("wikitext-103_train_data.parquet", streaming=True)
    print(time.time(), "loaded dataset from parquet file")
    # Print a few samples
    for i, sample in enumerate(dataset):
        print(i)
        print(sample)
        print()
        if i >= 3:
            break
    print(time.time(), "finished")
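
The script expects a WordPiece tokenizer saved as wikitext-103_wordpiece.json, with padding enabled so that tokenizer.padding["pad_id"] is defined. If you have not built that file yet, a minimal sketch along the following lines can produce one with the tokenizers library; the normalizer, vocabulary size, and special-token set below are assumptions for illustration, not details taken from this article.

# Sketch only: one possible way to create wikitext-103_wordpiece.json.
# Vocabulary size, normalizer, and special tokens are assumptions.
from datasets import load_dataset
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers

wp_tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
wp_tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
wp_tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

trainer = trainers.WordPieceTrainer(
    vocab_size=30522,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
)
wikitext = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
wp_tokenizer.train_from_iterator(
    (line for line in wikitext["text"] if line.strip()), trainer=trainer
)
# enable padding so that tokenizer.padding["pad_id"] works in the script above
wp_tokenizer.enable_padding(pad_id=wp_tokenizer.token_to_id("[PAD]"), pad_token="[PAD]")
wp_tokenizer.save("wikitext-103_wordpiece.json")

Saving after enable_padding should store the padding configuration in the JSON file, which is what lets the main script read the pad id directly from tokenizer.padding after loading the tokenizer with Tokenizer.from_file.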
