Creating and Uploading a Dataset with Unsloth: An Adventure in Wonderland

Community Article · Published August 4, 2024

Comprehensive Guide: From Dataset Creation to Fine-Tuning and Using Your Model

Introduction

In this guide, we'll walk through creating a dataset by scraping content from GitHub repositories and documentation sites, uploading that dataset to Hugging Face, fine-tuning a language model on it with Unsloth, and finally using the fine-tuned model. To keep things engaging, we'll explain each step with analogies from "Alice in Wonderland".

Step 1: Setting Up the Environment

Before we can start, we need to set up our environment. Think of this step as Alice preparing for her adventure into Wonderland.

  1. Install Necessary Libraries:

    pip install beautifulsoup4 gitpython huggingface_hub datasets requests
    
  2. Clone or Pull the Repository: We will clone the repository if it doesn’t exist or pull the latest changes if it does.

Step 2: Cloning and Pulling the Repository

Just like the Mad Hatter’s tea party, this step involves organizing the chaos of data.

  1. Clone or Pull Repository Function:

    def clone_or_pull_repo(repo_url, repo_name):
        if os.path.exists(repo_name):
            verbose_print(f"Repository {repo_name} already exists. Pulling latest changes.")
            repo = Repo(repo_name)
            repo.remotes.origin.pull()
        else:
            verbose_print(f"Cloning repository from {repo_url}")
            Repo.clone_from(repo_url, repo_name)
    
  2. Extract Markdown Files: We extract all Markdown files from the repository so we can scrape their content (a short usage sketch follows this list).

    def extract_markdown_files(repo_path):
        verbose_print(f"Extracting Markdown files from {repo_path}")
        markdown_files = []
        for root, dirs, files in os.walk(repo_path):
            for file in files:
                if file.endswith(".md"):
                    markdown_files.append(os.path.join(root, file))
        return markdown_files
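
As a quick sanity check, the two helpers compose like this (a sketch using the AutoGen repository from the Example Usage section below; the exact file count depends on the repository's current contents):

    clone_or_pull_repo('https://github.com/microsoft/autogen.git', 'autogen')
    md_files = extract_markdown_files('autogen')
    print(f"Found {len(md_files)} Markdown files, e.g. {md_files[:3]}")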
    

Step 3: Parsing and Scraping Content

This step is akin to the Cheshire Cat: content appears on some pages and vanishes on others, and our extraction code has to cope with both.

  1. Parse Markdown Files:

    def parse_markdown(file_path):
        verbose_print(f"Parsing Markdown file {file_path}")
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        sections = content.split('\n## ')
        parsed_sections = [section.replace('\n', ' ') for section in sections]
        return parsed_sections
    
  2. Scrape Documentation Pages (a combined usage sketch follows this list):

    def get_page_links(base_url, link_selector):
        verbose_print(f"Getting page links from {base_url}")
        response = requests.get(base_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        page_links = []
        for link in soup.select(link_selector):
            href = link['href']
            if not href.startswith(('http', '#')):  # keep relative links; skip absolute URLs and in-page anchors
                href = base_url.rstrip('/') + '/' + href.lstrip('/')
                page_links.append(href)
        return page_links
    
    def scrape_page(url, content_selector):
        verbose_print(f"Scraping content from {url}")
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        page_content = []
        main_content = soup.select_one(content_selector)
        if main_content:
            sections = main_content.find_all(['h1', 'h2', 'h3', 'p', 'pre'])
            for section in sections:
                page_content.append(section.text)
        return page_content
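
Putting both pieces together, here is a minimal sketch of scraping one documentation site (the selectors are the ones used in the Example Usage section below; adjust them to the markup of your target site):

    links = get_page_links('https://microsoft.github.io/autogen/docs/', 'a[href]')
    for url in links[:2]:  # peek at the first couple of pages
        sections = scrape_page(url, 'div.md-content')
        print(url, '->', len(sections), 'sections')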
    

Step 4: Creating and Saving the Dataset

We’re now ready to create and save our dataset, much like Alice collecting her memories. (A sample record is shown after this list.)

  1. Create Dataset:

    def create_dataset(repo_url, doc_urls):
        dataset = []
    
        # Scrape GitHub repository
        repo_name = repo_url.split('/')[-1].replace('.git', '')
        clone_or_pull_repo(repo_url, repo_name)
        markdown_files = extract_markdown_files(repo_name)
        for md_file in markdown_files:
            sections = parse_markdown(md_file)
            for section in sections:
                dataset.append({
                    'source': 'GitHub',
                    'repository': repo_name,
                    'file': md_file,
                    'label': 'autogen',
                    'content': section
                })
    
        # Scrape documentation site
        for doc_url, link_selector, content_selector in doc_urls:
            page_links = get_page_links(doc_url, link_selector)
            for page_url in page_links:
                page_content = scrape_page(page_url, content_selector)
                for section in page_content:
                    dataset.append({
                        'source': 'Documentation',
                        'url': page_url,
                        'label': 'autogen',
                        'content': section
                    })
    
        return dataset
    
  2. Save and Load Dataset Locally:

    def load_dataset_locally(file_path):
        if os.path.exists(file_path):
            verbose_print(f"Loading existing dataset from {file_path}")
            with open(file_path, 'r') as file:
                return json.load(file)
        verbose_print(f"No existing dataset found at {file_path}")
        return []
    
    def save_dataset_locally(dataset, output_file):
        verbose_print(f"Saving dataset to {output_file}")
        with open(output_file, 'w') as file:
            json.dump(dataset, file, indent=4)
        verbose_print("Dataset saved successfully")
    

Step 5: Uploading to Hugging Face

Finally, just like sharing stories from Wonderland, we upload our dataset to Hugging Face.

  1. Upload to Hugging Face:
    def upload_to_huggingface(dataset, repo_id):
        token = os.getenv("HF_TOKEN")
        verbose_print(f"Uploading dataset to Hugging Face with repository ID {repo_id}")
        hf_api = HfApi()
        hf_api.create_repo(repo_id, token=token, repo_type="dataset", private=False, exist_ok=True)  # exist_ok avoids an error on re-runs
    
        # Create a DatasetDict and push to hub
        dataset_dict = DatasetDict({"train": Dataset.from_list(dataset)})
        dataset_dict.push_to_hub(repo_id, token=token)
        verbose_print(f"Dataset uploaded to Hugging Face with repository ID {repo_id}")
    

Example Usage

  1. Define Repository and Documentation URLs:

    repo_url = 'https://github.com/microsoft/autogen.git'
    doc_urls = [
        ('https://microsoft.github.io/autogen/docs/', 'a[href]', 'div.md-content'),
        ('https://microsoft.github.io/autogen/docs/Examples', 'a[href]', 'div.md-content'),
        ('https://microsoft.github.io/autogen/docs/notebooks', 'a[href]', 'div.md-content'),
        ('https://microsoft.github.io/autogen/blog', 'a[href]', 'div.blog-content')
    ]
    
  2. Create, Save, and Upload Dataset:

    output_file = 'autogen_python_dataset.json'
    repo_id = 'dimentox/autogen-python'

    verbose_print("Starting dataset creation process")
    existing_dataset = load_dataset_locally(output_file)
    new_dataset = create_dataset(repo_url, doc_urls)
    combined_dataset = existing_dataset + new_dataset
    save_dataset_locally(combined_dataset, output_file)
    upload_to_huggingface(combined_dataset, repo_id)
    verbose_print("Dataset creation and upload process completed")

Step 6: Fine-Tuning the Model

Now, let's move on to the fine-tuning part. This is like refining the Queen of Hearts' garden to perfection.

  1. Fine-Tuning Script:
   from unsloth import FastLanguageModel
   from trl import SFTTrainer
   from transformers import TrainingArguments, DataCollatorForSeq2Seq
   from unsloth import is_bfloat16_supported
   from datasets import load_dataset
   import torch

   class AdaptiveTrainer(SFTTrainer):
       def __init__(self, *args, **kwargs):
           super().__init__(*args, **kwargs)
           self.prev_eval_loss = float('inf')

       def evaluate(self, *args, **kwargs):
           # Trainer.evaluate() returns a metrics dict containing "eval_loss";
           # we hook it here because the Trainer has no evaluation_step() method.
           output = super().evaluate(*args, **kwargs)
           current_eval_loss = output['eval_loss']

           # Adaptive learning rate adjustment. Note: the LR scheduler recomputes
           # the rate each step, so a full fix would also update
           # self.optimizer.param_groups.
           if current_eval_loss > self.prev_eval_loss:
               self.args.learning_rate *= 0.9  # reduce learning rate if loss increased
               print(f"Decreased learning rate to: {self.args.learning_rate}")
           else:
               self.args.learning_rate *= 1.05  # slightly increase if loss decreased
               print(f"Increased learning rate to: {self.args.learning_rate}")

           self.prev_eval_loss = current_eval_loss
           return output

       def training_step(self, *args, **kwargs):
           # Log the gradient norm periodically (clip_grad_norm_ returns the total
           # norm before clipping; the Trainer already clips to max_grad_norm itself).
           if self.state.global_step > 0 and self.state.global_step % self.args.eval_steps == 0:
               current_grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)
               print(f"Gradient norm at step {self.state.global_step}: {current_grad_norm}")

           return super().training_step(*args, **kwargs)

   def print_memory_stats(stage):
       # Report reserved vs. total GPU memory (a plain helper, not a trainer method).
       gpu_stats = torch.cuda.get_device_properties(0)
       used_memory = round(torch.cuda.memory_reserved() / 1024 / 1024 / 1024, 3)
       max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
       print(f"[{stage}] GPU: {gpu_stats.name}, Memory Reserved: {used_memory} GB / {max_memory} GB")

   max_seq_length = 2048
   dtype = None  # auto-detect; set torch.float16 or torch.bfloat16 to force
   load_in_4bit = True

   print("Loading model")
   model, tokenizer = FastLanguageModel.from_pretrained(
       model_name="unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
       max_seq_length=max_seq_length,
       dtype=dtype,
       load_in_4bit=load_in_4bit,
       token="token"  # placeholder; substitute your Hugging Face token
   )

   print("Loading LoRA")
   model = FastLanguageModel.get_peft_model(
       model,
       r=16,
       target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
       lora_alpha=16,
       lora_dropout=0,
       bias="none",
       use_gradient_checkpointing="unsloth",
       random_state=3407,
       use_rslora=False,
       loftq_config=None,
   )

   print("Loading dataset")
   dataset_path = "autogen_python_dataset.json"
   dataset = load_dataset("json", data_files=dataset_path, split="train")

   custom_prompt = """Source: {}
Repository: {}
File: {}
Label: {}
Content: {}
"""

   EOS_TOKEN = tokenizer.eos_token

   def formatting_prompts_func(examples):
       # Documentation records have no "repository"/"file" keys; datasets fills
       # those columns with None, which formats as the string "None" below.
       sources = examples["source"]
       repositories = examples["repository"]
       files = examples["file"]
       labels = examples["label"]
       contents = examples["content"]
       texts = []
       for source, repository, file, label, content in zip(sources, repositories, files, labels, contents):
           text = custom_prompt.format(source, repository, file, label, content) + EOS_TOKEN
           texts.append(text)
       return {"text": texts}

   dataset = dataset.map(formatting_prompts_func, batched=True)

   trainer = AdaptiveTrainer(
       model=model,
       tokenizer=tokenizer,
       train_dataset=dataset,
       dataset_text_field="text",
       max_seq_length=max_seq_length,
       dataset_num_proc=2,
       packing=False,
       data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
       args=TrainingArguments(
           per_device_train_batch_size=2,
           gradient_accumulation_steps=4,
           warmup_steps=5,
           num_train_epochs=1,
           learning_rate=2e-4,
           fp16=not is_bfloat16_supported(),
           bf16=is_bfloat16_supported(),
           logging_steps=1,
           optim="adamw_8bit",
           weight_decay=0.01,
           lr_scheduler_type="linear",
           seed=3407,
           output_dir="outputs",
           save_strategy="steps",
           save_steps=50,
           eval_steps=1,  # evaluation only runs with an eval_dataset and an evaluation strategy; see the sketch after this block
       ),
   )

   print_memory_stats("Before Training")
   trainer_stats = trainer.train(resume_from_checkpoint=True)  # set to False (or drop) on the first run, when no checkpoint exists yet
   print_memory_stats("After Training")

   model.save_pretrained("lora_model")
   tokenizer.save_pretrained("lora_model")
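
Note that AdaptiveTrainer's adaptive logic only fires when evaluation actually runs, which requires an eval split and an evaluation strategy; neither is set above. Here is a minimal sketch of the extra wiring, reusing the objects already defined (the 10% split and the 50-step cadence are arbitrary choices):

    # Carve out a small eval split so Trainer.evaluate() has data to work with.
    split = dataset.train_test_split(test_size=0.1, seed=3407)

    trainer = AdaptiveTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=split["train"],
        eval_dataset=split["test"],
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            num_train_epochs=1,
            learning_rate=2e-4,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            optim="adamw_8bit",
            output_dir="outputs",
            evaluation_strategy="steps",  # actually trigger evaluation
            eval_steps=50,                # evaluate every 50 steps
            save_strategy="steps",
            save_steps=50,
        ),
    )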

Step 7: Using the Fine-Tuned Model

Now that our model is fine-tuned, it's time to use it! This is like Alice finally understanding the Wonderland rules and using them to her advantage.

  1. Using the Fine-Tuned Model:
   from unsloth import FastLanguageModel
   from transformers import TextStreamer
   import torch

   max_seq_length = 2048
   dtype = None
   load_in_4bit = True

   print("Loading fine-tuned model")
   model, tokenizer = FastLanguageModel.from_pretrained(
       model_name="lora_model",
       max_seq_length=max_seq_length,
       dtype=dtype,
       load_in_4bit=load_in_4bit,
       token="token"
   )

   FastLanguageModel.for_inference(model)
   inputs = tokenizer(
       [
           """
           <s>
           Q: What is the capital of France?
           A:
           """
       ],
       return_tensors="pt"
   ).to("cuda")

   text_streamer = TextStreamer(tokenizer)
   outputs = model.generate(**inputs, streamer=text_streamer, max_new_tokens=64)
   print(tokenizer.batch_decode(outputs))
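
Because the model was fine-tuned on the custom_prompt template from Step 6, completions are generally more on-topic when the prompt follows that same shape. A small sketch reusing the template (the field values here are illustrative):

    prompt = "Source: GitHub\nRepository: autogen\nFile: autogen/README.md\nLabel: autogen\nContent:"
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)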

Conclusion

In this comprehensive guide, we've walked through creating a dataset, fine-tuning a model, and using the fine-tuned model with the help of Unsloth. By following this guide, you can navigate through the intricacies of data scraping, dataset creation, and model fine-tuning with ease, much like Alice's adventure in Wonderland. Happy exploring!

Complete Code Notebook

Here’s the complete code notebook for your reference:


# Step 1: Setting Up the Environment
!pip install beautifulsoup4 gitpython huggingface_hub datasets requests

# Step 2: Cloning and Pulling the Repository
import os
import json
import requests
from bs4 import BeautifulSoup
from git import Repo
from huggingface_hub import HfApi
from datasets import Dataset, DatasetDict

def verbose_print(message):
    print(f"[INFO] {message}")

def clone_or_pull_repo(repo_url, repo_name):
    if os.path.exists(repo_name):
        verbose_print(f"Repository {repo_name} already exists. Pulling latest changes.")
        repo = Repo(repo_name)
        repo.remotes.origin.pull()
    else:
        verbose_print(f"Cloning repository from {repo_url}")
        Repo.clone_from(repo_url, repo_name)

def extract_markdown_files(repo_path):
    verbose_print(f"Extracting Markdown files from {repo_path}")
    markdown_files = []
    for root, dirs, files in os.walk(repo_path):
        for file in files:
            if file.endswith(".md"):
                markdown_files.append(os.path.join(root, file))
    return markdown_files

# Step 3: Parsing and Scraping Content
def parse_markdown(file_path):
    verbose_print(f"Parsing Markdown file {file_path}")
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    sections = content.split('\n## ')
    parsed_sections = [section.replace('\n', ' ') for section in sections]
    return parsed_sections

def get_page_links(base_url, link_selector):
    verbose_print(f"Getting page links from {base_url}")
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    page_links = []
    for link in soup.select(link_selector):
        href = link['href']
        if not href.startswith(('http', '#')):  # keep relative links; skip absolute URLs and in-page anchors
            href = base_url.rstrip('/') + '/' + href.lstrip('/')
            page_links.append(href)
    return page_links

def scrape_page(url, content_selector):
    verbose_print(f"Scraping content from {url}")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    page_content = []
    main_content = soup.select_one(content_selector)
    if main_content:
        sections = main_content.find_all(['h1', 'h2', 'h3', 'p', 'pre'])
        for section in sections:
            page_content.append(section.text)
    return page_content

# Step 4: Creating and Saving the Dataset
def create_dataset(repo_url, doc_urls):
    dataset = []

    # Scrape GitHub repository
    repo_name = repo_url.split('/')[-1].replace('.git', '')
    clone_or_pull_repo(repo_url, repo_name)
    markdown_files = extract_markdown_files(repo_name)
    for md_file in markdown_files:
        sections = parse_markdown(md_file)
        for section in sections:
            dataset.append({
                'source': 'GitHub',
                'repository': repo_name,
                'file': md_file,
                'label': 'autogen',
                'content': section
            })

    # Scrape documentation site
    for doc_url, link_selector, content_selector in doc_urls:
        page_links = get_page_links(doc_url, link_selector)
        for page_url in page_links:
            page_content = scrape_page(page_url, content_selector)
            for section in page_content:
                dataset.append({
                    'source': 'Documentation',
                    'url': page_url,
                    'label': 'autogen',
                    'content': section
                })

    return dataset

def load_dataset_locally(file_path):
    if os.path.exists(file_path):
        verbose_print(f"Loading existing dataset from {file_path}")
        with open(file_path, 'r') as file:
            return json.load(file)
    verbose_print(f"No existing dataset found at {file_path}")
    return []

def save_dataset_locally(dataset, output_file):
    verbose_print(f"Saving dataset to {output_file}")
    with open(output_file, 'w') as file:
        json.dump(dataset, file, indent=4)
    verbose_print("Dataset saved successfully")

# Step 5: Uploading to Hugging Face
def upload_to_huggingface(dataset, repo_id):
    token = os.getenv("HF_TOKEN")
    verbose_print(f"Uploading dataset to Hugging Face with repository ID {repo_id}")
    hf_api = HfApi()
    hf_api.create_repo(repo_id, token=token, repo_type="dataset", private=False, exist_ok=True)  # exist_ok avoids an error on re-runs

    # Create a DatasetDict and push to hub
    dataset_dict = DatasetDict({"

train": Dataset.from_list(dataset)})
    dataset_dict.push_to_hub(repo_id, token=token)
    verbose_print(f"Dataset uploaded to Hugging Face with repository ID {repo_id}")

# Example Usage
repo_url = 'https://github.com/microsoft/autogen.git'
doc_urls = [
    ('https://microsoft.github.io/autogen/docs/', 'a[href]', 'div.md-content'),
    ('https://microsoft.github.io/autogen/docs/Examples', 'a[href]', 'div.md-content'),
    ('https://microsoft.github.io/autogen/docs/notebooks', 'a[href]', 'div.md-content'),
    ('https://microsoft.github.io/autogen/blog', 'a[href]', 'div.blog-content')
]
output_file = 'autogen_python_dataset.json'
repo_id = 'dimentox/autogen-python'

verbose_print("Starting dataset creation process")
existing_dataset = load_dataset_locally(output_file)
new_dataset = create_dataset(repo_url, doc_urls)
combined_dataset = existing_dataset + new_dataset
save_dataset_locally(combined_dataset, output_file)
upload_to_huggingface(combined_dataset, repo_id)
verbose_print("Dataset creation and upload process completed")

# Step 6: Fine-Tuning the Model
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from datasets import load_dataset
import torch

class AdaptiveTrainer(SFTTrainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.prev_eval_loss = float('inf')

    def evaluate(self, *args, **kwargs):
        # Trainer.evaluate() returns a metrics dict containing "eval_loss";
        # we hook it here because the Trainer has no evaluation_step() method.
        output = super().evaluate(*args, **kwargs)
        current_eval_loss = output['eval_loss']

        # Adaptive learning rate adjustment. Note: the LR scheduler recomputes
        # the rate each step, so a full fix would also update
        # self.optimizer.param_groups.
        if current_eval_loss > self.prev_eval_loss:
            self.args.learning_rate *= 0.9  # reduce learning rate if loss increased
            print(f"Decreased learning rate to: {self.args.learning_rate}")
        else:
            self.args.learning_rate *= 1.05  # slightly increase if loss decreased
            print(f"Increased learning rate to: {self.args.learning_rate}")

        self.prev_eval_loss = current_eval_loss
        return output

    def training_step(self, *args, **kwargs):
        # Log the gradient norm periodically (clip_grad_norm_ returns the total
        # norm before clipping; the Trainer already clips to max_grad_norm itself).
        if self.state.global_step > 0 and self.state.global_step % self.args.eval_steps == 0:
            current_grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)
            print(f"Gradient norm at step {self.state.global_step}: {current_grad_norm}")

        return super().training_step(*args, **kwargs)

def print_memory_stats(stage):
    gpu_stats = torch.cuda.get_device_properties(0)
    used_memory = round(torch.cuda.memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"[{stage}] GPU: {gpu_stats.name}, Memory Reserved: {used_memory} GB / {max_memory} GB")

max_seq_length = 2048
dtype = None  # auto-detect; set torch.float16 or torch.bfloat16 to force
load_in_4bit = True

print("Loading model")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token="token"
)

print("Loading Laura")
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

print("Loading dataset")
dataset_path = "autogen_python_dataset.json"
dataset = load_dataset("json", data_files=dataset_path, split="train")

custom_prompt = """Source: {}
Repository: {}
File: {}
Label: {}
Content: {}
"""

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    sources = examples["source"]
    repositories = examples["repository"]
    files = examples["file"]
    labels = examples["label"]
    contents = examples["content"]
    texts = []
    for source, repository, file, label, content in zip(sources, repositories, files, labels, contents):
        text = custom_prompt.format(source, repository, file, label, content) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)

trainer = AdaptiveTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        save_strategy="steps",
        save_steps=50,
        eval_steps=1,  # evaluation only runs with an eval_dataset and an evaluation strategy
    ),
)

print_memory_stats("Before Training")
trainer_stats = trainer.train(resume_from_checkpoint=True)  # set to False (or drop) on the first run, when no checkpoint exists yet
print_memory_stats("After Training")

model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

# Step 7: Using the Fine-Tuned Model
from unsloth import FastLanguageModel
from transformers import TextStreamer
import torch

max_seq_length = 2048
dtype = None
load_in_4bit = True

print("Loading fine-tuned model")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="lora_model",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token="TOKEN"
)

FastLanguageModel.for_inference(model)
inputs = tokenizer(
    [
        """
        <s>
        Q: What is the capital of France?
        A:
        """
    ],
    return_tensors="pt"
).to("cuda")

text_streamer = TextStreamer(tokenizer)
outputs = model.generate(**inputs, streamer=text_streamer, max_new_tokens=64)
print(tokenizer.batch_decode(outputs))