"""Gradio app that renders the Thai Sentence Embedding Leaderboard.

The script builds a static table of per-task scores, recomputes each model's
average from its four task scores, sorts the models by that average
(best first) and shows the table in a Gradio Blocks page.
"""

import gradio as gr
import pandas as pd

from css_html_js import custom_css

TITLE = """

🇹🇭 Thai Sentence Embedding Leaderboard

"""

# Markdown shown above the table. Line breaks matter here: headings and list
# items must each start on their own line to be rendered as such.
INTRODUCTION_TEXT = """
📐 The 🇹🇭 Thai Sentence Embedding Leaderboard aims to track, rank and evaluate open embedding models on Thai sentence embedding tasks. Source code for evaluation at https://github.com/mrpeerat/Thai-Sentence-Vector-Benchmark, feel free to submit your own score at https://huggingface.co/spaces/panuthept/thai_sentence_embedding_benchmark/discussions.

## Dataset
The evaluation is conducted on 4 tasks across 8 datasets:

1. Semantic Textual Similarity (STS)
   - Translated STS-B, contains 1,379 test samples, https://github.com/mrpeerat/Thai-Sentence-Vector-Benchmark
2. Text Classification
   - Wisesight, contains 2,671 test samples, https://huggingface.co/datasets/pythainlp/wisesight_sentiment
   - Wongnai, contains 6,203 test samples, https://huggingface.co/datasets/Wongnai/wongnai_reviews
   - Generated Review, contains 17,453 test samples, https://huggingface.co/datasets/airesearch/generated_reviews_enth
3. Pair Classification
   - XNLI (Thai only), contains 3,340 test samples, https://github.com/facebookresearch/XNLI
4. Retrieval
   - XQuAD (Thai only), contains 1,190 test samples, https://huggingface.co/datasets/google/xquad
   - MIRACL (Thai only), contains 733 test samples, https://huggingface.co/datasets/miracl/miracl
   - TyDiQA (Thai only), contains 763 test samples, https://huggingface.co/datasets/chompk/tydiqa-goldp-th

## Metrics
The evaluation metric for each task is as follows:

1. STS: Spearman’s Rank Correlation
2. Text Classification: F1 Score
3. Pair Classification: Average Precision
4. Retrieval: MRR@10
"""

# Columns holding the per-task scores that feed the displayed average.
TASK_COLUMNS = (
    'STS (1 datasets)',
    'Classification (3 datasets)',
    'PairClassification (1 datasets)',
    'Retrieval (3 datasets)',
)

# One dict per model; the keys become the column headers of the table.
# NOTE: the 'Average (8 datasets)' values written here are superseded by the
# recomputation below, which replaces them before the table is built.
results = [
    {
        'Model Name': '[XLMR-base](https://huggingface.co/FacebookAI/xlm-roberta-base)',
        'Model Size (Million Parameters)': 279,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 37.95,
        'STS (1 datasets)': 44.48,
        'Classification (3 datasets)': 58.42,
        'PairClassification (1 datasets)': 57.62,
        'Retrieval (3 datasets)': 5.57,
    },
    {
        'Model Name': '[XLMR-large](https://huggingface.co/FacebookAI/xlm-roberta-large)',
        'Model Size (Million Parameters)': 561,
        'Embedding Dimensions': 1024,
        'Average (8 datasets)': 38.59,
        'STS (1 datasets)': 38.31,
        'Classification (3 datasets)': 59.51,
        'PairClassification (1 datasets)': 54.56,
        'Retrieval (3 datasets)': 11.80,
    },
    {
        'Model Name': '[WangchanBERTa](https://huggingface.co/airesearch/wangchanberta-base-att-spm-uncased)',
        'Model Size (Million Parameters)': 106,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 36.34,
        'STS (1 datasets)': 21.32,
        'Classification (3 datasets)': 55.46,
        'PairClassification (1 datasets)': 52.96,
        'Retrieval (3 datasets)': 19.49,
    },
    {
        'Model Name': '[PhayaThaiBERT](https://huggingface.co/clicknext/phayathaibert)',
        'Model Size (Million Parameters)': 278,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 55.38,
        'STS (1 datasets)': 51.56,
        'Classification (3 datasets)': 59.90,
        'PairClassification (1 datasets)': 59.67,
        'Retrieval (3 datasets)': 56.31,
    },
    {
        'Model Name': '[MPNet-multilingual](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2)',
        'Model Size (Million Parameters)': 278,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 66.14,
        'STS (1 datasets)': 80.49,
        'Classification (3 datasets)': 56.89,
        'PairClassification (1 datasets)': 84.14,
        'Retrieval (3 datasets)': 64.13,
    },
    {
        'Model Name': '[DistilUSE-multilingual](https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2)',
        'Model Size (Million Parameters)': 135,
        'Embedding Dimensions': 512,
        'Average (8 datasets)': 51.45,
        'STS (1 datasets)': 65.37,
        'Classification (3 datasets)': 50.93,
        'PairClassification (1 datasets)': 65.94,
        'Retrieval (3 datasets)': 42.72,
    },
    {
        'Model Name': '[BGE-M3 (dense only)](https://huggingface.co/BAAI/bge-m3)',
        'Model Size (Million Parameters)': 570,
        'Embedding Dimensions': 1024,
        'Average (8 datasets)': 75.64,
        'STS (1 datasets)': 77.22,
        'Classification (3 datasets)': 59.95,
        'PairClassification (1 datasets)': 79.02,
        'Retrieval (3 datasets)': 91.42,
    },
    {
        'Model Name': '[SimCSE-XLMR-base](https://huggingface.co/kornwtp/simcse-model-XLMR)',
        'Model Size (Million Parameters)': 279,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 53.83,
        'STS (1 datasets)': 63.98,
        'Classification (3 datasets)': 49.44,
        'PairClassification (1 datasets)': 61.87,
        'Retrieval (3 datasets)': 54.17,
    },
    {
        'Model Name': '[SimCSE-WangchanBERTa](https://huggingface.co/kornwtp/simcse-model-wangchanberta)',
        'Model Size (Million Parameters)': 106,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 54.01,
        'STS (1 datasets)': 60.73,
        'Classification (3 datasets)': 56.71,
        'PairClassification (1 datasets)': 59.14,
        'Retrieval (3 datasets)': 51.05,
    },
    {
        'Model Name': '[SimCSE-PhayaThaiBERT](https://huggingface.co/kornwtp/simcse-model-phayathaibert)',
        'Model Size (Million Parameters)': 278,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 60.02,
        'STS (1 datasets)': 67.82,
        'Classification (3 datasets)': 53.50,
        'PairClassification (1 datasets)': 63.35,
        'Retrieval (3 datasets)': 66.05,
    },
    {
        'Model Name': '[SCT-XLMR-base](https://huggingface.co/kornwtp/SCT-model-XLMR)',
        'Model Size (Million Parameters)': 279,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 57.69,
        'STS (1 datasets)': 68.91,
        'Classification (3 datasets)': 55.93,
        'PairClassification (1 datasets)': 66.49,
        'Retrieval (3 datasets)': 54.90,
    },
    {
        'Model Name': '[SCT-WangchanBERTa](https://huggingface.co/kornwtp/SCT-model-wangchanberta)',
        'Model Size (Million Parameters)': 106,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 62.22,
        'STS (1 datasets)': 71.35,
        'Classification (3 datasets)': 59.19,
        'PairClassification (1 datasets)': 67.04,
        'Retrieval (3 datasets)': 63.83,
    },
    {
        'Model Name': '[SCT-PhayaThaiBERT](https://huggingface.co/kornwtp/SCT-model-phayathaibert)',
        'Model Size (Million Parameters)': 278,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 63.28,
        'STS (1 datasets)': 74.08,
        'Classification (3 datasets)': 58.77,
        'PairClassification (1 datasets)': 65.87,
        'Retrieval (3 datasets)': 66.20,
    },
    {
        'Model Name': '[SCT-KD-XLMR-base](https://huggingface.co/kornwtp/SCT-KD-model-XLMR)',
        'Model Size (Million Parameters)': 279,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 65.37,
        'STS (1 datasets)': 78.78,
        'Classification (3 datasets)': 56.87,
        'PairClassification (1 datasets)': 79.78,
        'Retrieval (3 datasets)': 65.02,
    },
    {
        'Model Name': '[SCT-KD-WangchanBERTa](https://huggingface.co/kornwtp/SCT-KD-model-wangchanberta)',
        'Model Size (Million Parameters)': 106,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 63.55,
        'STS (1 datasets)': 77.77,
        'Classification (3 datasets)': 56.33,
        'PairClassification (1 datasets)': 77.04,
        'Retrieval (3 datasets)': 62.38,
    },
    {
        'Model Name': '[SCT-KD-PhayaThaiBERT](https://huggingface.co/kornwtp/SCT-KD-model-phayathaibert)',
        'Model Size (Million Parameters)': 278,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 66.00,
        'STS (1 datasets)': 77.80,
        'Classification (3 datasets)': 57.27,
        'PairClassification (1 datasets)': 77.84,
        'Retrieval (3 datasets)': 67.94,
    },
    {
        'Model Name': '[ConGen-XLMR-base](https://huggingface.co/kornwtp/ConGen-model-XLMR)',
        'Model Size (Million Parameters)': 279,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 66.84,
        'STS (1 datasets)': 79.69,
        'Classification (3 datasets)': 56.90,
        'PairClassification (1 datasets)': 81.47,
        'Retrieval (3 datasets)': 68.03,
    },
    {
        'Model Name': '[ConGen-WangchanBERTa](https://huggingface.co/kornwtp/ConGen-model-wangchanberta)',
        'Model Size (Million Parameters)': 106,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 67.17,
        'STS (1 datasets)': 78.78,
        'Classification (3 datasets)': 58.16,
        'PairClassification (1 datasets)': 82.43,
        'Retrieval (3 datasets)': 67.66,
    },
    {
        'Model Name': '[ConGen-PhayaThaiBERT](https://huggingface.co/kornwtp/ConGen-model-phayathaibert)',
        'Model Size (Million Parameters)': 278,
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 66.94,
        'STS (1 datasets)': 78.90,
        'Classification (3 datasets)': 57.63,
        'PairClassification (1 datasets)': 81.01,
        'Retrieval (3 datasets)': 68.04,
    },
    {
        'Model Name': '[E5-Mistral-7B-Instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct)',
        'Model Size (Million Parameters)': 7110,
        'Embedding Dimensions': 4096,
        'Average (8 datasets)': 71.94,
        'STS (1 datasets)': 75.52,
        'Classification (3 datasets)': 60.46,
        'PairClassification (1 datasets)': 68.04,
        'Retrieval (3 datasets)': 86.80,
    },
    {
        'Model Name': '[gte-Qwen2-7B-Instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct)',
        'Model Size (Million Parameters)': 7610,
        'Embedding Dimensions': 3584,
        'Average (8 datasets)': 49.31,
        'STS (1 datasets)': 51.60,
        'Classification (3 datasets)': 57.55,
        'PairClassification (1 datasets)': 61.73,
        'Retrieval (3 datasets)': 38.31,
    },
    {
        'Model Name': '[GritLM-7B](https://huggingface.co/GritLM/GritLM-7B)',
        'Model Size (Million Parameters)': 7240,
        'Embedding Dimensions': 4096,
        'Average (8 datasets)': 42.38,
        'STS (1 datasets)': 45.50,
        'Classification (3 datasets)': 56.83,
        'PairClassification (1 datasets)': 56.40,
        'Retrieval (3 datasets)': 22.79,
    },
    {
        'Model Name': '[Llama3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)',
        'Model Size (Million Parameters)': 8030,
        'Embedding Dimensions': 4096,
        'Average (8 datasets)': 51.63,
        'STS (1 datasets)': 49.48,
        'Classification (3 datasets)': 58.54,
        'PairClassification (1 datasets)': 57.76,
        'Retrieval (3 datasets)': 47.93,
    },
    {
        'Model Name': '[Llama3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)',
        'Model Size (Million Parameters)': 8030,
        'Embedding Dimensions': 4096,
        'Average (8 datasets)': 52.81,
        'STS (1 datasets)': 50.63,
        'Classification (3 datasets)': 58.85,
        'PairClassification (1 datasets)': 58.04,
        'Retrieval (3 datasets)': 50.38,
    },
    {
        'Model Name': '[Llama3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B)',
        'Model Size (Million Parameters)': 8030,
        'Embedding Dimensions': 4096,
        'Average (8 datasets)': 50.36,
        'STS (1 datasets)': 49.98,
        'Classification (3 datasets)': 58.18,
        'PairClassification (1 datasets)': 58.12,
        'Retrieval (3 datasets)': 43.64,
    },
    {
        'Model Name': '[Llama3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)',
        'Model Size (Million Parameters)': 8030,
        'Embedding Dimensions': 4096,
        'Average (8 datasets)': 50.06,
        'STS (1 datasets)': 49.76,
        'Classification (3 datasets)': 57.90,
        'PairClassification (1 datasets)': 57.47,
        'Retrieval (3 datasets)': 43.63,
    },
    {
        'Model Name': '[Typhoon-8B-Instruct](https://huggingface.co/scb10x/llama-3-typhoon-v1.5-8b-instruct)',
        'Model Size (Million Parameters)': 8030,
        'Embedding Dimensions': 4096,
        'Average (8 datasets)': 53.51,
        'STS (1 datasets)': 51.46,
        'Classification (3 datasets)': 58.91,
        'PairClassification (1 datasets)': 58.05,
        'Retrieval (3 datasets)': 52.65,
    },
    {
        'Model Name': 'Cohere-embed-multilingual-v2.0',
        'Model Size (Million Parameters)': "N/A",
        'Embedding Dimensions': 768,
        'Average (8 datasets)': 68.01,
        'STS (1 datasets)': 68.03,
        'Classification (3 datasets)': 57.31,
        'PairClassification (1 datasets)': 62.03,
        'Retrieval (3 datasets)': 85.23,
    },
    {
        'Model Name': 'Cohere-embed-multilingual-v3.0',
        'Model Size (Million Parameters)': "N/A",
        'Embedding Dimensions': 1024,
        'Average (8 datasets)': 74.86,
        'STS (1 datasets)': 77.87,
        'Classification (3 datasets)': 59.96,
        'PairClassification (1 datasets)': 73.28,
        'Retrieval (3 datasets)': 91.43,
    },
    {
        'Model Name': 'Openai-text-embedding-3-large',
        'Model Size (Million Parameters)': "N/A",
        'Embedding Dimensions': 3072,
        'Average (8 datasets)': 69.26,
        'STS (1 datasets)': 70.46,
        'Classification (3 datasets)': 58.79,
        'PairClassification (1 datasets)': 67.33,
        'Retrieval (3 datasets)': 83.87,
    },
]


def _with_task_average(entry: dict) -> dict:
    """Return a copy of *entry* with 'Average (8 datasets)' recomputed.

    The new value is the unweighted mean of the four task scores listed in
    ``TASK_COLUMNS``, rounded to two decimals. A task score missing from
    *entry* counts as 0. Because the key already exists in *entry*, it keeps
    its position, so the column order of the table is unaffected.
    """
    task_total = sum(entry.get(column, 0) for column in TASK_COLUMNS)
    return {
        **entry,
        'Average (8 datasets)': round(task_total / len(TASK_COLUMNS), 2),
    }


# Calculate average
results = [_with_task_average(result) for result in results]

# Sort by average, best model first (ties keep their original order).
results = sorted(results, key=lambda x: x['Average (8 datasets)'], reverse=True)

data = pd.DataFrame(results)

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    # 'markdown' datatype lets the "[name](url)" model names render as links.
    gr.DataFrame(data, datatype='markdown')

demo.launch()