# MohamedRashad's picture
# Update app.py
# 36b7eaa verified
from transformers import AutoTokenizer
from tqdm import tqdm
import gradio as gr
import pandas as pd
from datasets import load_dataset
import random
from pathlib import Path
# Model identifiers benchmarked at start-up: the loop further down this file
# passes each entry to benchmark_tokenizer() unless the saved leaderboard
# already lists it.
initial_list_of_models = [
    "riotu-lab/Aranizer-PBE-86k",
    "riotu-lab/Aranizer-PBE-64k",
    "riotu-lab/Aranizer-PBE-32k",
    "riotu-lab/Aranizer-SP-86k",
    "riotu-lab/Aranizer-SP-64k",
    "riotu-lab/Aranizer-SP-32k",
    "asafaya/bert-base-arabic",
    "inceptionai/jais-family-30b-16k",
    "Xenova/gpt-4o",
    "FreedomIntelligence/AceGPT-v1.5-13B-Chat",
    "FreedomIntelligence/AceGPT-13B",
    "Qwen/Qwen2.5-72B-Instruct",
    "microsoft/Phi-3-mini-128k-instruct",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/Llama-3.3-70B-Instruct",
    "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus",
    "CohereForAI/aya-101",
]
# The leaderboard is persisted as a JSON Lines file next to this script.
dataframe_path = Path(__file__).parent.joinpath("arabic_tokenizers_leaderboard.jsonl")

# Start from an empty table on the first run; otherwise reuse the saved rows.
if not dataframe_path.exists():
    df = pd.DataFrame(
        columns=[
            "👳 Tokenize Tashkeel",
            "📛 Models",
            "🪺 Fertility Score",
            "➕ Total Number of Tokens",
            "📘 Vocab Size",
            "Tokenizer Class",
        ]
    )
else:
    df = pd.read_json(dataframe_path, lines=True)
# Evaluation corpus: one text column from the "train" split of each dataset.
# The three columns are concatenated and used for every token count.
arabic_dataset1, arabic_dataset2, arabic_dataset3 = (
    load_dataset(repo_id, split="train")[column]
    for repo_id, column in (
        ("ImruQays/Rasaif-Classical-Arabic-English-Parallel-texts", "ar"),
        ("HeshamHaroon/arabic-quotes", "quote"),
        ("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", "text"),
    )
)
all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3
print(f"Total number of samples: {len(all_data)}")

# One big string for tokenization, plus its whitespace-separated words, which
# serve as the denominator of the fertility score.
all_text = " ".join(all_data)
all_words = all_text.split()
def benchmark_tokenizer(model_name) -> dict:
    """Benchmark one tokenizer on the evaluation corpus.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        A dict keyed by the leaderboard column names: the tashkeel round-trip
        indicator, the model name, the fertility score (total tokens divided
        by the number of whitespace-separated words, rounded to 3 decimals),
        the vocabulary size, the total token count and the tokenizer class
        name.
    """
    # NOTE(security): trust_remote_code=True allows the referenced repository
    # to execute its own code in this process, and `submit` forwards
    # user-typed names here. Left unchanged so tokenizers that ship custom
    # code keep loading, but this deserves a deliberate decision.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    total_number_of_tokens = len(tokenizer.tokenize(all_text))

    # A tokenizer "maintains" tashkeel when a fully diacritized sentence
    # survives an encode/decode round trip unchanged.
    dummy_text = "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ"
    round_tripped = tokenizer.decode(
        tokenizer.encode(dummy_text), skip_special_tokens=True
    )
    tashkeel_maintainer = "✅" if round_tripped == dummy_text else "❌"

    return {
        "👳 Tokenize Tashkeel": tashkeel_maintainer,
        "📛 Models": model_name,
        "🪺 Fertility Score": round(total_number_of_tokens / len(all_words), 3),
        "📘 Vocab Size": vocab_size,
        "➕ Total Number of Tokens": total_number_of_tokens,
        "Tokenizer Class": tokenizer.__class__.__name__,
    }
# Benchmark every default model that the saved leaderboard does not list yet.
known_models = set(df["📛 Models"].values)
new_rows = []
for model_name in tqdm(initial_list_of_models):
    if model_name in known_models:
        continue
    new_rows.append(benchmark_tokenizer(model_name))
    known_models.add(model_name)

if new_rows:
    # DataFrame.append was removed in pandas 2.0 and `_append` is a private
    # helper; pd.concat is the supported API. Concatenating once also avoids
    # re-allocating the whole frame for every new row.
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

# Sort the dataframe by the number of tokens (fewest first)
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
# Persist the leaderboard as JSON Lines, keeping non-ASCII text readable
df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
def submit(model_name):
    """Add ``model_name`` to the leaderboard and return refreshed widgets.

    A model that is already listed is not benchmarked again. A new model is
    benchmarked, appended, and the leaderboard is re-sorted by total token
    count and written back to ``dataframe_path``.

    Args:
        model_name: Identifier typed by the user; forwarded to
            ``benchmark_tokenizer``.

    Returns:
        A ``(gr.Dataframe, gr.BarPlot, gr.Dropdown)`` tuple built from the
        current leaderboard.
    """
    global df
    if model_name not in df["📛 Models"].values:
        benchmark_data = benchmark_tokenizer(model_name)
        # DataFrame.append was removed in pandas 2.0 and `_append` is private;
        # pd.concat is the supported way to add a row.
        df = pd.concat([df, pd.DataFrame([benchmark_data])], ignore_index=True)
        df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
        df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)

    # Single exit point: both the "already listed" and the "newly added" paths
    # return the same three component updates.
    return (
        gr.Dataframe(df),
        gr.BarPlot(df),
        gr.Dropdown(choices=df["📛 Models"].tolist()),
    )
def generate_distinct_colors(n):
    """Generate n distinct random colors in ``#RRGGBB`` hexadecimal format.

    Colors are drawn without replacement from an evenly spaced grid in RGB
    space, so any two results differ by at least ``spacing`` in one channel.

    Args:
        n: Number of colors wanted. Zero (or a negative value) yields ``[]``.

    Returns:
        A list of ``n`` unique upper-case hex color strings.

    Raises:
        ValueError: If more than 256**3 colors are requested.
    """
    if n > 256**3:
        raise ValueError("Cannot generate more than 16,777,216 unique colors.")
    if n <= 0:
        # Nothing to colour (e.g. empty text); also avoids dividing by zero
        # in the spacing estimate below.
        return []

    # Crude initial step: 256 channel values spread over roughly cbrt(n)
    # levels. Capped at 255 so every channel has at least two levels.
    spacing = min(255, max(1, int(256 / n ** (1 / 3))))
    levels = 255 // spacing + 1
    # Floating-point rounding can leave the grid slightly too small; tighten
    # the step until it holds n cells. At spacing == 1 the grid is the full
    # 256**3 cube, so this always terminates.
    while levels**3 < n:
        spacing -= 1
        levels = 255 // spacing + 1

    colors = []
    # Sampling cell indices without replacement guarantees uniqueness without
    # a retry loop.
    for cell in random.sample(range(levels**3), n):
        cell, blue = divmod(cell, levels)
        red, green = divmod(cell, levels)
        colors.append(
            f"#{red * spacing:02X}{green * spacing:02X}{blue * spacing:02X}"
        )
    return colors
def decode_bpe_tokens(tokens):
    """Replace the leading BPE space marker ``Ġ`` of each token with a space.

    Tokens that do not start with ``Ġ`` are returned unchanged. No byte-level
    decoding is performed: the previous ``encode("utf-8").decode("utf-8")``
    round trips were no-ops and their ``UnicodeDecodeError`` handlers could
    never run, so they were removed without changing the output.

    Args:
        tokens: Iterable of token strings as produced by ``tokenizer.tokenize``.

    Returns:
        A new list of strings, one per input token.
    """
    return [
        " " + token[1:] if token.startswith("Ġ") else token
        for token in tokens
    ]
def _align_tokens_with_text(tokenizer, text, tokens):
    """Map each token back to the literal slice of ``text`` that produces it.

    For every token, the remaining text is grown one character at a time until
    the first token of that prefix equals the token; the prefix is then
    recorded and consumed. A token that never matches is skipped.
    """
    pieces = []
    remaining = text
    for token in tokens:
        for end in range(1, len(remaining) + 1):
            prefix = remaining[:end]
            prefix_tokens = decode_bpe_tokens(tokenizer.tokenize(prefix))
            # A prefix may yield no tokens at all (e.g. whitespace only), so
            # check for emptiness before indexing.
            if prefix_tokens and prefix_tokens[0] == token:
                pieces.append(prefix)
                remaining = remaining[end:]
                break
    return pieces


def tokenize_text(text, chosen_model, better_tokenization=False):
    """Tokenize ``text`` with ``chosen_model`` and colour each token.

    Args:
        text: Text to tokenize.
        chosen_model: Identifier passed to ``AutoTokenizer.from_pretrained``.
        better_tokenization: When true, show the original text slice behind
            each token instead of the raw token string.

    Returns:
        A ``gr.HighlightedText`` whose value is a list of ``(piece, label)``
        pairs, where ``label`` is the piece's index as a string, and whose
        ``color_map`` assigns one colour to each label.
    """
    # Load with the same options as benchmark_tokenizer so every model on the
    # leaderboard (including ones that ship custom code) can be tried here.
    tokenizer = AutoTokenizer.from_pretrained(
        chosen_model, use_fast=True, trust_remote_code=True
    )
    tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))

    if better_tokenization:
        final_tokenized_text = _align_tokens_with_text(
            tokenizer, text, tokenized_text
        )
    else:
        final_tokenized_text = tokenized_text
    print(final_tokenized_text)

    if not final_tokenized_text:
        # Empty input: nothing to colour, and no palette is needed.
        return gr.HighlightedText(value=[], color_map={})

    random_colors = generate_distinct_colors(len(final_tokenized_text))
    output = []
    color_map = {}
    for idx, token in enumerate(final_tokenized_text):
        label = str(idx)
        output.append((token, label))
        # The key must equal the label used above; the former `idx + 1` key
        # left the first token without a colour.
        color_map[label] = random_colors[idx % len(random_colors)]
    # color_map is passed by keyword so the call does not depend on its
    # position in the component's signature.
    return gr.HighlightedText(value=output, color_map=color_map)
def refresh():
    """Reload the leaderboard from ``dataframe_path`` and rebuild the widgets.

    Returns:
        A ``(gr.Dataframe, gr.BarPlot, gr.Dropdown)`` tuple built from the
        freshly loaded data.
    """
    global df
    df = pd.read_json(dataframe_path, lines=True)

    table = gr.Dataframe(df)
    chart = gr.BarPlot(df)
    model_picker = gr.Dropdown(choices=df["📛 Models"].tolist())
    return table, chart, model_picker
# Markdown shown near the top of the page (passed to gr.Markdown in the layout
# below). This is user-facing text; the wording is left untouched.
# NOTE(review): the first sentence links to MohamedRashad/rasaif-translations,
# while the corpus code above loads
# ImruQays/Rasaif-Classical-Arabic-English-Parallel-texts — confirm which one
# the description should name.
leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens got from the Arabic section of [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset (This dataset was chosen because it represents Arabic Fusha text in a small and concentrated manner).
**A tokenizer that scores high in this leaderboard should be efficient in parsing Arabic in its different dialects and forms.**
## Updates/Notes:
1. New datasets is added for the evaluation (e.g. [arabic-quotes](https://huggingface.co/datasets/HeshamHaroon/arabic-quotes), [Moroccan_Arabic_Wikipedia_20230101_nobots](https://huggingface.co/datasets/SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots)).
1. `Fertility Score` is calculated by dividing the total number of tokens by the total number of words in the dataset (Lower is better).
1. `Tokenize Tashkeel` is an indicator of whether the tokenizer maintains the tashkeel when tokenizing or not (`✅` for yes, `❌` for no).
1. `Vocab Size` is the total number of tokens in the tokenizer's vocabulary (e.g. `10000` tokens).
1. `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`)
1. `Total Number of Tokens` is the total number of tokens in the dataset after tokenization (Lower is better).
**Note**: Press `Refresh` to get the latest data available in the leaderboard (The initial state may be deceiving).
"""
# Page layout and event wiring.
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been flattened — confirm it matches the intended layout.
with gr.Blocks() as demo:
    gr.HTML("<center><h1>Arabic Tokenizers Leaderboard</h1></center>")
    gr.Markdown("## What is the best tokenizer for Arabic?")
    gr.Markdown(leaderboard_description)

    # Tab 1: the leaderboard table, a collapsible bar plot and the controls
    # for submitting a new model or reloading the saved data.
    with gr.Tab(label="Leaderboard"):
        dataframe = gr.Dataframe(df)
        with gr.Accordion("Barplot", open=False):
            barplot = gr.BarPlot(
                df,
                x="📛 Models",
                y="➕ Total Number of Tokens",
                x_title=" ",
                y_title=" ",
                width=1000,
                height=400,
                tooltip=["📘 Vocab Size", "🪺 Fertility Score"],
                vertical=False,
                x_label_angle=30,
            )
        model_name = gr.Textbox(
            label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
        )
        with gr.Row():
            submit_new_model_btn = gr.Button(
                value="Submit New Model", variant="primary", scale=3
            )
            refresh_btn = gr.Button(value="Refresh", variant="secondary", scale=1)

    # Tab 2: interactive playground for a single tokenizer.
    with gr.Tab(label="Try tokenizers"):
        text = gr.Textbox(
            label="Enter a text",
            lines=5,
            value="السلام عليكم ورحمة الله",
            rtl=True,
            text_align="right",
        )
        model_choices = df["📛 Models"].tolist()
        dropdown = gr.Dropdown(
            label="Select a model",
            choices=model_choices,
            value=model_choices[0],
        )
        with gr.Row():
            submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
            checkbox = gr.Checkbox(
                label="Better tokenization for Arabic Text", value=False, scale=1
            )
        tokenized_textbox = gr.HighlightedText(label="Tokenized text")

    # Event wiring: both leaderboard buttons update the table, the plot and
    # the playground's model dropdown together.
    submit_new_model_btn.click(
        fn=submit, inputs=model_name, outputs=[dataframe, barplot, dropdown]
    )
    refresh_btn.click(fn=refresh, outputs=[dataframe, barplot, dropdown])
    submit_text_btn.click(
        fn=tokenize_text,
        inputs=[text, dropdown, checkbox],
        outputs=[tokenized_textbox],
    )

demo.launch()