from transformers import AutoTokenizer
from tqdm import tqdm
import gradio as gr
import pandas as pd
from datasets import load_dataset
import random
from pathlib import Path
|

initial_list_of_models = [
    "riotu-lab/Aranizer-PBE-86k",
    "riotu-lab/Aranizer-PBE-64k",
    "riotu-lab/Aranizer-PBE-32k",
    "riotu-lab/Aranizer-SP-86k",
    "riotu-lab/Aranizer-SP-64k",
    "riotu-lab/Aranizer-SP-32k",
    "asafaya/bert-base-arabic",
    "inceptionai/jais-family-30b-16k",
    "Xenova/gpt-4o",
    "FreedomIntelligence/AceGPT-v1.5-13B-Chat",
    "FreedomIntelligence/AceGPT-13B",
    "Qwen/Qwen2.5-72B-Instruct",
    "microsoft/Phi-3-mini-128k-instruct",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/Llama-3.3-70B-Instruct",
    "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus",
    "CohereForAI/aya-101",
]
|

# Load the cached leaderboard if it exists; otherwise start with an empty frame.
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
if dataframe_path.exists():
    df = pd.read_json(dataframe_path, lines=True)
else:
    df = pd.DataFrame(
        columns=[
            "👳 Tokenize Tashkeel",
            "📛 Models",
            "🪺 Fertility Score",
            "➕ Total Number of Tokens",
            "📘 Vocab Size",
            "Tokenizer Class",
        ]
    )
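
# For reference, each leaderboard row is serialized as one JSON object per line.
# A stored row looks roughly like this (illustrative values, not real measurements):
# {"👳 Tokenize Tashkeel": "✅", "📛 Models": "org/model", "🪺 Fertility Score": 1.85,
#  "➕ Total Number of Tokens": 1234567, "📘 Vocab Size": 64000,
#  "Tokenizer Class": "PreTrainedTokenizerFast"}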
|

# Evaluation corpora: Classical Arabic parallel texts, Arabic quotes, and
# Moroccan Arabic Wikipedia. They are concatenated into one benchmark corpus.
arabic_dataset1 = load_dataset("ImruQays/Rasaif-Classical-Arabic-English-Parallel-texts", split="train")["ar"]
arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"]
arabic_dataset3 = load_dataset("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train")["text"]
all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3
print(f"Total number of samples: {len(all_data)}")
all_text = " ".join(all_data)
all_words = all_text.split()
|


def benchmark_tokenizer(model_name) -> dict:
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    total_number_of_tokens = len(tokenizer.tokenize(all_text))

    # Round-trip a fully vocalized sentence to check whether the tokenizer
    # preserves tashkeel (Arabic diacritics).
    dummy_text = "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ"
    tokenized_text = tokenizer.decode(tokenizer.encode(dummy_text), skip_special_tokens=True)
    tashkeel_maintainer = "✅" if tokenized_text == dummy_text else "❌"

    return {
        "👳 Tokenize Tashkeel": tashkeel_maintainer,
        "📛 Models": model_name,
        "🪺 Fertility Score": round(total_number_of_tokens / len(all_words), 3),
        "📘 Vocab Size": vocab_size,
        "➕ Total Number of Tokens": total_number_of_tokens,
        "Tokenizer Class": tokenizer.__class__.__name__,
    }
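
# Example usage (illustrative: requires access to the Hugging Face Hub, and the
# exact numbers depend on the corpus loaded above):
#   row = benchmark_tokenizer("asafaya/bert-base-arabic")
#   row["🪺 Fertility Score"]  # e.g. 2.104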
|

# Benchmark any models that are not already in the leaderboard.
for model_name in tqdm(initial_list_of_models):
    if model_name in df["📛 Models"].values:
        continue
    benchmark_data = benchmark_tokenizer(model_name)
    df = pd.concat([df, pd.DataFrame([benchmark_data])], ignore_index=True)

# Rank by total token count (lower means a more efficient tokenizer) and persist.
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
|


def submit(model_name):
    global df
    # Already benchmarked: just return the current components unchanged.
    if model_name in df["📛 Models"].values:
        return (
            gr.Dataframe(df),
            gr.BarPlot(df),
            gr.Dropdown(choices=df["📛 Models"].tolist()),
        )
    benchmark_data = benchmark_tokenizer(model_name)
    df = pd.concat([df, pd.DataFrame([benchmark_data])], ignore_index=True)
    df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
    df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
    return (
        gr.Dataframe(df),
        gr.BarPlot(df),
        gr.Dropdown(choices=df["📛 Models"].tolist()),
    )
|


def generate_distinct_colors(n):
    """Generate n visually distinct colors in hexadecimal format."""
    if n > 256**3:
        raise ValueError("Cannot generate more than 16,777,216 unique colors.")

    # Estimate a per-channel grid spacing so n colors spread across RGB space:
    # the cube root of the color-space size divided by the cube root of n.
    spacing = max(1, int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3)))

    used_colors = set()
    result = []

    attempts = 0
    while len(result) < n:
        # Sample each channel from the grid of multiples of `spacing`.
        r = random.randint(0, 255 // spacing) * spacing
        g = random.randint(0, 255 // spacing) * spacing
        b = random.randint(0, 255 // spacing) * spacing

        color = f"#{r:02X}{g:02X}{b:02X}"

        if color not in used_colors:
            used_colors.add(color)
            result.append(color)
        else:
            attempts += 1
            if attempts > 50:
                # Too many collisions: refine the grid and keep going.
                spacing = max(1, spacing - 1)
                attempts = 0

    return result
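
# Example usage (output is random; the hex values below are illustrative):
#   generate_distinct_colors(3)  # -> ["#6C00D8", "#00B26C", "#D86C00"]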
|


def decode_bpe_tokens(tokens):
    # Byte-level BPE tokenizers render a leading space as "Ġ"; map it back to a
    # real space so tokens display naturally. The UTF-8 round trip is defensive.
    fixed_tokens = []
    for token in tokens:
        if token.startswith("Ġ"):
            try:
                fixed_token = " " + token[1:].encode("utf-8").decode("utf-8")
            except UnicodeDecodeError:
                fixed_token = token
        else:
            try:
                fixed_token = token.encode("utf-8").decode("utf-8")
            except UnicodeDecodeError:
                fixed_token = token
        fixed_tokens.append(fixed_token)
    return fixed_tokens
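
# Example:
#   decode_bpe_tokens(["Ġhello", "world"])  # -> [" hello", "world"]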
|


def tokenize_text(text, chosen_model, better_tokenization=False):
    tokenizer = AutoTokenizer.from_pretrained(chosen_model)
    tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
    random_colors = generate_distinct_colors(len(tokenized_text))

    if better_tokenization:
        # Re-align each token with the original text: grow a prefix character by
        # character until tokenizing it reproduces the current token, so the
        # highlighted spans match the raw input exactly (useful for Arabic).
        final_tokenized_text = []
        for token in tokenized_text:
            correct_tokenized_text = ""
            for char in text:
                correct_tokenized_text += char
                current_token = decode_bpe_tokens(
                    tokenizer.tokenize(correct_tokenized_text)
                )
                if current_token[0] == token:
                    final_tokenized_text.append(correct_tokenized_text)
                    text = text[len(correct_tokenized_text):]
                    break
    else:
        final_tokenized_text = tokenized_text
    print(final_tokenized_text)

    # Pair each token with a positional label and give every label its own color
    # (the label used in `output` must match the key used in `color_map`).
    output = []
    color_map = {}
    for idx, token in enumerate(final_tokenized_text):
        output.append((token, str(idx)))
        color_map[str(idx)] = random_colors[idx % len(random_colors)]

    return gr.HighlightedText(output, color_map=color_map)
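
# Example usage (illustrative): returns a gr.HighlightedText component whose
# value pairs each token with a color-coded label, e.g.
#   tokenize_text("السلام عليكم", "Xenova/gpt-4o")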
|


def refresh():
    # Reload the leaderboard from disk and rebuild the UI components.
    global df
    df = pd.read_json(dataframe_path, lines=True)
    return (
        gr.Dataframe(df),
        gr.BarPlot(df),
        gr.Dropdown(choices=df["📛 Models"].tolist()),
    )
|


leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens obtained from the Arabic section of the [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset (this dataset was chosen because it represents Arabic Fusha text in a small and concentrated form).

**A tokenizer that scores high in this leaderboard should be efficient in parsing Arabic in its different dialects and forms.**

## Updates/Notes:
1. New datasets were added to the evaluation (e.g. [arabic-quotes](https://huggingface.co/datasets/HeshamHaroon/arabic-quotes), [Moroccan_Arabic_Wikipedia_20230101_nobots](https://huggingface.co/datasets/SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots)).
1. `Fertility Score` is calculated by dividing the total number of tokens by the total number of words in the dataset (lower is better). For example, 100 words tokenized into 150 tokens give a fertility score of 1.5.
1. `Tokenize Tashkeel` indicates whether the tokenizer maintains the tashkeel when tokenizing (`✅` for yes, `❌` for no).
1. `Vocab Size` is the total number of tokens in the tokenizer's vocabulary (e.g. `10000` tokens).
1. `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`).
1. `Total Number of Tokens` is the total number of tokens in the dataset after tokenization (lower is better).

**Note**: Press `Refresh` to get the latest data available in the leaderboard (the initial state may be misleading).
"""
|


with gr.Blocks() as demo:
    gr.HTML("<center><h1>Arabic Tokenizers Leaderboard</h1></center>")
    gr.Markdown("## What is the best tokenizer for Arabic?")
    gr.Markdown(leaderboard_description)
    with gr.Tab(label="Leaderboard"):
        dataframe = gr.Dataframe(df)
        with gr.Accordion("Barplot", open=False):
            barplot = gr.BarPlot(
                df,
                x="📛 Models",
                y="➕ Total Number of Tokens",
                x_title=" ",
                y_title=" ",
                width=1000,
                height=400,
                tooltip=["📘 Vocab Size", "🪺 Fertility Score"],
                vertical=False,
                x_label_angle=30,
            )
        model_name = gr.Textbox(
            label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
        )
        with gr.Row():
            submit_new_model_btn = gr.Button(
                value="Submit New Model", variant="primary", scale=3
            )
            refresh_btn = gr.Button(value="Refresh", variant="secondary", scale=1)
    with gr.Tab(label="Try tokenizers"):
        text = gr.Textbox(
            label="Enter a text",
            lines=5,
            value="السلام عليكم ورحمة الله",
            rtl=True,
            text_align="right",
        )
        dropdown = gr.Dropdown(
            label="Select a model",
            choices=df["📛 Models"].tolist(),
            value=df["📛 Models"].tolist()[0],
        )
        with gr.Row():
            submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
            checkbox = gr.Checkbox(
                label="Better tokenization for Arabic Text", value=False, scale=1
            )
        tokenized_textbox = gr.HighlightedText(label="Tokenized text")

    submit_new_model_btn.click(
        submit, inputs=model_name, outputs=[dataframe, barplot, dropdown]
    )
    refresh_btn.click(refresh, outputs=[dataframe, barplot, dropdown])
    submit_text_btn.click(
        tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox]
    )

demo.launch()
|