Upload 5 files
- create_dataset_splits.py +59 -0
- eval.py +193 -0
- preprocess_dataset.py +103 -0
- train.py +86 -0
- train_tokenizers.py +52 -0
create_dataset_splits.py
ADDED
@@ -0,0 +1,59 @@
import random


def read_files(path):
    with open(path, "r", encoding="utf-8") as f:
        dataset_split = f.read().split("\n")[:-1]
    # normalize to lowercase (the corpus is kept lowercased throughout the pipeline)
    dataset_split = [line.lower() for line in dataset_split]
    return dataset_split


def save_list_to_file(file_path, string_list):
    with open(file_path, "w", encoding="utf-8") as file:
        file.writelines([f"{string}\n" for string in string_list])


# EUROPARL cs-en
# load the parallel corpus files
cs_file = 'datasets/europarl/europarl-v7.cs-en.cs'
en_file = 'datasets/europarl/europarl-v7.cs-en.en'
sentences_cs = read_files(cs_file)
sentences_en = read_files(en_file)

# create sentence pairs and split into train, valid and test (70 / 15 / 15)
pairs = list(zip(sentences_en, sentences_cs))
random.shuffle(pairs)
num_val_samples = int(0.15 * len(pairs))
num_train_samples = len(pairs) - 2 * num_val_samples

train_pairs = pairs[:num_train_samples]
valid_pairs = pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = pairs[num_train_samples + num_val_samples :]

print(train_pairs[0])
print(valid_pairs[0])
print(test_pairs[0])

en_train_samples = [pair[0] for pair in train_pairs]
cs_train_samples = [pair[1] for pair in train_pairs]
en_valid_samples = [pair[0] for pair in valid_pairs]
cs_valid_samples = [pair[1] for pair in valid_pairs]
en_test_samples = [pair[0] for pair in test_pairs]
cs_test_samples = [pair[1] for pair in test_pairs]

save_list_to_file("datasets/europarl/train-cs-en.en", en_train_samples)
save_list_to_file("datasets/europarl/train-cs-en.cs", cs_train_samples)
save_list_to_file("datasets/europarl/valid-cs-en.en", en_valid_samples)
save_list_to_file("datasets/europarl/valid-cs-en.cs", cs_valid_samples)
save_list_to_file("datasets/europarl/test-cs-en.en", en_test_samples)
save_list_to_file("datasets/europarl/test-cs-en.cs", cs_test_samples)
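Note: the split above is re-randomized on every run. A minimal, optional tweak (not part of the uploaded script) to make the generated train/valid/test files reproducible is to seed the RNG before shuffling:

# Hypothetical addition near the top of create_dataset_splits.py:
# a fixed seed makes random.shuffle(pairs) deterministic across runs
import random

random.seed(42)  # any fixed value works; 42 is arbitrary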
eval.py
ADDED
@@ -0,0 +1,193 @@
import keras_nlp
import keras
import tensorflow as tf
import re
import time
from sacrebleu.metrics import CHRF

# hyperparameters
MAX_SEQUENCE_LENGTH = 64

transformer = keras.models.load_model('models_europarl/en_cs_translator_saved_20231209_0046.keras')


def read_files(path, lowercase=False):
    with open(path, "r", encoding="utf-8") as f:
        dataset_split = f.read().split("\n")[:-1]
    # optionally lowercase (the training data was lowercased)
    if lowercase:
        dataset_split = [line.lower() for line in dataset_split]
    return dataset_split


en_vocab = read_files("tokenizers/en_europarl_vocab")
cs_vocab = read_files("tokenizers/cs_europarl_vocab")
en_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=en_vocab,
    lowercase=False
)
cs_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=cs_vocab,
    lowercase=False
)


def compute_probabilities(logits):
    return keras.activations.softmax(logits)


def next_token_logits(encoder_input_tokens, prompt, predicted_token_index):
    # the prediction for the next token sits at the index of the last generated token
    logits = transformer(
        [tf.expand_dims(encoder_input_tokens, axis=0), tf.expand_dims(prompt, axis=0)]
    )[:, predicted_token_index - 1, :]
    return logits


def greedy_decode(encoder_input_tokens, prompt, end_token_id):
    start_index = 1
    current_prompt = prompt
    for predicted_token_index in range(start_index, MAX_SEQUENCE_LENGTH):
        next_logits = next_token_logits(encoder_input_tokens, current_prompt, predicted_token_index)
        next_probabilities = compute_probabilities(next_logits)
        max_probability_token_id = tf.argmax(next_probabilities, axis=-1)  # index in the logits array equals the token id
        indices = tf.constant([[predicted_token_index]])
        data = tf.constant([max_probability_token_id.numpy()[0]])
        current_prompt = tf.tensor_scatter_nd_update(current_prompt, indices, data)
        # stop once the end token is generated
        if max_probability_token_id == end_token_id:
            break
    return current_prompt


def beam_decode(encoder_input_tokens, prompt, end_token_id, beam_size):
    start_index = 1
    # initial beam: top-k candidates for the first generated token
    next_logits = next_token_logits(encoder_input_tokens, prompt, start_index)
    next_probabilities = compute_probabilities(next_logits)
    top_k_probabilities, top_k_token_indices = tf.math.top_k(next_probabilities, k=beam_size)
    current_subsequencies = []
    for index, value in enumerate(top_k_token_indices.numpy()[0]):
        # keep beam_size versions of the prompt with the top-k tokens at index 1
        indices = tf.constant([[start_index]])
        data = tf.constant([value])
        current_prompt = tf.tensor_scatter_nd_update(prompt, indices, data)
        # store (prompt, log probability, length-normalized log probability); length is 1 here, so both are equal
        log_prob = tf.math.log(top_k_probabilities.numpy()[0][index])
        current_subsequencies.append((current_prompt, log_prob, log_prob))

    final_potential_solutions = []
    for predicted_token_index in range(start_index + 1, MAX_SEQUENCE_LENGTH):
        # stop once beam_size hypotheses have generated the end token
        if len(final_potential_solutions) == beam_size:
            break

        tmp_subsequencies = []
        for subseq_prompt, subseq_log_probability, _ in current_subsequencies:
            next_logits = next_token_logits(encoder_input_tokens, subseq_prompt, predicted_token_index)
            next_probabilities = compute_probabilities(next_logits)
            top_k_probabilities, top_k_token_indices = tf.math.top_k(next_probabilities, k=beam_size - len(final_potential_solutions))
            for index, value in enumerate(top_k_token_indices.numpy()[0]):
                # extend this hypothesis with each of its top-k next tokens
                indices = tf.constant([[predicted_token_index]])
                data = tf.constant([value])
                updated_subseq_prompt = tf.tensor_scatter_nd_update(subseq_prompt, indices, data)
                # keep the accumulated log probability and its length-normalized variant
                next_log_probability = tf.math.log(top_k_probabilities.numpy()[0][index])
                tmp_subsequencies.append((
                    updated_subseq_prompt,
                    subseq_log_probability + next_log_probability,
                    (subseq_log_probability + next_log_probability) / (predicted_token_index + 1),
                ))

        current_subsequencies = []
        current_sequences_to_find = beam_size - len(final_potential_solutions)
        tmp_subsequencies = sorted(tmp_subsequencies, key=lambda x: x[2], reverse=True)
        for i in range(current_sequences_to_find):
            if tmp_subsequencies[i][0][predicted_token_index] == end_token_id:
                final_potential_solutions.append(tmp_subsequencies[i])
            else:
                current_subsequencies.append(tmp_subsequencies[i])

    # pick the best finished hypothesis by normalized log probability
    final_potential_solutions = sorted(final_potential_solutions, key=lambda x: x[2], reverse=True)

    if len(final_potential_solutions) > 0:
        return final_potential_solutions[0][0]
    # no hypothesis reached the end token; fall back to the best unfinished one
    else:
        sorted_subs = sorted(current_subsequencies, key=lambda x: x[2], reverse=True)
        return sorted_subs[0][0]


def decode_sequences(input_sentence):
    # Tokenize the encoder input and pad it to MAX_SEQUENCE_LENGTH.
    encoder_input_tokens = en_tokenizer(input_sentence)
    if len(encoder_input_tokens) < MAX_SEQUENCE_LENGTH:
        pads = tf.fill((MAX_SEQUENCE_LENGTH - len(encoder_input_tokens),), 0)
        encoder_input_tokens = tf.concat([encoder_input_tokens, pads], 0)
    if len(encoder_input_tokens) > MAX_SEQUENCE_LENGTH:
        # input too long for the model; return a marker "translation"
        tensor_content = "[START] Exceeded. [END] [PAD] [PAD] [PAD] [PAD]"
        tensor = tf.constant([tensor_content], dtype=tf.string)
        return tensor

    # decoder prompt: [START] followed by padding
    start = tf.fill((1,), cs_tokenizer.token_to_id("[START]"))
    pads = tf.fill((MAX_SEQUENCE_LENGTH - 1,), cs_tokenizer.token_to_id("[PAD]"))
    prompt = tf.concat((start, pads), axis=-1)

    end_token_id = cs_tokenizer.token_to_id("[END]")

    generated_tokens = greedy_decode(encoder_input_tokens, prompt, end_token_id)
    # generated_tokens = beam_decode(encoder_input_tokens, prompt, end_token_id, 5)

    generated_sentences = cs_tokenizer.detokenize(tf.expand_dims(generated_tokens, axis=0))
    return generated_sentences


test_en = read_files('datasets/europarl/test-cs-en.en')
test_cs = read_files('datasets/europarl/test-cs-en.cs')
bleu_metrics = keras_nlp.metrics.Bleu(
    name="bleu",
    tokenizer=cs_tokenizer
)

eval_samples = 100
chrf = CHRF()
refs = test_cs[:eval_samples]
translations = []
start_time = time.time()

for i in range(len(refs)):
    cs_translated = decode_sequences(test_en[i])
    cs_translated = cs_translated.numpy()[0].decode("utf-8")
    cs_translated = (
        cs_translated.replace("[PAD]", "")
        .replace("[START]", "")
        .replace("[END]", "")
        .strip()
    )
    # remove spaces before punctuation
    cs_translated = re.sub(r'\s+([.,;!?:])', r'\1', cs_translated)
    print(cs_translated, flush=True)
    translations.append(cs_translated)

end_time = time.time()


# keras_nlp's Bleu expects one list of references per sample
refs_twodim = [[ref] for ref in refs]
bleu_metrics(refs_twodim, translations)

print("evaluating chrf", flush=True)
# sacrebleu expects a list of reference streams (one inner list per reference set)
chrf2_result = chrf.corpus_score(translations, [refs])

print("chrf2")
print(chrf2_result)
print("bleu")
print(bleu_metrics.result().numpy())
print("elapsed time")
elapsed_time = end_time - start_time
print(elapsed_time)
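A quick way to sanity-check the decoder outside the BLEU/chrF loop is to translate a single sentence. This is only a sketch and assumes the definitions above (transformer, the tokenizers, decode_sequences) are already loaded; the example sentence is made up:

sample_en = "i would like to thank the rapporteur."      # hypothetical input, lowercased like the training data
sample_cs = decode_sequences(sample_en).numpy()[0].decode("utf-8")
print(sample_cs)

# to try beam search instead of greedy decoding, swap the call inside decode_sequences:
# generated_tokens = beam_decode(encoder_input_tokens, prompt, end_token_id, 5)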
preprocess_dataset.py
ADDED
@@ -0,0 +1,103 @@
import keras_nlp
import tensorflow.data as tf_data

# hyperparameters
BATCH_SIZE = 16
MAX_SEQUENCE_LENGTH = 64


# load the WordPiece vocabularies into lists
def read_files(path, lowercase=False):
    with open(path, "r", encoding="utf-8") as f:
        dataset_split = f.read().split("\n")[:-1]
    # optionally lowercase (the corpus is kept lowercased throughout)
    if lowercase:
        dataset_split = [line.lower() for line in dataset_split]
    return dataset_split


# en_vocab = read_files("tokenizers/en_opus_vocab")
# cs_vocab = read_files("tokenizers/cs_opus_vocab")
en_vocab = read_files("tokenizers/en_europarl_vocab")
cs_vocab = read_files("tokenizers/cs_europarl_vocab")

en_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=en_vocab,
    lowercase=False
)
cs_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=cs_vocab,
    lowercase=False
)


# opus
# train_cs_file = 'datasets/cs-en/opus.cs-en-train.cs'
# train_en_file = 'datasets/cs-en/opus.cs-en-train.en'
# valid_cs_file = 'datasets/cs-en/opus.cs-en-dev.cs'
# valid_en_file = 'datasets/cs-en/opus.cs-en-dev.en'
# test_cs_file = 'datasets/cs-en/opus.cs-en-test.cs'
# test_en_file = 'datasets/cs-en/opus.cs-en-test.en'

# europarl
train_cs_file = 'datasets/europarl/train-cs-en.cs'
train_en_file = 'datasets/europarl/train-cs-en.en'
valid_cs_file = 'datasets/europarl/valid-cs-en.cs'
valid_en_file = 'datasets/europarl/valid-cs-en.en'
test_cs_file = 'datasets/europarl/test-cs-en.cs'
test_en_file = 'datasets/europarl/test-cs-en.en'


train_cs = read_files(train_cs_file, True)
train_en = read_files(train_en_file, True)
valid_cs = read_files(valid_cs_file, True)
valid_en = read_files(valid_en_file, True)
test_cs = read_files(test_cs_file, True)
test_en = read_files(test_en_file, True)


def preprocess_batch(en, cs):
    en = en_tokenizer(en)
    cs = cs_tokenizer(cs)

    # Pad `en` to MAX_SEQUENCE_LENGTH.
    en_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=en_tokenizer.token_to_id("[PAD]"),
    )
    en = en_start_end_packer(en)

    # Add special tokens ("[START]" and "[END]") to `cs` and pad it as well.
    cs_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH + 1,
        start_value=cs_tokenizer.token_to_id("[START]"),
        end_value=cs_tokenizer.token_to_id("[END]"),
        pad_value=cs_tokenizer.token_to_id("[PAD]"),
    )
    cs = cs_start_end_packer(cs)

    # decoder inputs are the target shifted right; labels are the target shifted left
    return (
        {
            "encoder_inputs": en,
            "decoder_inputs": cs[:, :-1],
        },
        cs[:, 1:],
    )


def make_dataset(en_texts, cs_texts):
    dataset = tf_data.Dataset.from_tensor_slices((en_texts, cs_texts))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf_data.AUTOTUNE)
    return dataset.shuffle(2048).prefetch(16).cache()


train_ds = make_dataset(train_en, train_cs)
val_ds = make_dataset(valid_en, valid_cs)

tf_data.Dataset.save(train_ds, "datasets/preprocessed_europarl_train")
tf_data.Dataset.save(val_ds, "datasets/preprocessed_europarl_valid")
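Since the decoder inputs and targets come from slicing the packed Czech sequence, it can be worth inspecting one batch before saving. A minimal sketch, assuming it is appended after make_dataset:

# Each batch should yield encoder inputs, shifted decoder inputs, and labels of length MAX_SEQUENCE_LENGTH
for inputs, targets in train_ds.take(1):
    print(inputs["encoder_inputs"].shape)   # (BATCH_SIZE, MAX_SEQUENCE_LENGTH)
    print(inputs["decoder_inputs"].shape)   # (BATCH_SIZE, MAX_SEQUENCE_LENGTH)
    print(targets.shape)                    # (BATCH_SIZE, MAX_SEQUENCE_LENGTH)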
train.py
ADDED
@@ -0,0 +1,86 @@
import keras_nlp
import keras
import tensorflow.data as tf_data
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import datetime

# hyperparameters
BATCH_SIZE = 16
LEARNING_RATE = 1e-4
EPOCHS = 20
EMBED_DIM = 256
INTERMEDIATE_DIM = 2048
NUM_HEADS = 8
# TODO probably change dynamically
MAX_SEQUENCE_LENGTH = 128
EN_VOCAB_SIZE = 30000
CS_VOCAB_SIZE = 30000

train_ds = tf_data.Dataset.load("datasets/preprocessed_europarl_train")
valid_ds = tf_data.Dataset.load("datasets/preprocessed_europarl_valid")

# Encoder
encoder_inputs = keras.Input(shape=(None,), name="encoder_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=EN_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(encoder_inputs)

encoder_outputs = keras_nlp.layers.TransformerEncoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(inputs=x)
encoder = keras.Model(encoder_inputs, encoder_outputs)


# Decoder
decoder_inputs = keras.Input(shape=(None,), name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=CS_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(decoder_inputs)

x = keras_nlp.layers.TransformerDecoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)
x = keras.layers.Dropout(0.5)(x)
decoder_outputs = keras.layers.Dense(CS_VOCAB_SIZE, activation="softmax")(x)
decoder = keras.Model(
    [
        decoder_inputs,
        encoded_seq_inputs,
    ],
    decoder_outputs,
)
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

# Full encoder-decoder model
transformer = keras.Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs,
    name="transformer",
)

transformer.summary()

optimizer = Adam(learning_rate=LEARNING_RATE)
transformer.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint(f'models_europarl/en_cs_translator_checkpoint_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.keras', save_best_only=True)

# The datasets were already batched in preprocess_dataset.py, so no batch_size is passed to fit().
transformer.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=valid_ds,
    callbacks=[early_stopping, model_checkpoint]
)
transformer.save(f'models_europarl/en_cs_translator_saved_{datetime.datetime.now().strftime("%Y%m%d_%H%M")}.keras')
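After training, the saved .keras file can be reloaded the same way eval.py does. A short sketch, with a placeholder timestamp in the filename (the real name is whatever transformer.save produced):

import keras
import keras_nlp  # importing keras_nlp registers its layers so load_model can deserialize them
import tensorflow.data as tf_data

model = keras.models.load_model("models_europarl/en_cs_translator_saved_YYYYMMDD_HHMM.keras")  # placeholder name
valid_ds = tf_data.Dataset.load("datasets/preprocessed_europarl_valid")
print(model.evaluate(valid_ds))  # [sparse categorical cross-entropy loss, accuracy]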
train_tokenizers.py
ADDED
@@ -0,0 +1,52 @@
import keras_nlp
import tensorflow.data as tf_data


EN_VOCAB_SIZE = 30000
CS_VOCAB_SIZE = 30000


def train_word_piece(text_samples, vocab_size, reserved_tokens, save_output_path):
    word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)
    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(1000).prefetch(2),
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
        vocabulary_output_file=save_output_path
    )
    return vocab


def read_files(path):
    with open(path, "r", encoding="utf-8") as f:
        dataset_split = f.read().split("\n")[:-1]
    # normalize to lowercase (the corpus is kept lowercased throughout)
    dataset_split = [line.lower() for line in dataset_split]
    return dataset_split


# OPUS cs-en
# train_cs = read_files('datasets/cs-en/opus.cs-en-train.cs')
# train_en = read_files('datasets/cs-en/opus.cs-en-train.en')


# EUROPARL cs-en
train_cs = read_files('datasets/europarl/train-cs-en.cs')
train_en = read_files('datasets/europarl/train-cs-en.en')


print(train_cs[0])
print(train_en[0])


reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

en_vocab = train_word_piece(train_en, EN_VOCAB_SIZE, reserved_tokens, "tokenizers/en_europarl_vocab")
cs_vocab = train_word_piece(train_cs, CS_VOCAB_SIZE, reserved_tokens, "tokenizers/cs_europarl_vocab")
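To verify the saved vocabularies, the tokenizers can be rebuilt straight from the output files and round-tripped on a sample line; a small sketch, assuming the two vocab files above were written:

import keras_nlp

en_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary="tokenizers/en_europarl_vocab",   # the file written by train_word_piece
    lowercase=False,
)
token_ids = en_tokenizer("resumption of the session")   # sample line, lowercased
print(token_ids)
print(en_tokenizer.detokenize(token_ids))                # should reproduce the input (up to whitespace)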