import keras_nlp
import keras
import tensorflow.data as tf_data
import pickle
import random
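
# Target vocabulary sizes for the English and Czech WordPiece vocabularies.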
EN_VOCAB_SIZE = 30000
CS_VOCAB_SIZE = 30000

def train_word_piece(text_samples, vocab_size, reserved_tokens, save_output_path):
    # Build a WordPiece vocabulary from the raw text samples and write it to disk.
    word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)
    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(1000).prefetch(2),
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
        vocabulary_output_file=save_output_path,
    )
    return vocab

def read_files(path):
    # Read a newline-separated file and return its lines as a list of strings.
    with open(path, "r", encoding="utf-8") as f:
        dataset_split = f.read().split("\n")[:-1]
    # Lowercase every line before training the tokenizer.
    dataset_split = [line.lower() for line in dataset_split]
    return dataset_split

# OPUS cs-en (alternative dataset, currently unused)
# train_cs = read_files('datasets/cs-en/opus.cs-en-train.cs')
# train_en = read_files('datasets/cs-en/opus.cs-en-train.en')

# EUROPARL cs-en
train_cs = read_files('datasets/europarl/train-cs-en.cs')
train_en = read_files('datasets/europarl/train-cs-en.en')
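
# Sanity check: print the first sentence of each language.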
print(train_cs[0])
print(train_en[0])
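
# Special tokens reserved at the start of both vocabularies.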
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
en_vocab = train_word_piece(train_en, EN_VOCAB_SIZE, reserved_tokens, "tokenizers/en_europarl_vocab")
cs_vocab = train_word_piece(train_cs, CS_VOCAB_SIZE, reserved_tokens, "tokenizers/cs_europarl_vocab")
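
# Illustrative sketch, not part of the original script: the vocabulary files
# written above can be loaded back into WordPiece tokenizers for downstream
# preprocessing. `lowercase=True` mirrors the lowercasing in read_files and is
# an assumption here.
en_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary="tokenizers/en_europarl_vocab", lowercase=True
)
cs_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary="tokenizers/cs_europarl_vocab", lowercase=True
)
print(en_tokenizer.tokenize(train_en[0]))
print(cs_tokenizer.tokenize(train_cs[0]))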