import pickle
import random

import keras
import keras_nlp
import tensorflow.data as tf_data


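# Target WordPiece vocabulary sizes for English and Czech.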
EN_VOCAB_SIZE = 30000
CS_VOCAB_SIZE = 30000




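# Build a WordPiece vocabulary from `text_samples` and write it to `save_output_path`.
# Note: depending on the keras_nlp version, compute_word_piece_vocabulary may return
# None when vocabulary_output_file is set; the vocabulary file is written either way.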
def train_word_piece(text_samples, vocab_size, reserved_tokens, save_output_path):
    # Stream the sentences in batches so vocabulary computation does not need
    # the whole corpus in memory at once.
    word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)
    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(1000).prefetch(2),
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
        vocabulary_output_file=save_output_path,
    )
    return vocab

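# Read a newline-separated corpus file into a list of sentences.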
def read_files(path):
    with open(path, "r", encoding="utf-8") as f:
        # Drop the trailing empty string left by the file's final newline.
        dataset_split = f.read().split("\n")[:-1]
    # Lowercase every line so the vocabulary only has to cover one casing.
    dataset_split = [line.lower() for line in dataset_split]
    return dataset_split

# OPUS cs-en (alternative corpus)
# train_cs = read_files('datasets/cs-en/opus.cs-en-train.cs')
# train_en = read_files('datasets/cs-en/opus.cs-en-train.en')


# Europarl cs-en
train_cs = read_files('datasets/europarl/train-cs-en.cs')
train_en = read_files('datasets/europarl/train-cs-en.en')



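# Sanity check: print the first parallel sentence pair.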
print(train_cs[0])
print(train_en[0])


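# Special tokens reserved at the start of both vocabularies: padding,
# out-of-vocabulary, and sequence start/end markers.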
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

en_vocab = train_word_piece(train_en, EN_VOCAB_SIZE, reserved_tokens, "tokenizers/en_europarl_vocab")
cs_vocab = train_word_piece(train_cs, CS_VOCAB_SIZE, reserved_tokens, "tokenizers/cs_europarl_vocab")
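
# Sketch (not part of the original training script): the saved vocabulary files
# can be loaded back into WordPieceTokenizer instances for preprocessing.
# lowercase=False because the corpus is already lowercased in read_files.
en_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary="tokenizers/en_europarl_vocab",
    lowercase=False,
)
cs_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary="tokenizers/cs_europarl_vocab",
    lowercase=False,
)

# Tokenize the first sentence pair as a quick round-trip check.
print(en_tokenizer(train_en[0]))
print(cs_tokenizer(train_cs[0]))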