jkot committed on
Commit 75a2cec · 1 Parent(s): c72597f

Upload 5 files

Files changed (5)
  1. create_dataset_splits.py +59 -0
  2. eval.py +193 -0
  3. preprocess_dataset.py +103 -0
  4. train.py +86 -0
  5. train_tokenizers.py +52 -0
create_dataset_splits.py ADDED
@@ -0,0 +1,59 @@
+ import keras_nlp
+ import keras
+ import tensorflow.data as tf_data
+ import pickle
+ import random
+
+
+
+
+ def read_files(path):
+     with open(path, "r", encoding="utf-8") as f:
+         dataset_split = f.read().split("\n")[:-1]
+     # normalize to lowercase
+     dataset_split = [line.lower() for line in dataset_split]
+     return dataset_split
+
+ def save_list_to_file(file_path, string_list):
+     with open(file_path, 'w') as file:
+         file.writelines([f"{string}\n" for string in string_list])
+
+
+ # EUROPARL cs-en
+ # load files
+ cs_file = 'datasets/europarl/europarl-v7.cs-en.cs'
+ en_file = 'datasets/europarl/europarl-v7.cs-en.en'
+ sentences_cs = read_files(cs_file)
+ sentences_en = read_files(en_file)
+
+ # create (en, cs) pairs and split into train, valid and test sets
+ pairs = list(zip(sentences_en, sentences_cs))
+ random.shuffle(pairs)
+ num_val_samples = int(0.15 * len(pairs))
+ num_train_samples = len(pairs) - 2 * num_val_samples
+
+ train_pairs = pairs[:num_train_samples]
+ valid_pairs = pairs[num_train_samples : num_train_samples + num_val_samples]
+ test_pairs = pairs[num_train_samples + num_val_samples :]
+
+ print(train_pairs[0])
+ print(valid_pairs[0])
+ print(test_pairs[0])
+
+
+ en_train_samples = [pair[0] for pair in train_pairs]
+ cs_train_samples = [pair[1] for pair in train_pairs]
+ en_valid_samples = [pair[0] for pair in valid_pairs]
+ cs_valid_samples = [pair[1] for pair in valid_pairs]
+ en_test_samples = [pair[0] for pair in test_pairs]
+ cs_test_samples = [pair[1] for pair in test_pairs]
+
+
+ save_list_to_file("datasets/europarl/train-cs-en.en", en_train_samples)
+ save_list_to_file("datasets/europarl/train-cs-en.cs", cs_train_samples)
+ save_list_to_file("datasets/europarl/valid-cs-en.en", en_valid_samples)
+ save_list_to_file("datasets/europarl/valid-cs-en.cs", cs_valid_samples)
+ save_list_to_file("datasets/europarl/test-cs-en.en", en_test_samples)
+ save_list_to_file("datasets/europarl/test-cs-en.cs", cs_test_samples)
+
+
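A minimal sanity check for the splits above (not part of this commit; it only assumes the six files written by create_dataset_splits.py exist at the paths used there): the English and Czech sides of each split should have the same number of lines.

def count_lines(path):
    # count lines the same way read_files does (the trailing newline yields one empty tail entry that is dropped)
    with open(path, "r", encoding="utf-8") as f:
        return len(f.read().split("\n")[:-1])

for split in ("train", "valid", "test"):
    en_lines = count_lines(f"datasets/europarl/{split}-cs-en.en")
    cs_lines = count_lines(f"datasets/europarl/{split}-cs-en.cs")
    assert en_lines == cs_lines, f"{split}: {en_lines} en vs {cs_lines} cs lines"
    print(split, en_lines)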
eval.py ADDED
@@ -0,0 +1,193 @@
+
+ import keras_nlp
+ import keras
+ import tensorflow.data as tf_data
+ import pickle
+ import tensorflow as tf
+ from tensorflow.keras.optimizers import Adam
+ from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
+ import datetime
+ import random
+ import re
+ from sacrebleu.metrics import CHRF
+ import time
+ # from keras import ops
+ # hyperparameters
+ MAX_SEQUENCE_LENGTH = 64
+
+ transformer = keras.models.load_model('models_europarl/en_cs_translator_saved_20231209_0046.keras')
+ def read_files(path, lowercase=False):
+     with open(path, "r", encoding="utf-8") as f:
+         dataset_split = f.read().split("\n")[:-1]
+     # optionally normalize to lowercase
+     if lowercase:
+         dataset_split = [line.lower() for line in dataset_split]
+     return dataset_split
+
+ en_vocab = read_files("tokenizers/en_europarl_vocab")
+ cs_vocab = read_files("tokenizers/cs_europarl_vocab")
+ en_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
+     vocabulary=en_vocab,
+     lowercase=False
+ )
+ cs_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
+     vocabulary=cs_vocab,
+     lowercase=False
+ )
+
+ def compute_probabilities(logits):
+     return keras.activations.softmax(logits)
+
+ def next_token_logits(encoder_input_tokens, prompt, predicted_token_index):
+     logits = transformer(
+         [tf.expand_dims(encoder_input_tokens, axis=0), tf.expand_dims(prompt, axis=0)]
+     )[:, predicted_token_index-1, :]  # we need the prediction for the next token, which sits at the index of the last generated token
+     return logits
+
+
+ def greedy_decode(encoder_input_tokens, prompt, end_token_id):
+
+     start_index = 1
+     current_prompt = prompt
+     for predicted_token_index in range(start_index, MAX_SEQUENCE_LENGTH):
+         next_logits = next_token_logits(encoder_input_tokens, current_prompt, predicted_token_index)
+         next_probabilities = compute_probabilities(next_logits)
+         max_probability_token_id = tf.argmax(next_probabilities, axis=-1)  # the index in the logits array equals the token id
+         indices = tf.constant([[predicted_token_index]])
+         data = tf.constant([max_probability_token_id.numpy()[0]])
+         current_prompt = tf.tensor_scatter_nd_update(current_prompt, indices, data)
+         # the end token was generated
+         if max_probability_token_id == end_token_id:
+             break
+     return current_prompt
+
+
+
+ def beam_decode(encoder_input_tokens, prompt, end_token_id, beam_size):
+     start_index = 1
+     # initial beam
+     next_logits = next_token_logits(encoder_input_tokens, prompt, start_index)
+     next_probabilities = compute_probabilities(next_logits)
+     top_k_probabilities, top_k_token_indices = tf.math.top_k(next_probabilities, k=beam_size)
+     current_subsequencies = []
+     for index, value in enumerate(top_k_token_indices.numpy()[0]):
+         # add beam_size versions of the prompt, each with one of the top-k tokens at index 1
+         indices = tf.constant([[start_index]])
+         data = tf.constant([value])
+         current_prompt = tf.tensor_scatter_nd_update(prompt, indices, data)
+         # store the candidate with its log probability and its length-normalized log probability (length is 1 here, so they are equal)
+         log_prob = tf.math.log(top_k_probabilities.numpy()[0][index])
+         current_subsequencies.append((current_prompt, log_prob, log_prob))
+
+     final_potential_solutions = []
+     for predicted_token_index in range(start_index+1, MAX_SEQUENCE_LENGTH):
+         # stop once beam_size hypotheses have generated the end token
+         if len(final_potential_solutions) == beam_size:
+             break
+
+         tmp_subsequencies = []
+         for index, (subseq_prompt, subseq_log_probability, _) in enumerate(current_subsequencies):
+             next_logits = next_token_logits(encoder_input_tokens, subseq_prompt, predicted_token_index)
+             next_probabilities = compute_probabilities(next_logits)
+             top_k_probabilities, top_k_token_indices = tf.math.top_k(next_probabilities, k=beam_size-len(final_potential_solutions))
+             for index, value in enumerate(top_k_token_indices.numpy()[0]):
+                 # expand the candidate with each of its top-k continuations at the current index
+                 indices = tf.constant([[predicted_token_index]])
+                 data = tf.constant([value])
+                 updated_subseq_prompt = tf.tensor_scatter_nd_update(subseq_prompt, indices, data)
+                 # store the candidate with its accumulated and length-normalized log probabilities
+                 nextLogProbability = tf.math.log(top_k_probabilities.numpy()[0][index])
+                 tmp_subsequencies.append((updated_subseq_prompt, subseq_log_probability + nextLogProbability, (subseq_log_probability + nextLogProbability)/(predicted_token_index+1)))
+
+         current_subsequencies = []
+         current_sequences_to_find = beam_size - len(final_potential_solutions)
+         tmp_subsequencies = sorted(tmp_subsequencies, key=lambda x: x[2], reverse=True)
+         for i in range(current_sequences_to_find):
+             if tmp_subsequencies[i][0][predicted_token_index] == end_token_id:
+                 final_potential_solutions.append(tmp_subsequencies[i])
+             else:
+                 current_subsequencies.append(tmp_subsequencies[i])
+
+     # pick the best finished hypothesis
+     final_potential_solutions = sorted(final_potential_solutions, key=lambda x: x[2], reverse=True)
+
+     if len(final_potential_solutions) > 0:
+         return final_potential_solutions[0][0]
+     # no hypothesis reached the end token; fall back to the best unfinished one
+     else:
+         sorted_subs = sorted(current_subsequencies, key=lambda x: x[2], reverse=True)
+         return sorted_subs[0][0]
+
+
+ def decode_sequences(input_sentence):
+
+     # Tokenize the encoder input.
+     encoder_input_tokens = en_tokenizer(input_sentence)
+     # encoder_input_tokens = tf.expand_dims(encoder_input_tokens, axis=0)
+     if len(encoder_input_tokens) < MAX_SEQUENCE_LENGTH:
+         pads = tf.fill((MAX_SEQUENCE_LENGTH - len(encoder_input_tokens)), 0)
+         encoder_input_tokens = tf.concat([encoder_input_tokens, pads], 0)
+     if len(encoder_input_tokens) > MAX_SEQUENCE_LENGTH:
+         tensor_content = "[START] Exceeded. [END] [PAD] [PAD] [PAD] [PAD]"
+         tensor = tf.constant([tensor_content], dtype=tf.string)
+         return tensor
+
+     start = tf.fill((1), cs_tokenizer.token_to_id("[START]"))
+     pads = tf.fill((MAX_SEQUENCE_LENGTH - 1), cs_tokenizer.token_to_id("[PAD]"))
+     prompt = tf.concat((start, pads), axis=-1)
+
+     end_token_id = cs_tokenizer.token_to_id("[END]")
+
+     generated_tokens = greedy_decode(encoder_input_tokens, prompt, end_token_id)
+     # generated_tokens = beam_decode(encoder_input_tokens, prompt, end_token_id, 5)
+
+     generated_sentences = cs_tokenizer.detokenize(tf.expand_dims(generated_tokens, axis=0))
+     return generated_sentences
+
+
+ test_en = read_files('datasets/europarl/test-cs-en.en')
+ test_cs = read_files('datasets/europarl/test-cs-en.cs')
+ bleu_metrics = keras_nlp.metrics.Bleu(
+     name="bleu",
+     tokenizer=cs_tokenizer
+ )
+
+ eval_samples = 100
+ chrf = CHRF()
+ refs = test_cs[:eval_samples]
+ translations = []
+ start_time = time.time()
+
+ for i in range(len(refs)):
+
+     cs_translated = decode_sequences(test_en[i])
+     cs_translated = cs_translated.numpy()[0].decode("utf-8")
+     cs_translated = (
+         cs_translated.replace("[PAD]", "")
+         .replace("[START]", "")
+         .replace("[END]", "")
+         .strip()
+     )
+     # remove spaces before punctuation
+     cs_translated = re.sub(r'\s+([.,;!?:])', r'\1', cs_translated)
+     print(cs_translated, flush=True)
+     translations.append(cs_translated)
+
+ end_time = time.time()
+
+
+
+
+ refs_twodim = [[ref] for ref in refs]
+ bleu_metrics(refs_twodim, translations)
+
+ print("evaluating chrf", flush=True)
+ chrf2_result = chrf.corpus_score(translations, refs_twodim)
+
+ print("chrf2")
+ print(chrf2_result)
+ print("bleu")
+ print(bleu_metrics.result().numpy())
+ print("elapsed time")
+ elapsed_time = end_time - start_time
+ print(elapsed_time)
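A short usage sketch for decode_sequences above (not part of this commit; it assumes the model and vocabulary files referenced in eval.py are available, and the input sentence is only an illustration): translate one lowercased English sentence and strip the special tokens, mirroring the cleanup done in the evaluation loop.

import re

sentence = "the committee approved the proposal."  # hypothetical input, lowercased like the training data
translated = decode_sequences(sentence)            # tf.string tensor of shape (1,)
translated = translated.numpy()[0].decode("utf-8")
translated = translated.replace("[PAD]", "").replace("[START]", "").replace("[END]", "").strip()
translated = re.sub(r"\s+([.,;!?:])", r"\1", translated)  # remove spaces before punctuation
print(translated)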
preprocess_dataset.py ADDED
@@ -0,0 +1,103 @@
+
+ import keras_nlp
+ import keras
+ import tensorflow.data as tf_data
+ import pickle
+ # hyperparameters
+ BATCH_SIZE = 16
+ MAX_SEQUENCE_LENGTH = 64
+
+ # load the WordPiece vocabulary files into lists
+
+ def read_files(path, lowercase=False):
+     with open(path, "r", encoding="utf-8") as f:
+         dataset_split = f.read().split("\n")[:-1]
+     # optionally normalize to lowercase
+     if lowercase:
+         dataset_split = [line.lower() for line in dataset_split]
+     return dataset_split
+
+ # en_vocab = read_files("tokenizers/en_opus_vocab")
+ # cs_vocab = read_files("tokenizers/cs_opus_vocab")
+ en_vocab = read_files("tokenizers/en_europarl_vocab")
+ cs_vocab = read_files("tokenizers/cs_europarl_vocab")
+
+ en_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
+     vocabulary=en_vocab,
+     lowercase=False
+ )
+ cs_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
+     vocabulary=cs_vocab,
+     lowercase=False
+ )
+
+
+ # OPUS cs-en
+ # train_cs_file = 'datasets/cs-en/opus.cs-en-train.cs'
+ # train_en_file = 'datasets/cs-en/opus.cs-en-train.en'
+ # valid_cs_file = 'datasets/cs-en/opus.cs-en-dev.cs'
+ # valid_en_file = 'datasets/cs-en/opus.cs-en-dev.en'
+ # test_cs_file = 'datasets/cs-en/opus.cs-en-test.cs'
+ # test_en_file = 'datasets/cs-en/opus.cs-en-test.en'
+
+ # EUROPARL cs-en
+ train_cs_file = 'datasets/europarl/train-cs-en.cs'
+ train_en_file = 'datasets/europarl/train-cs-en.en'
+ valid_cs_file = 'datasets/europarl/valid-cs-en.cs'
+ valid_en_file = 'datasets/europarl/valid-cs-en.en'
+ test_cs_file = 'datasets/europarl/test-cs-en.cs'
+ test_en_file = 'datasets/europarl/test-cs-en.en'
+
+
+ train_cs = read_files(train_cs_file, True)
+ train_en = read_files(train_en_file, True)
+ valid_cs = read_files(valid_cs_file, True)
+ valid_en = read_files(valid_en_file, True)
+ test_cs = read_files(test_cs_file, True)
+ test_en = read_files(test_en_file, True)
+
+ def preprocess_batch(en, cs):
+     en = en_tokenizer(en)
+     cs = cs_tokenizer(cs)
+
+     # Pad `en` to `MAX_SEQUENCE_LENGTH`.
+     en_start_end_packer = keras_nlp.layers.StartEndPacker(
+         sequence_length=MAX_SEQUENCE_LENGTH,
+         pad_value=en_tokenizer.token_to_id("[PAD]"),
+     )
+     en = en_start_end_packer(en)
+
+     # Add special tokens (`"[START]"` and `"[END]"`) to `cs` and pad it as well.
+     cs_start_end_packer = keras_nlp.layers.StartEndPacker(
+         sequence_length=MAX_SEQUENCE_LENGTH + 1,
+         start_value=cs_tokenizer.token_to_id("[START]"),
+         end_value=cs_tokenizer.token_to_id("[END]"),
+         pad_value=cs_tokenizer.token_to_id("[PAD]"),
+     )
+     cs = cs_start_end_packer(cs)
+
+     return (
+         {
+             "encoder_inputs": en,
+             "decoder_inputs": cs[:, :-1],
+         },
+         cs[:, 1:],
+     )
+
+
+ def make_dataset(en_texts, cs_texts):
+     dataset = tf_data.Dataset.from_tensor_slices((en_texts, cs_texts))
+     dataset = dataset.batch(BATCH_SIZE)
+     dataset = dataset.map(preprocess_batch, num_parallel_calls=tf_data.AUTOTUNE)
+     return dataset.shuffle(2048).prefetch(16).cache()
+
+
+ train_ds = make_dataset(train_en, train_cs)
+ val_ds = make_dataset(valid_en, valid_cs)
+
+ tf_data.Dataset.save(train_ds, "datasets/preprocessed_europarl_train")
+ tf_data.Dataset.save(val_ds, "datasets/preprocessed_europarl_valid")
+
+
+
+
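A quick shape check for the saved datasets (not part of this commit; it assumes the two dataset directories written above exist): with BATCH_SIZE = 16 and MAX_SEQUENCE_LENGTH = 64, each batch should contain encoder inputs, decoder inputs, and labels of length 64, the labels being the decoder tokens shifted one position to the left.

import tensorflow.data as tf_data

train_ds = tf_data.Dataset.load("datasets/preprocessed_europarl_train")
inputs, labels = next(iter(train_ds.take(1)))
print(inputs["encoder_inputs"].shape)  # (16, 64)
print(inputs["decoder_inputs"].shape)  # (16, 64)
print(labels.shape)                    # (16, 64)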
train.py ADDED
@@ -0,0 +1,86 @@
+
+ import keras_nlp
+ import keras
+ import tensorflow.data as tf_data
+ import pickle
+ from tensorflow.keras.optimizers import Adam
+ from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
+ import datetime
+
+ BATCH_SIZE = 16
+ LEARNING_RATE = 1e-4
+ EPOCHS = 20
+ EMBED_DIM = 256
+ INTERMEDIATE_DIM = 2048
+ NUM_HEADS = 8
+ # TODO probably change dynamically
+ MAX_SEQUENCE_LENGTH = 128
+ EN_VOCAB_SIZE = 30000
+ CS_VOCAB_SIZE = 30000
+
+ train_ds = tf_data.Dataset.load("datasets/preprocessed_europarl_train")
+ valid_ds = tf_data.Dataset.load("datasets/preprocessed_europarl_valid")
+
+ # Encoder
+ encoder_inputs = keras.Input(shape=(None,), name="encoder_inputs")
+
+ x = keras_nlp.layers.TokenAndPositionEmbedding(
+     vocabulary_size=EN_VOCAB_SIZE,
+     sequence_length=MAX_SEQUENCE_LENGTH,
+     embedding_dim=EMBED_DIM,
+ )(encoder_inputs)
+
+ encoder_outputs = keras_nlp.layers.TransformerEncoder(
+     intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
+ )(inputs=x)
+ encoder = keras.Model(encoder_inputs, encoder_outputs)
+
+
+ # Decoder
+ decoder_inputs = keras.Input(shape=(None,), name="decoder_inputs")
+ encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")
+
+ x = keras_nlp.layers.TokenAndPositionEmbedding(
+     vocabulary_size=CS_VOCAB_SIZE,
+     sequence_length=MAX_SEQUENCE_LENGTH,
+     embedding_dim=EMBED_DIM,
+ )(decoder_inputs)
+
+ x = keras_nlp.layers.TransformerDecoder(
+     intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
+ )(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)
+ x = keras.layers.Dropout(0.5)(x)
+ decoder_outputs = keras.layers.Dense(CS_VOCAB_SIZE, activation="softmax")(x)
+ decoder = keras.Model(
+     [
+         decoder_inputs,
+         encoded_seq_inputs,
+     ],
+     decoder_outputs,
+ )
+ decoder_outputs = decoder([decoder_inputs, encoder_outputs])
+
+ transformer = keras.Model(
+     [encoder_inputs, decoder_inputs],
+     decoder_outputs,
+     name="transformer",
+ )
+
+ transformer.summary()
+
+ optimizer = Adam(learning_rate=LEARNING_RATE)
+ transformer.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"])
+
+ # Callbacks
+ early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
+ model_checkpoint = ModelCheckpoint(f'models_europarl/en_cs_translator_checkpoint_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.keras', save_best_only=True)
+
+ transformer.fit(
+     train_ds,
+     epochs=EPOCHS,
+     validation_data=valid_ds,
+     batch_size=BATCH_SIZE,
+     callbacks=[early_stopping, model_checkpoint]
+ )
+ transformer.save(f'models_europarl/en_cs_translator_saved_{datetime.datetime.now().strftime("%Y%m%d_%H%M")}.keras')
+
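A minimal smoke test for a saved model (not part of this commit; the file name is an example of the timestamped pattern produced by train.py and is the checkpoint eval.py loads): run one forward pass on dummy token ids and confirm the output is a per-position distribution over the Czech vocabulary.

import keras
import tensorflow as tf

model = keras.models.load_model("models_europarl/en_cs_translator_saved_20231209_0046.keras")

encoder_tokens = tf.zeros((1, 64), dtype=tf.int32)  # dummy English token ids
decoder_tokens = tf.zeros((1, 64), dtype=tf.int32)  # dummy Czech token ids
probs = model([encoder_tokens, decoder_tokens])
print(probs.shape)  # (1, 64, 30000) -- softmax over CS_VOCAB_SIZE at each position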
train_tokenizers.py ADDED
@@ -0,0 +1,52 @@
+ import keras_nlp
+ import keras
+ import tensorflow.data as tf_data
+ import pickle
+ import random
+
+
+ EN_VOCAB_SIZE = 30000
+ CS_VOCAB_SIZE = 30000
+
+
+
+
+ def train_word_piece(text_samples, vocab_size, reserved_tokens, save_output_path):
+     word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)
+     vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
+         word_piece_ds.batch(1000).prefetch(2),
+         vocabulary_size=vocab_size,
+         reserved_tokens=reserved_tokens,
+         vocabulary_output_file=save_output_path
+     )
+     return vocab
+
+ def read_files(path):
+     with open(path, "r", encoding="utf-8") as f:
+         dataset_split = f.read().split("\n")[:-1]
+     # normalize to lowercase
+     dataset_split = [line.lower() for line in dataset_split]
+     return dataset_split
+
+ # OPUS cs-en
+ # train_cs = read_files('datasets/cs-en/opus.cs-en-train.cs')
+ # train_en = read_files('datasets/cs-en/opus.cs-en-train.en')
+
+
+ # EUROPARL cs-en
+ train_cs = read_files('datasets/europarl/train-cs-en.cs')
+ train_en = read_files('datasets/europarl/train-cs-en.en')
+
+
+
+ print(train_cs[0])
+ print(train_en[0])
+
+
+ reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
+
+ en_vocab = train_word_piece(train_en, EN_VOCAB_SIZE, reserved_tokens, "tokenizers/en_europarl_vocab")
+ cs_vocab = train_word_piece(train_cs, CS_VOCAB_SIZE, reserved_tokens, "tokenizers/cs_europarl_vocab")
+
+
+
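A small round-trip check for a trained vocabulary (not part of this commit; it assumes the vocabulary file written above exists, and the sample sentence is only an illustration): load the Czech vocabulary into a WordPieceTokenizer, as eval.py and preprocess_dataset.py do, then tokenize and detokenize one lowercased line.

import keras_nlp

with open("tokenizers/cs_europarl_vocab", "r", encoding="utf-8") as f:
    cs_vocab = f.read().split("\n")[:-1]

cs_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=cs_vocab, lowercase=False)

sample = "dobrý den, dámy a pánové"  # hypothetical lowercased Czech line
token_ids = cs_tokenizer(sample)
print(token_ids)
print(cs_tokenizer.detokenize(token_ids))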