Upload 5 files
- create_dataset_splits.py +59 -0
- eval.py +193 -0
- preprocess_dataset.py +103 -0
- train.py +86 -0
- train_tokenizers.py +52 -0
create_dataset_splits.py
ADDED
@@ -0,0 +1,59 @@
import random


def read_files(path):
    with open(path, "r", encoding="utf-8") as f:
        dataset_split = f.read().split("\n")[:-1]
    # normalize to lowercase (the corpus is kept lowercased throughout the pipeline)
    dataset_split = [line.lower() for line in dataset_split]
    return dataset_split


def save_list_to_file(file_path, string_list):
    with open(file_path, "w", encoding="utf-8") as file:
        file.writelines([f"{string}\n" for string in string_list])


# EUROPARL cs-en
# load the parallel corpus files
cs_file = 'datasets/europarl/europarl-v7.cs-en.cs'
en_file = 'datasets/europarl/europarl-v7.cs-en.en'
sentences_cs = read_files(cs_file)
sentences_en = read_files(en_file)

# create sentence pairs and split into train, valid and test (70 / 15 / 15)
pairs = list(zip(sentences_en, sentences_cs))
random.shuffle(pairs)
num_val_samples = int(0.15 * len(pairs))
num_train_samples = len(pairs) - 2 * num_val_samples

train_pairs = pairs[:num_train_samples]
valid_pairs = pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = pairs[num_train_samples + num_val_samples :]

print(train_pairs[0])
print(valid_pairs[0])
print(test_pairs[0])

en_train_samples = [pair[0] for pair in train_pairs]
cs_train_samples = [pair[1] for pair in train_pairs]
en_valid_samples = [pair[0] for pair in valid_pairs]
cs_valid_samples = [pair[1] for pair in valid_pairs]
en_test_samples = [pair[0] for pair in test_pairs]
cs_test_samples = [pair[1] for pair in test_pairs]

save_list_to_file("datasets/europarl/train-cs-en.en", en_train_samples)
save_list_to_file("datasets/europarl/train-cs-en.cs", cs_train_samples)
save_list_to_file("datasets/europarl/valid-cs-en.en", en_valid_samples)
save_list_to_file("datasets/europarl/valid-cs-en.cs", cs_valid_samples)
save_list_to_file("datasets/europarl/test-cs-en.en", en_test_samples)
save_list_to_file("datasets/europarl/test-cs-en.cs", cs_test_samples)
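Note: the split above is re-randomized on every run. A minimal, optional tweak (not part of the uploaded script) to make the generated train/valid/test files reproducible is to seed the RNG before shuffling:

# Hypothetical addition near the top of create_dataset_splits.py:
# a fixed seed makes random.shuffle(pairs) deterministic across runs
import random

random.seed(42)  # any fixed value works; 42 is arbitrary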
eval.py
ADDED
@@ -0,0 +1,193 @@
import keras_nlp
import keras
import tensorflow as tf
import re
import time
from sacrebleu.metrics import CHRF

# hyperparameters
MAX_SEQUENCE_LENGTH = 64

transformer = keras.models.load_model('models_europarl/en_cs_translator_saved_20231209_0046.keras')


def read_files(path, lowercase=False):
    with open(path, "r", encoding="utf-8") as f:
        dataset_split = f.read().split("\n")[:-1]
    # optionally lowercase (the training data was lowercased)
    if lowercase:
        dataset_split = [line.lower() for line in dataset_split]
    return dataset_split


en_vocab = read_files("tokenizers/en_europarl_vocab")
cs_vocab = read_files("tokenizers/cs_europarl_vocab")
en_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=en_vocab,
    lowercase=False
)
cs_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=cs_vocab,
    lowercase=False
)


def compute_probabilities(logits):
    return keras.activations.softmax(logits)


def next_token_logits(encoder_input_tokens, prompt, predicted_token_index):
    # the prediction for the next token sits at the index of the last generated token
    logits = transformer(
        [tf.expand_dims(encoder_input_tokens, axis=0), tf.expand_dims(prompt, axis=0)]
    )[:, predicted_token_index - 1, :]
    return logits


def greedy_decode(encoder_input_tokens, prompt, end_token_id):
    start_index = 1
    current_prompt = prompt
    for predicted_token_index in range(start_index, MAX_SEQUENCE_LENGTH):
        next_logits = next_token_logits(encoder_input_tokens, current_prompt, predicted_token_index)
        next_probabilities = compute_probabilities(next_logits)
        max_probability_token_id = tf.argmax(next_probabilities, axis=-1)  # index in the logits array equals the token id
        indices = tf.constant([[predicted_token_index]])
        data = tf.constant([max_probability_token_id.numpy()[0]])
        current_prompt = tf.tensor_scatter_nd_update(current_prompt, indices, data)
        # stop once the end token is generated
        if max_probability_token_id == end_token_id:
            break
    return current_prompt


def beam_decode(encoder_input_tokens, prompt, end_token_id, beam_size):
    start_index = 1
    # initial beam: top-k candidates for the first generated token
    next_logits = next_token_logits(encoder_input_tokens, prompt, start_index)
    next_probabilities = compute_probabilities(next_logits)
    top_k_probabilities, top_k_token_indices = tf.math.top_k(next_probabilities, k=beam_size)
    current_subsequencies = []
    for index, value in enumerate(top_k_token_indices.numpy()[0]):
        # keep beam_size versions of the prompt with the top-k tokens at index 1
        indices = tf.constant([[start_index]])
        data = tf.constant([value])
        current_prompt = tf.tensor_scatter_nd_update(prompt, indices, data)
        # store (prompt, log probability, length-normalized log probability); length is 1 here, so both are equal
        log_prob = tf.math.log(top_k_probabilities.numpy()[0][index])
        current_subsequencies.append((current_prompt, log_prob, log_prob))

    final_potential_solutions = []
    for predicted_token_index in range(start_index + 1, MAX_SEQUENCE_LENGTH):
        # stop once beam_size hypotheses have generated the end token
        if len(final_potential_solutions) == beam_size:
            break

        tmp_subsequencies = []
        for subseq_prompt, subseq_log_probability, _ in current_subsequencies:
            next_logits = next_token_logits(encoder_input_tokens, subseq_prompt, predicted_token_index)
            next_probabilities = compute_probabilities(next_logits)
            top_k_probabilities, top_k_token_indices = tf.math.top_k(next_probabilities, k=beam_size - len(final_potential_solutions))
            for index, value in enumerate(top_k_token_indices.numpy()[0]):
                # extend this hypothesis with each of its top-k next tokens
                indices = tf.constant([[predicted_token_index]])
                data = tf.constant([value])
                updated_subseq_prompt = tf.tensor_scatter_nd_update(subseq_prompt, indices, data)
                # keep the accumulated log probability and its length-normalized variant
                next_log_probability = tf.math.log(top_k_probabilities.numpy()[0][index])
                tmp_subsequencies.append((
                    updated_subseq_prompt,
                    subseq_log_probability + next_log_probability,
                    (subseq_log_probability + next_log_probability) / (predicted_token_index + 1),
                ))

        current_subsequencies = []
        current_sequences_to_find = beam_size - len(final_potential_solutions)
        tmp_subsequencies = sorted(tmp_subsequencies, key=lambda x: x[2], reverse=True)
        for i in range(current_sequences_to_find):
            if tmp_subsequencies[i][0][predicted_token_index] == end_token_id:
                final_potential_solutions.append(tmp_subsequencies[i])
            else:
                current_subsequencies.append(tmp_subsequencies[i])

    # pick the best finished hypothesis by normalized log probability
    final_potential_solutions = sorted(final_potential_solutions, key=lambda x: x[2], reverse=True)

    if len(final_potential_solutions) > 0:
        return final_potential_solutions[0][0]
    # no hypothesis reached the end token; fall back to the best unfinished one
    else:
        sorted_subs = sorted(current_subsequencies, key=lambda x: x[2], reverse=True)
        return sorted_subs[0][0]


def decode_sequences(input_sentence):
    # Tokenize the encoder input and pad it to MAX_SEQUENCE_LENGTH.
    encoder_input_tokens = en_tokenizer(input_sentence)
    if len(encoder_input_tokens) < MAX_SEQUENCE_LENGTH:
        pads = tf.fill((MAX_SEQUENCE_LENGTH - len(encoder_input_tokens),), 0)
        encoder_input_tokens = tf.concat([encoder_input_tokens, pads], 0)
    if len(encoder_input_tokens) > MAX_SEQUENCE_LENGTH:
        # input too long for the model; return a marker "translation"
        tensor_content = "[START] Exceeded. [END] [PAD] [PAD] [PAD] [PAD]"
        tensor = tf.constant([tensor_content], dtype=tf.string)
        return tensor

    # decoder prompt: [START] followed by padding
    start = tf.fill((1,), cs_tokenizer.token_to_id("[START]"))
    pads = tf.fill((MAX_SEQUENCE_LENGTH - 1,), cs_tokenizer.token_to_id("[PAD]"))
    prompt = tf.concat((start, pads), axis=-1)

    end_token_id = cs_tokenizer.token_to_id("[END]")

    generated_tokens = greedy_decode(encoder_input_tokens, prompt, end_token_id)
    # generated_tokens = beam_decode(encoder_input_tokens, prompt, end_token_id, 5)

    generated_sentences = cs_tokenizer.detokenize(tf.expand_dims(generated_tokens, axis=0))
    return generated_sentences


test_en = read_files('datasets/europarl/test-cs-en.en')
test_cs = read_files('datasets/europarl/test-cs-en.cs')
bleu_metrics = keras_nlp.metrics.Bleu(
    name="bleu",
    tokenizer=cs_tokenizer
)

eval_samples = 100
chrf = CHRF()
refs = test_cs[:eval_samples]
translations = []
start_time = time.time()

for i in range(len(refs)):
    cs_translated = decode_sequences(test_en[i])
    cs_translated = cs_translated.numpy()[0].decode("utf-8")
    cs_translated = (
        cs_translated.replace("[PAD]", "")
        .replace("[START]", "")
        .replace("[END]", "")
        .strip()
    )
    # remove spaces before punctuation
    cs_translated = re.sub(r'\s+([.,;!?:])', r'\1', cs_translated)
    print(cs_translated, flush=True)
    translations.append(cs_translated)

end_time = time.time()


# keras_nlp's Bleu expects one list of references per sample
refs_twodim = [[ref] for ref in refs]
bleu_metrics(refs_twodim, translations)

print("evaluating chrf", flush=True)
# sacrebleu expects a list of reference streams (one inner list per reference set)
chrf2_result = chrf.corpus_score(translations, [refs])

print("chrf2")
print(chrf2_result)
print("bleu")
print(bleu_metrics.result().numpy())
print("elapsed time")
elapsed_time = end_time - start_time
print(elapsed_time)
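A quick way to sanity-check the decoder outside the BLEU/chrF loop is to translate a single sentence. This is only a sketch and assumes the definitions above (transformer, the tokenizers, decode_sequences) are already loaded; the example sentence is made up:

sample_en = "i would like to thank the rapporteur."      # hypothetical input, lowercased like the training data
sample_cs = decode_sequences(sample_en).numpy()[0].decode("utf-8")
print(sample_cs)

# to try beam search instead of greedy decoding, swap the call inside decode_sequences:
# generated_tokens = beam_decode(encoder_input_tokens, prompt, end_token_id, 5)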
preprocess_dataset.py
ADDED
@@ -0,0 +1,103 @@
import keras_nlp
import tensorflow.data as tf_data

# hyperparameters
BATCH_SIZE = 16
MAX_SEQUENCE_LENGTH = 64


# load the WordPiece vocabularies into lists
def read_files(path, lowercase=False):
    with open(path, "r", encoding="utf-8") as f:
        dataset_split = f.read().split("\n")[:-1]
    # optionally lowercase (the corpus is kept lowercased throughout)
    if lowercase:
        dataset_split = [line.lower() for line in dataset_split]
    return dataset_split


# en_vocab = read_files("tokenizers/en_opus_vocab")
# cs_vocab = read_files("tokenizers/cs_opus_vocab")
en_vocab = read_files("tokenizers/en_europarl_vocab")
cs_vocab = read_files("tokenizers/cs_europarl_vocab")

en_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=en_vocab,
    lowercase=False
)
cs_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=cs_vocab,
    lowercase=False
)


# opus
# train_cs_file = 'datasets/cs-en/opus.cs-en-train.cs'
# train_en_file = 'datasets/cs-en/opus.cs-en-train.en'
# valid_cs_file = 'datasets/cs-en/opus.cs-en-dev.cs'
# valid_en_file = 'datasets/cs-en/opus.cs-en-dev.en'
# test_cs_file = 'datasets/cs-en/opus.cs-en-test.cs'
# test_en_file = 'datasets/cs-en/opus.cs-en-test.en'

# europarl
train_cs_file = 'datasets/europarl/train-cs-en.cs'
train_en_file = 'datasets/europarl/train-cs-en.en'
valid_cs_file = 'datasets/europarl/valid-cs-en.cs'
valid_en_file = 'datasets/europarl/valid-cs-en.en'
test_cs_file = 'datasets/europarl/test-cs-en.cs'
test_en_file = 'datasets/europarl/test-cs-en.en'


train_cs = read_files(train_cs_file, True)
train_en = read_files(train_en_file, True)
valid_cs = read_files(valid_cs_file, True)
valid_en = read_files(valid_en_file, True)
test_cs = read_files(test_cs_file, True)
test_en = read_files(test_en_file, True)


def preprocess_batch(en, cs):
    en = en_tokenizer(en)
    cs = cs_tokenizer(cs)

    # Pad `en` to MAX_SEQUENCE_LENGTH.
    en_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=en_tokenizer.token_to_id("[PAD]"),
    )
    en = en_start_end_packer(en)

    # Add special tokens ("[START]" and "[END]") to `cs` and pad it as well.
    cs_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH + 1,
        start_value=cs_tokenizer.token_to_id("[START]"),
        end_value=cs_tokenizer.token_to_id("[END]"),
        pad_value=cs_tokenizer.token_to_id("[PAD]"),
    )
    cs = cs_start_end_packer(cs)

    # decoder inputs are the target shifted right; labels are the target shifted left
    return (
        {
            "encoder_inputs": en,
            "decoder_inputs": cs[:, :-1],
        },
        cs[:, 1:],
    )


def make_dataset(en_texts, cs_texts):
    dataset = tf_data.Dataset.from_tensor_slices((en_texts, cs_texts))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf_data.AUTOTUNE)
    return dataset.shuffle(2048).prefetch(16).cache()


train_ds = make_dataset(train_en, train_cs)
val_ds = make_dataset(valid_en, valid_cs)

tf_data.Dataset.save(train_ds, "datasets/preprocessed_europarl_train")
tf_data.Dataset.save(val_ds, "datasets/preprocessed_europarl_valid")
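Since the decoder inputs and targets come from slicing the packed Czech sequence, it can be worth inspecting one batch before saving. A minimal sketch, assuming it is appended after make_dataset:

# Each batch should yield encoder inputs, shifted decoder inputs, and labels of length MAX_SEQUENCE_LENGTH
for inputs, targets in train_ds.take(1):
    print(inputs["encoder_inputs"].shape)   # (BATCH_SIZE, MAX_SEQUENCE_LENGTH)
    print(inputs["decoder_inputs"].shape)   # (BATCH_SIZE, MAX_SEQUENCE_LENGTH)
    print(targets.shape)                    # (BATCH_SIZE, MAX_SEQUENCE_LENGTH)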
train.py
ADDED
@@ -0,0 +1,86 @@
import keras_nlp
import keras
import tensorflow.data as tf_data
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import datetime

# hyperparameters
BATCH_SIZE = 16
LEARNING_RATE = 1e-4
EPOCHS = 20
EMBED_DIM = 256
INTERMEDIATE_DIM = 2048
NUM_HEADS = 8
# TODO probably change dynamically
MAX_SEQUENCE_LENGTH = 128
EN_VOCAB_SIZE = 30000
CS_VOCAB_SIZE = 30000

train_ds = tf_data.Dataset.load("datasets/preprocessed_europarl_train")
valid_ds = tf_data.Dataset.load("datasets/preprocessed_europarl_valid")

# Encoder
encoder_inputs = keras.Input(shape=(None,), name="encoder_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=EN_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(encoder_inputs)

encoder_outputs = keras_nlp.layers.TransformerEncoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(inputs=x)
encoder = keras.Model(encoder_inputs, encoder_outputs)


# Decoder
decoder_inputs = keras.Input(shape=(None,), name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=CS_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(decoder_inputs)

x = keras_nlp.layers.TransformerDecoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)
x = keras.layers.Dropout(0.5)(x)
decoder_outputs = keras.layers.Dense(CS_VOCAB_SIZE, activation="softmax")(x)
decoder = keras.Model(
    [
        decoder_inputs,
        encoded_seq_inputs,
    ],
    decoder_outputs,
)
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

# Full encoder-decoder model
transformer = keras.Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs,
    name="transformer",
)

transformer.summary()

optimizer = Adam(learning_rate=LEARNING_RATE)
transformer.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint(f'models_europarl/en_cs_translator_checkpoint_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.keras', save_best_only=True)

# The datasets were already batched in preprocess_dataset.py, so no batch_size is passed to fit().
transformer.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=valid_ds,
    callbacks=[early_stopping, model_checkpoint]
)
transformer.save(f'models_europarl/en_cs_translator_saved_{datetime.datetime.now().strftime("%Y%m%d_%H%M")}.keras')
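After training, the saved .keras file can be reloaded the same way eval.py does. A short sketch, with a placeholder timestamp in the filename (the real name is whatever transformer.save produced):

import keras
import keras_nlp  # importing keras_nlp registers its layers so load_model can deserialize them
import tensorflow.data as tf_data

model = keras.models.load_model("models_europarl/en_cs_translator_saved_YYYYMMDD_HHMM.keras")  # placeholder name
valid_ds = tf_data.Dataset.load("datasets/preprocessed_europarl_valid")
print(model.evaluate(valid_ds))  # [sparse categorical cross-entropy loss, accuracy]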
train_tokenizers.py
ADDED
@@ -0,0 +1,52 @@
import keras_nlp
import tensorflow.data as tf_data


EN_VOCAB_SIZE = 30000
CS_VOCAB_SIZE = 30000


def train_word_piece(text_samples, vocab_size, reserved_tokens, save_output_path):
    word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)
    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(1000).prefetch(2),
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
        vocabulary_output_file=save_output_path
    )
    return vocab


def read_files(path):
    with open(path, "r", encoding="utf-8") as f:
        dataset_split = f.read().split("\n")[:-1]
    # normalize to lowercase (the corpus is kept lowercased throughout)
    dataset_split = [line.lower() for line in dataset_split]
    return dataset_split


# OPUS cs-en
# train_cs = read_files('datasets/cs-en/opus.cs-en-train.cs')
# train_en = read_files('datasets/cs-en/opus.cs-en-train.en')


# EUROPARL cs-en
train_cs = read_files('datasets/europarl/train-cs-en.cs')
train_en = read_files('datasets/europarl/train-cs-en.en')


print(train_cs[0])
print(train_en[0])


reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

en_vocab = train_word_piece(train_en, EN_VOCAB_SIZE, reserved_tokens, "tokenizers/en_europarl_vocab")
cs_vocab = train_word_piece(train_cs, CS_VOCAB_SIZE, reserved_tokens, "tokenizers/cs_europarl_vocab")
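To verify the saved vocabularies, the tokenizers can be rebuilt straight from the output files and round-tripped on a sample line; a small sketch, assuming the two vocab files above were written:

import keras_nlp

en_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary="tokenizers/en_europarl_vocab",   # the file written by train_word_piece
    lowercase=False,
)
token_ids = en_tokenizer("resumption of the session")   # sample line, lowercased
print(token_ids)
print(en_tokenizer.detokenize(token_ids))                # should reproduce the input (up to whitespace)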