Upload 5 files
- create_dataset_splits.py +0 -6
- eval.py +5 -7
- preprocess_dataset.py +0 -17
- train_tokenizers.py +0 -4
create_dataset_splits.py
CHANGED
@@ -4,13 +4,9 @@ import tensorflow.data as tf_data
 import pickle
 import random
 
-
-
-
 def read_files(path):
     with open(path, "r", encoding="utf-8") as f:
         dataset_split = f.read().split("\n")[:-1]
-        #to lowercase, idk why
         dataset_split = [line.lower() for line in dataset_split]
         return dataset_split
 
@@ -18,8 +14,6 @@ def save_list_to_file(file_path, string_list):
     with open(file_path, 'w') as file:
         file.writelines([f"{string}\n" for string in string_list])
 
-
-#EUROPARL cs-en
 #load files
 cs_file = 'datasets/europarl/europarl-v7.cs-en.cs'
 en_file = 'datasets/europarl/europarl-v7.cs-en.en'
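Note (not part of this commit): create_dataset_splits.py presumably shuffles the Europarl cs-en pairs and writes the train/valid/test files that preprocess_dataset.py later reads. That logic is outside this hunk, so the following is only a minimal sketch; the seed, the split sizes and the zip-based pairing are assumptions, while read_files, save_list_to_file and the output paths come from the files shown on this page.

import random

# Sketch only: split sizes and the seed are assumed, not taken from the commit.
cs_lines = read_files(cs_file)
en_lines = read_files(en_file)

pairs = list(zip(cs_lines, en_lines))
random.seed(42)                      # assumed seed for reproducibility
random.shuffle(pairs)

n_valid = n_test = 2000              # assumed split sizes
valid = pairs[:n_valid]
test = pairs[n_valid:n_valid + n_test]
train = pairs[n_valid + n_test:]

for name, split in [("train", train), ("valid", valid), ("test", test)]:
    save_list_to_file(f"datasets/europarl/{name}-cs-en.cs", [cs for cs, _ in split])
    save_list_to_file(f"datasets/europarl/{name}-cs-en.en", [en for _, en in split])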
eval.py
CHANGED
@@ -2,18 +2,17 @@
 import keras_nlp
 import keras
 import tensorflow.data as tf_data
-import pickle
 import tensorflow as tf
 from tensorflow.keras.optimizers import Adam
 from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
-import datetime
-import random
 import re
 from sacrebleu.metrics import CHRF
 import time
-
-
+
+
 MAX_SEQUENCE_LENGTH = 64
+eval_samples = 100
+
 
 transformer = keras.models.load_model('models_europarl/en_cs_translator_saved_20231209_0046.keras')
 def read_files(path, lowercase = False):
@@ -46,7 +45,6 @@ def next_token_logits(encoder_input_tokens, prompt, predicted_token_index):
 
 
 def greedy_decode(encoder_input_tokens, prompt, end_token_id):
-
     start_index = 1
     current_prompt = prompt
     for predicted_token_index in range(start_index, MAX_SEQUENCE_LENGTH):
@@ -152,7 +150,7 @@ bleu_metrics = keras_nlp.metrics.Bleu(
     tokenizer = cs_tokenizer
 )
 
-
+
 chrf = CHRF()
 refs = test_cs[:eval_samples]
 translations = []
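Note (not part of this commit): the blank line removed inside greedy_decode is the only part of that function visible here. As a rough sketch of what a greedy loop over next_token_logits typically looks like, assuming next_token_logits returns a vector of vocabulary logits for the position being filled and prompt is a (1, MAX_SEQUENCE_LENGTH) token buffer pre-filled with the start token and padding:

import numpy as np

# Sketch only: eval.py's real greedy_decode is not fully shown in this diff.
def greedy_decode_sketch(encoder_input_tokens, prompt, end_token_id):
    current_prompt = np.array(prompt)            # (1, MAX_SEQUENCE_LENGTH) token ids
    for predicted_token_index in range(1, MAX_SEQUENCE_LENGTH):
        logits = next_token_logits(encoder_input_tokens, current_prompt, predicted_token_index)
        next_id = int(np.argmax(logits))         # greedy choice: most likely next token
        current_prompt[0, predicted_token_index] = next_id
        if next_id == end_token_id:              # stop once the end token is produced
            break
    return current_prompt

Once translations has been filled with decoded sentences, the chrF score set up at the bottom of the hunk is usually computed with sacrebleu as chrf.corpus_score(translations, [refs]).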
preprocess_dataset.py
CHANGED
@@ -12,13 +12,10 @@ MAX_SEQUENCE_LENGTH = 64
 def read_files(path, lowercase = False):
     with open(path, "r", encoding="utf-8") as f:
         dataset_split = f.read().split("\n")[:-1]
-    #to lowercase, idk why
     if(lowercase):
         dataset_split = [line.lower() for line in dataset_split]
     return dataset_split
 
-# en_vocab = read_files("tokenizers/en_opus_vocab")
-# cs_vocab = read_files("tokenizers/cs_opus_vocab")
 en_vocab = read_files("tokenizers/en_europarl_vocab")
 cs_vocab = read_files("tokenizers/cs_europarl_vocab")
 
@@ -32,42 +29,28 @@ cs_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
 )
 
 
-#opus
-# train_cs_file = 'datasets/cs-en/opus.cs-en-train.cs'
-# train_en_file = 'datasets/cs-en/opus.cs-en-train.en'
-# valid_cs_file = 'datasets/cs-en/opus.cs-en-dev.cs'
-# valid_en_file = 'datasets/cs-en/opus.cs-en-dev.en'
-# test_cs_file = 'datasets/cs-en/opus.cs-en-test.cs'
-# test_en_file = 'datasets/cs-en/opus.cs-en-test.en'
-
 #europarl
 train_cs_file = 'datasets/europarl/train-cs-en.cs'
 train_en_file = 'datasets/europarl/train-cs-en.en'
 valid_cs_file = 'datasets/europarl/valid-cs-en.cs'
 valid_en_file = 'datasets/europarl/valid-cs-en.en'
-test_cs_file = 'datasets/europarl/test-cs-en.cs'
-test_en_file = 'datasets/europarl/test-cs-en.en'
 
 
 train_cs = read_files(train_cs_file, True)
 train_en = read_files(train_en_file, True)
 valid_cs = read_files(valid_cs_file, True)
 valid_en = read_files(valid_en_file, True)
-test_cs = read_files(test_cs_file, True)
-test_en = read_files(test_en_file, True)
 
 def preprocess_batch(en, cs):
     en = en_tokenizer(en)
     cs = cs_tokenizer(cs)
 
-    # Pad `eng` to `MAX_SEQUENCE_LENGTH`.
     en_start_end_packer = keras_nlp.layers.StartEndPacker(
         sequence_length=MAX_SEQUENCE_LENGTH,
         pad_value=en_tokenizer.token_to_id("[PAD]"),
     )
     en = en_start_end_packer(en)
 
-    # Add special tokens (`"[START]"` and `"[END]"`) to `spa` and pad it as well.
     cs_start_end_packer = keras_nlp.layers.StartEndPacker(
         sequence_length=MAX_SEQUENCE_LENGTH + 1,
         start_value=cs_tokenizer.token_to_id("[START]"),
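Note (not part of this commit): the two comments deleted from preprocess_batch described the StartEndPacker calls and still referred to eng/spa, which suggests they were adapted from the KerasNLP English-Spanish translation example. As a tiny standalone illustration of what the layer does, using made-up token ids:

import keras_nlp

# Toy illustration with made-up ids: pack a 3-token sequence to length 8,
# prepending start id 1, appending end id 2, and padding with 0.
packer = keras_nlp.layers.StartEndPacker(
    sequence_length=8, start_value=1, end_value=2, pad_value=0
)
print(packer([5, 6, 7]))
# -> [1, 5, 6, 7, 2, 0, 0, 0]

The Czech side is packed to MAX_SEQUENCE_LENGTH + 1, presumably so that the decoder input and the prediction target can be offset from each other by one token, as in standard teacher forcing.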
train_tokenizers.py
CHANGED
@@ -4,13 +4,9 @@ import tensorflow.data as tf_data
 import pickle
 import random
 
-
 EN_VOCAB_SIZE = 30000
 CS_VOCAB_SIZE = 30000
 
-
-
-
 def train_word_piece(text_samples, vocab_size, reserved_tokens, save_output_path):
     word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)
     vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
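Note (not part of this commit): train_word_piece is cut off right after the compute_word_piece_vocabulary call. A rough sketch of what such a helper usually looks like is shown below; the batching, the reserved-token list and the way the vocabulary is written to disk are assumptions, not taken from this repository.

import keras_nlp
import tensorflow.data as tf_data

# Sketch of a WordPiece-training helper; batch size, reserved tokens and the
# file-writing step are assumed.
def train_word_piece_sketch(text_samples, vocab_size, reserved_tokens, save_output_path):
    word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)
    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(1000).prefetch(2),
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,     # e.g. ["[PAD]", "[UNK]", "[START]", "[END]"]
    )
    with open(save_output_path, "w", encoding="utf-8") as f:
        f.writelines(f"{token}\n" for token in vocab)
    return vocab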