jkot committed
Commit 63bf667 · Parent: 75a2cec

Upload 5 files

create_dataset_splits.py CHANGED
@@ -4,13 +4,9 @@ import tensorflow.data as tf_data
 import pickle
 import random
 
-
-
-
 def read_files(path):
     with open(path, "r", encoding="utf-8") as f:
         dataset_split = f.read().split("\n")[:-1]
-    #to lowercase, idk why
     dataset_split = [line.lower() for line in dataset_split]
     return dataset_split
 
@@ -18,8 +14,6 @@ def save_list_to_file(file_path, string_list):
     with open(file_path, 'w') as file:
         file.writelines([f"{string}\n" for string in string_list])
 
-
-#EUROPARL cs-en
 #load files
 cs_file = 'datasets/europarl/europarl-v7.cs-en.cs'
 en_file = 'datasets/europarl/europarl-v7.cs-en.en'
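
The hunks above only touch the shared helpers. For context, a minimal sketch of how the split step could continue, assuming the read_files and save_list_to_file helpers from this file and the train/valid/test file names that preprocess_dataset.py expects; the shuffle seed and the 80/10/10 ratio are assumptions, not taken from the commit.

import random

cs_lines = read_files('datasets/europarl/europarl-v7.cs-en.cs')
en_lines = read_files('datasets/europarl/europarl-v7.cs-en.en')

pairs = list(zip(cs_lines, en_lines))  # keep the sentence alignment while shuffling
random.seed(42)                        # assumed seed, for reproducible splits
random.shuffle(pairs)

n = len(pairs)
splits = {
    "train": pairs[: int(0.8 * n)],               # assumed 80/10/10 split
    "valid": pairs[int(0.8 * n): int(0.9 * n)],
    "test": pairs[int(0.9 * n):],
}
for name, split in splits.items():
    save_list_to_file(f'datasets/europarl/{name}-cs-en.cs', [cs for cs, _ in split])
    save_list_to_file(f'datasets/europarl/{name}-cs-en.en', [en for _, en in split])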
 
eval.py CHANGED
@@ -2,18 +2,17 @@
 import keras_nlp
 import keras
 import tensorflow.data as tf_data
-import pickle
 import tensorflow as tf
 from tensorflow.keras.optimizers import Adam
 from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
-import datetime
-import random
 import re
 from sacrebleu.metrics import CHRF
 import time
-# from keras import ops
-#hyperparameters
+
+
 MAX_SEQUENCE_LENGTH = 64
+eval_samples = 100
+
 
 transformer = keras.models.load_model('models_europarl/en_cs_translator_saved_20231209_0046.keras')
 def read_files(path, lowercase = False):
@@ -46,7 +45,6 @@ def next_token_logits(encoder_input_tokens, prompt, predicted_token_index):
 
 
 def greedy_decode(encoder_input_tokens, prompt, end_token_id):
-
     start_index = 1
     current_prompt = prompt
     for predicted_token_index in range(start_index, MAX_SEQUENCE_LENGTH):
@@ -152,7 +150,7 @@ bleu_metrics = keras_nlp.metrics.Bleu(
     tokenizer = cs_tokenizer
 )
 
-eval_samples = 100
+
 chrf = CHRF()
 refs = test_cs[:eval_samples]
 translations = []
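
For reference, a minimal, self-contained sketch of the corpus-level chrF scoring that eval.py performs with sacrebleu; the toy strings below stand in for the greedy-decoded translations and the test references.

from sacrebleu.metrics import CHRF

chrf = CHRF()
translations = ["dobrý den , jak se máte ?"]       # hypotheses (model output), one string per sentence
refs = ["dobrý den , jak se dnes máte ?"]          # matching references, same order
score = chrf.corpus_score(translations, [refs])    # note: references are wrapped in an outer list
print(score.score)                                 # chrF value on a 0-100 scale
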
preprocess_dataset.py CHANGED
@@ -12,13 +12,10 @@ MAX_SEQUENCE_LENGTH = 64
 def read_files(path, lowercase = False):
     with open(path, "r", encoding="utf-8") as f:
         dataset_split = f.read().split("\n")[:-1]
-    #to lowercase, idk why
     if(lowercase):
         dataset_split = [line.lower() for line in dataset_split]
     return dataset_split
 
-# en_vocab = read_files("tokenizers/en_opus_vocab")
-# cs_vocab = read_files("tokenizers/cs_opus_vocab")
 en_vocab = read_files("tokenizers/en_europarl_vocab")
 cs_vocab = read_files("tokenizers/cs_europarl_vocab")
 
@@ -32,42 +29,28 @@ cs_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
 )
 
 
-#opus
-# train_cs_file = 'datasets/cs-en/opus.cs-en-train.cs'
-# train_en_file = 'datasets/cs-en/opus.cs-en-train.en'
-# valid_cs_file = 'datasets/cs-en/opus.cs-en-dev.cs'
-# valid_en_file = 'datasets/cs-en/opus.cs-en-dev.en'
-# test_cs_file = 'datasets/cs-en/opus.cs-en-test.cs'
-# test_en_file = 'datasets/cs-en/opus.cs-en-test.en'
-
 #europarl
 train_cs_file = 'datasets/europarl/train-cs-en.cs'
 train_en_file = 'datasets/europarl/train-cs-en.en'
 valid_cs_file = 'datasets/europarl/valid-cs-en.cs'
 valid_en_file = 'datasets/europarl/valid-cs-en.en'
-test_cs_file = 'datasets/europarl/test-cs-en.cs'
-test_en_file = 'datasets/europarl/test-cs-en.en'
 
 
 train_cs = read_files(train_cs_file, True)
 train_en = read_files(train_en_file, True)
 valid_cs = read_files(valid_cs_file, True)
 valid_en = read_files(valid_en_file, True)
-test_cs = read_files(test_cs_file, True)
-test_en = read_files(test_en_file, True)
 
 def preprocess_batch(en, cs):
     en = en_tokenizer(en)
     cs = cs_tokenizer(cs)
 
-    # Pad `eng` to `MAX_SEQUENCE_LENGTH`.
     en_start_end_packer = keras_nlp.layers.StartEndPacker(
         sequence_length=MAX_SEQUENCE_LENGTH,
         pad_value=en_tokenizer.token_to_id("[PAD]"),
     )
     en = en_start_end_packer(en)
 
-    # Add special tokens (`"[START]"` and `"[END]"`) to `spa` and pad it as well.
     cs_start_end_packer = keras_nlp.layers.StartEndPacker(
         sequence_length=MAX_SEQUENCE_LENGTH + 1,
         start_value=cs_tokenizer.token_to_id("[START]"),
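
A small sketch of what StartEndPacker does inside preprocess_batch, using toy token ids instead of the real tokenizers (1, 2 and 0 stand in for the [START], [END] and [PAD] ids; the real ids come from the vocabularies loaded above).

import keras_nlp

packer = keras_nlp.layers.StartEndPacker(
    sequence_length=8,   # pad or truncate every sequence to this length
    start_value=1,       # assumed [START] id
    end_value=2,         # assumed [END] id
    pad_value=0,         # assumed [PAD] id
)
print(packer([5, 6, 7]))  # -> [1, 5, 6, 7, 2, 0, 0, 0]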
 
train_tokenizers.py CHANGED
@@ -4,13 +4,9 @@ import tensorflow.data as tf_data
 import pickle
 import random
 
-
 EN_VOCAB_SIZE = 30000
 CS_VOCAB_SIZE = 30000
 
-
-
-
 def train_word_piece(text_samples, vocab_size, reserved_tokens, save_output_path):
     word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)
     vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
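
A sketch, under stated assumptions, of how train_word_piece presumably finishes using the KerasNLP word-piece training API; the batch size, the example reserved-token list, and writing the vocabulary via vocabulary_output_file are assumptions, not read from the commit.

import keras_nlp
import tensorflow.data as tf_data

def train_word_piece_sketch(text_samples, vocab_size, reserved_tokens, save_output_path):
    word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)
    keras_nlp.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(1000).prefetch(2),       # stream the text in batches
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,             # e.g. ["[PAD]", "[UNK]", "[START]", "[END]"]
        vocabulary_output_file=save_output_path,     # writes one token per line
    )

# e.g. train_word_piece_sketch(train_en, EN_VOCAB_SIZE, ["[PAD]", "[UNK]", "[START]", "[END]"], "tokenizers/en_europarl_vocab")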
 