import keras_nlp
import tensorflow.data as tf_data

# Hyperparameters
BATCH_SIZE = 16
MAX_SEQUENCE_LENGTH = 64

# Helper to read a text file (vocabulary or parallel corpus) into a list of lines.

def read_files(path, lowercase=False):
    with open(path, "r", encoding="utf-8") as f:
        dataset_split = f.read().split("\n")[:-1]
    # Optionally lowercase each line so the text matches an uncased vocabulary.
    if lowercase:
        dataset_split = [line.lower() for line in dataset_split]
    return dataset_split

# en_vocab = read_files("tokenizers/en_opus_vocab")
# cs_vocab = read_files("tokenizers/cs_opus_vocab")
en_vocab = read_files("tokenizers/en_europarl_vocab")
cs_vocab = read_files("tokenizers/cs_europarl_vocab")

en_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=en_vocab, 
    lowercase=False
)
cs_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=cs_vocab, 
    lowercase=False
)
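# Note: the English vocabulary is expected to contain a "[PAD]" token, and the
# Czech vocabulary "[PAD]", "[START]" and "[END]"; the `token_to_id` lookups in
# `preprocess_batch` below will fail if any of them is missing.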


# OPUS cs-en data files (alternative dataset)
# train_cs_file = 'datasets/cs-en/opus.cs-en-train.cs'
# train_en_file = 'datasets/cs-en/opus.cs-en-train.en'
# valid_cs_file = 'datasets/cs-en/opus.cs-en-dev.cs'
# valid_en_file = 'datasets/cs-en/opus.cs-en-dev.en'
# test_cs_file = 'datasets/cs-en/opus.cs-en-test.cs'
# test_en_file = 'datasets/cs-en/opus.cs-en-test.en'

# Europarl cs-en data files
train_cs_file = 'datasets/europarl/train-cs-en.cs'
train_en_file = 'datasets/europarl/train-cs-en.en'
valid_cs_file = 'datasets/europarl/valid-cs-en.cs'
valid_en_file = 'datasets/europarl/valid-cs-en.en'
test_cs_file = 'datasets/europarl/test-cs-en.cs'
test_en_file = 'datasets/europarl/test-cs-en.en'


train_cs = read_files(train_cs_file, True)
train_en = read_files(train_en_file, True)
valid_cs = read_files(valid_cs_file, True)
valid_en = read_files(valid_en_file, True)
test_cs = read_files(test_cs_file, True)
test_en = read_files(test_en_file, True)

def preprocess_batch(en, cs):
    en = en_tokenizer(en)
    cs = cs_tokenizer(cs)

    # Pad the English (encoder) inputs to `MAX_SEQUENCE_LENGTH`.
    en_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=en_tokenizer.token_to_id("[PAD]"),
    )
    en = en_start_end_packer(en)

    # Add `"[START]"` and `"[END]"` tokens to the Czech (decoder) side and pad it as well.
    cs_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH + 1,
        start_value=cs_tokenizer.token_to_id("[START]"),
        end_value=cs_tokenizer.token_to_id("[END]"),
        pad_value=cs_tokenizer.token_to_id("[PAD]"),
    )
    cs = cs_start_end_packer(cs)

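    # Teacher forcing: the decoder input is the Czech sequence without its last
    # token, and the training target is the same sequence shifted one step to
    # the left (i.e. without "[START]").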
    return (
        {
            "encoder_inputs": en,
            "decoder_inputs": cs[:, :-1],
        },
        cs[:, 1:],
    )


def make_dataset(en_texts, cs_texts):
    dataset = tf_data.Dataset.from_tensor_slices((en_texts, cs_texts))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf_data.AUTOTUNE)
    # Cache the tokenized batches, reshuffle them every epoch, and prefetch for pipelining.
    return dataset.cache().shuffle(2048).prefetch(16)


train_ds = make_dataset(train_en, train_cs)
val_ds = make_dataset(valid_en, valid_cs)

# Persist the preprocessed pipelines so the training script can load them directly.
train_ds.save("datasets/preprocessed_europarl_train")
val_ds.save("datasets/preprocessed_europarl_valid")
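
# Optional sanity check (a minimal sketch, assuming TF >= 2.10 where
# `tf.data.Dataset.load` is available): reload one saved split and inspect
# the tensor shapes of a single batch.
#
#   reloaded = tf_data.Dataset.load("datasets/preprocessed_europarl_train")
#   inputs, labels = next(iter(reloaded))
#   print(inputs["encoder_inputs"].shape)   # (BATCH_SIZE, MAX_SEQUENCE_LENGTH)
#   print(inputs["decoder_inputs"].shape)   # (BATCH_SIZE, MAX_SEQUENCE_LENGTH)
#   print(labels.shape)                     # (BATCH_SIZE, MAX_SEQUENCE_LENGTH)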