data.py
import os
import json

import sentencepiece as spm
# Load a JSONL file - extract (source, target) text pairs or full objects
def load_jsonl(jsonl_path: str, only_text_pairs=False):
    # Initialize return object
    data = []
    # Open the file and iterate over JSON objects, one per line
    with open(jsonl_path, "r", encoding="utf-8") as file:
        for i, line in enumerate(file, start=1):
            # Skip empty lines
            line = line.strip()
            if not line:
                continue
            # Parse the JSON object and extract the requested data
            try:
                obj = json.loads(line)
                # If text pairs are requested, extract them
                if only_text_pairs:
                    # Ensure source and target keys exist
                    if 'source' in obj and 'target' in obj:
                        data.append((obj['source'], obj['target']))
                    else:
                        print(f"Missing 'source' or 'target' at line {i}")
                # Otherwise, append the full JSON object
                else:
                    data.append(obj)
            # Report malformed lines without aborting the load
            except Exception as e:
                print(f"Error at line {i}: {e}")
    return data
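
# Example JSONL line this loader expects (hypothetical data; the real schema
# just needs 'source' and 'target' keys when only_text_pairs=True):
# {"source": "How are you?", "target": "Wie geht es dir?"}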

# Train a SentencePiece model on the given list of texts
def train_sentencepiece_model(text_list, model_prefix, vocab_size=1000):
    # Write all texts to a file (required input for SentencePiece)
    with open(f"{model_prefix}.txt", "w", encoding="utf-8") as file:
        for text in text_list:
            file.write(text.lower().strip() + "\n")
    # Learn the subword vocabulary with the SentencePiece trainer
    spm.SentencePieceTrainer.train(
        input=f"{model_prefix}.txt",
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        pad_id=0, unk_id=1, bos_id=2, eos_id=3,
    )
    # Remove the temporary text file
    os.remove(f"{model_prefix}.txt")
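
# Note: SentencePieceTrainer.train writes {model_prefix}.model and
# {model_prefix}.vocab to disk; load_sp_model below reads the .model file.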

# Load a trained SentencePiece model from disk
def load_sp_model(model_prefix):
    sp = spm.SentencePieceProcessor()
    sp.load(f"{model_prefix}.model")
    return sp

# Use the SentencePiece models to tokenize text pairs
def tokenize_text_pairs(text_pairs, sp_src, sp_tgt):
    # Initialize return object
    token_pairs = []
    # Get BOS/EOS token IDs (identical for both models by construction above)
    bos = sp_tgt.bos_id()
    eos = sp_tgt.eos_id()
    # Iterate over text pairs
    for src, tgt in text_pairs:
        # Tokenize source and target text with their respective models
        src_toks = [bos] + sp_src.encode(src.lower(), out_type=int) + [eos]
        tgt_toks = [bos] + sp_tgt.encode(tgt.lower(), out_type=int) + [eos]
        # Append the token pair to the return object
        token_pairs.append((src_toks, tgt_toks))
    return token_pairs
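
# Minimal end-to-end sketch (hypothetical file name "train.jsonl" and model
# prefixes "sp_src"/"sp_tgt"; adjust to your data). Shows how the helpers
# above are meant to chain together.
if __name__ == "__main__":
    # Load (source, target) text pairs from a JSONL file
    pairs = load_jsonl("train.jsonl", only_text_pairs=True)

    # Train separate tokenizers for the source and target sides
    train_sentencepiece_model([src for src, _ in pairs], "sp_src", vocab_size=1000)
    train_sentencepiece_model([tgt for _, tgt in pairs], "sp_tgt", vocab_size=1000)

    # Load the trained models and tokenize the whole corpus
    sp_src = load_sp_model("sp_src")
    sp_tgt = load_sp_model("sp_tgt")
    token_pairs = tokenize_text_pairs(pairs, sp_src, sp_tgt)
    print(f"Tokenized {len(token_pairs)} pairs")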