chemnlp/data/tabular/ocp/transform.py at 18b9e8c4e293f120161a694fd3270a523afc66e5 · OpenBioML/chemnlp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pandas as pd
from datasets import load_dataset
from pylatexenc.latexencode import unicode_to_latex


def uniCode2Latex(text: str) -> str:
    """
    converts unicode text to latex and
    fixes UTF-8 chars for latex in a certain range:
        ₀:$_0$ ... ₉:$_9$

    see https://github.com/phfaist/pylatexenc/issues/72

    Args:
        text(str): the string to fix

    Return:
        str: latex presentation of UTF-8 char
    """
    for code in range(8320, 8330):
        text = text.replace(chr(code), f"$_{code-8320}$")

    text = text.replace("\u0305", "$^-$")
    text = text.replace("\u207a", "$^+$")
    text = text.replace("\u207b", "$^-$")
    text = text.replace("\u2074", "$^4$")
    text = text.replace("\u2070", "$^0$")
    text = text.replace("\u2078", "$^1$")
    text = text.replace("\u2075", "$^2$")
    text = text.replace("\u2076", "$^3$")
    text = text.replace("\u2077", "$^5$")

    return unicode_to_latex(text)


def process():
    dataset = load_dataset("kjappelbaum/chemnlp-ocp")
    df_train = dataset["train"].to_pandas()
    df_val = dataset["valid"].to_pandas()

    df_train["split"] = "train"
    df_val["split"] = "valid"

    df = pd.concat([df_train, df_val])
    df["text"] = df["text"].apply(uniCode2Latex)
    print(len(df))
    df.to_csv("data_clean.csv", index=False)


if __name__ == "__main__":
    process()