-
Notifications
You must be signed in to change notification settings - Fork 46
Expand file tree
/
Copy pathtransform.py
More file actions
51 lines (38 loc) · 1.34 KB
/
transform.py
File metadata and controls
51 lines (38 loc) · 1.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pandas as pd
from datasets import load_dataset
from pylatexenc.latexencode import unicode_to_latex
def uniCode2Latex(text: str) -> str:
    """
    converts unicode text to latex and
    fixes UTF-8 chars for latex in a certain range.

    Sub-/superscript characters are rewritten as inline math before the
    text is handed to ``unicode_to_latex``:
        ₀:$_0$ ... ₉:$_9$
        ⁰:$^0$, ⁴:$^4$ ... ⁹:$^9$
        ⁺:$^+$, ⁻:$^-$, combining overline (U+0305):$^-$

    see https://github.com/phfaist/pylatexenc/issues/72

    Args:
        text(str): the string to fix

    Return:
        str: result of ``unicode_to_latex`` applied to the rewritten text
    """
    # Code point -> replacement string, applied in one pass by str.translate.
    # None of the replacement strings contains a character that is itself a
    # key, so a single pass gives the same result as chained replacements.
    replacements = {
        0x0305: "$^-$",  # combining overline
        0x207A: "$^+$",  # superscript plus
        0x207B: "$^-$",  # superscript minus
        0x2070: "$^0$",  # superscript zero
    }

    # Subscript digits occupy U+2080 (8320) .. U+2089 (8329).
    for digit in range(10):
        replacements[0x2080 + digit] = f"$_{digit}$"

    # Superscript digits 4-9 occupy U+2074 .. U+2079, so the digit is the
    # offset from U+2070. Superscript 1-3 live in the Latin-1 block
    # (U+00B9, U+00B2, U+00B3) and are left untouched here.
    for digit in range(4, 10):
        replacements[0x2070 + digit] = f"$^{digit}$"

    # NOTE(review): the inserted "$...$" markup also goes through
    # unicode_to_latex; confirm its handling of "$", "_" and "^" is intended.
    return unicode_to_latex(text.translate(replacements))
def process():
    """Build ``data_clean.csv`` from the ``kjappelbaum/chemnlp-ocp`` dataset.

    Converts the "train" and "valid" splits to DataFrames, tags each row
    with the split it came from, concatenates them, runs the "text" column
    through ``uniCode2Latex``, prints the total row count and writes the
    result to CSV without the index.
    """
    dataset = load_dataset("kjappelbaum/chemnlp-ocp")

    # One frame per split, each labelled with its split name.
    frames = []
    for split_name in ("train", "valid"):
        frame = dataset[split_name].to_pandas()
        frame["split"] = split_name
        frames.append(frame)

    df = pd.concat(frames)

    latex_text = df["text"].apply(uniCode2Latex)
    df["text"] = latex_text

    row_count = len(df)
    print(row_count)

    df.to_csv("data_clean.csv", index=False)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    process()