-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_selfies.py
More file actions
61 lines (49 loc) · 1.68 KB
/
generate_selfies.py
File metadata and controls
61 lines (49 loc) · 1.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""Generate SELFIES from a SMILES CSV."""
from __future__ import annotations
import argparse
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
from SELFormerMM.utils.datasets import smiles_to_selfies
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the SELFIES generation script.

    Args:
        argv: Argument strings to parse. When ``None`` (the default) argparse
            falls back to ``sys.argv[1:]``, so existing zero-argument callers
            behave exactly as before. Passing an explicit list lets tests and
            other code drive the parser without touching ``sys.argv``.

    Returns:
        A namespace with the attributes ``smiles_dataset``,
        ``selfies_dataset``, ``smiles_column`` (default ``"smiles"``) and
        ``on_error`` (default ``"keep"``; one of ``"keep"``, ``"empty"``,
        ``"raise"``).
    """
    parser = argparse.ArgumentParser(description="Generate SELFIES from SMILES.")
    parser.add_argument(
        "--smiles_dataset",
        required=True,
        help="Path to input CSV containing a SMILES column.",
    )
    parser.add_argument(
        "--selfies_dataset",
        required=True,
        help="Path to output CSV with a SELFIES column.",
    )
    parser.add_argument(
        "--smiles_column",
        default="smiles",
        help="Name of the SMILES column in the input CSV.",
    )
    parser.add_argument(
        "--on_error",
        default="keep",
        choices=["keep", "empty", "raise"],
        help="How to handle SMILES conversion errors.",
    )
    return parser.parse_args(argv)
def main() -> None:
    """Read the SMILES CSV, add a ``selfies`` column, and write the result.

    Each value of the selected SMILES column is passed to
    ``smiles_to_selfies`` together with the ``--on_error`` policy. The
    resulting series is inserted as the first column, named ``selfies``, and
    the frame is written to the output path without the index. A ``selfies``
    column already present in the input is discarded and regenerated.

    Raises:
        ValueError: If the SMILES column is not present in the input CSV, or
            if the SMILES column is itself named ``selfies`` and would
            therefore collide with the generated output column.
    """
    args = parse_args()
    input_path = Path(args.smiles_dataset)
    output_path = Path(args.selfies_dataset)

    df = pd.read_csv(input_path)
    if args.smiles_column not in df.columns:
        raise ValueError(f"Column '{args.smiles_column}' not found in {input_path}")
    if args.smiles_column == "selfies":
        # The stale-output cleanup below would drop the input column itself,
        # and the later lookup would fail with an unhelpful KeyError.
        raise ValueError(
            "The SMILES column must not be named 'selfies': that name is "
            "reserved for the generated output column."
        )

    # Create the destination directory before the conversion so that a
    # missing directory does not surface only after all rows are processed.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Drop a pre-existing output column so the insert below cannot collide.
    if "selfies" in df.columns:
        df = df.drop(columns=["selfies"])

    tqdm.pandas(desc="Generating SELFIES")
    selfies = df[args.smiles_column].progress_apply(
        lambda smi: smiles_to_selfies(smi, on_error=args.on_error)
    )
    df.insert(0, "selfies", selfies)
    df.to_csv(output_path, index=False)
    print(f"SELFIES representation file is ready: {output_path}")
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()