-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaircraft_organize.py
More file actions
181 lines (151 loc) · 5.41 KB
/
aircraft_organize.py
File metadata and controls
181 lines (151 loc) · 5.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/env python3
"""
Reorganize FGVC-Aircraft-2013b into ImageFolder layout by *variant*:
Source (inside fgvc-aircraft-2013b):
- data/images/ # raw images (e.g., 10001.jpg)
- data/images_variant_trainval.txt
- data/images_variant_test.txt
Target (created as a sibling of fgvc-aircraft-2013b):
aircraft/
train/<FAMILY_NAME>/*.jpg
test/<FAMILY_NAME>/*.jpg
Notes:
- No absolute paths are used. All paths are derived from args.
- FAMILY_NAME is sanitized to be a safe directory name.
- By default files are copied; you can use --link hard/soft to save disk.
"""
import argparse
import shutil
from pathlib import Path
import re
from typing import List, Tuple
def parse_args():
    """Parse the command-line options of the reorganization script.

    Returns:
        argparse.Namespace with three attributes:
        ``dataset_dir`` (Path, defaults to ``<cwd>/fgvc-aircraft-2013b``),
        ``out_dir`` (Path, or None when not given) and
        ``link`` (one of "none", "hard", "soft"; defaults to "none").
    """
    parser = argparse.ArgumentParser(
        description="Build ImageFolder (family) splits for FGVC-Aircraft-2013b"
    )
    # One (flags, keyword-arguments) entry per option, registered in order.
    option_table = (
        (
            ("--ds", "--dataset_dir"),
            {
                "dest": "dataset_dir",
                "type": Path,
                "default": Path.cwd() / "fgvc-aircraft-2013b",
                "help": "Path to fgvc-aircraft-2013b directory (default: ./fgvc-aircraft-2013b)",
            },
        ),
        (
            ("--out", "--out_dir"),
            {
                "dest": "out_dir",
                "type": Path,
                "default": None,
                "help": "Output root (default: sibling './aircraft' next to dataset_dir)",
            },
        ),
        (
            ("--link",),
            {
                "choices": ["none", "hard", "soft"],
                "default": "none",
                "help": "How to place files: 'none' = copy (default), 'hard' = hardlink, 'soft' = symlink",
            },
        ),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    return parser.parse_args()
def sanitize(name: str) -> str:
    """Turn a label into a safe directory name.

    ASCII letters, digits, '-' and '_' are kept. Every run of other
    characters (including whitespace and '/') becomes a single '-', and
    leading/trailing dashes are dropped. The result may be empty.
    """
    cleaned = name.strip().replace("/", "-")
    # Applied in order: whitespace runs, then any remaining disallowed
    # characters, then runs of dashes produced by the earlier steps.
    for pattern in (r"\s+", r"[^A-Za-z0-9\-_]+", r"-{2,}"):
        cleaned = re.sub(pattern, "-", cleaned)
    return cleaned.strip("-")
def read_list(txt_path: Path) -> List[Tuple[str, str]]:
    """
    Parse an annotation list file with one image per line, e.g.:
    10075 Boeing 737-700

    The first whitespace-separated token is the image id (file stem). The
    remaining tokens, re-joined with single spaces, form the label (empty
    when the line holds only an id). Blank lines are skipped.

    Returns a list of (image_stem, label) tuples in file order.
    """
    with txt_path.open("r", encoding="utf-8") as handle:
        # split() with no argument drops surrounding whitespace and yields an
        # empty list for blank lines, which the filter below discards.
        tokenized = (raw_line.split() for raw_line in handle)
        return [(tokens[0], " ".join(tokens[1:])) for tokens in tokenized if tokens]
def find_image(images_dir: Path, stem: str) -> Path:
    """Return the path of the image named ``stem`` inside ``images_dir``.

    Extensions are probed in a fixed order (.jpg first) and the first
    existing candidate wins.

    Raises:
        FileNotFoundError: if no candidate exists for any probed extension.
    """
    suffixes = (".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG")
    candidates = (images_dir / f"{stem}{suffix}" for suffix in suffixes)
    found = next((candidate for candidate in candidates if candidate.exists()), None)
    if found is None:
        raise FileNotFoundError(f"Image not found for stem '{stem}' in {images_dir}")
    return found
def place(src: Path, dst: Path, mode: str):
    """
    Place the file ``src`` at ``dst`` according to ``mode``:
    - none: copy (shutil.copy2)
    - hard: hardlink (src and dst must be on the same filesystem)
    - soft: symlink pointing at the absolute path of src

    If ``dst`` already exists nothing is done, so the script can be re-run
    over a partially populated output tree.

    Raises:
        ValueError: if ``mode`` is not one of "none", "hard", "soft"
            (only checked when ``dst`` does not exist yet).
    """
    # Local import: the module does not import os at file level.
    import os

    if dst.exists():
        return
    if mode == "none":
        shutil.copy2(src, dst)
    elif mode == "hard":
        # os.link works on every Python 3 version. The previous code probed
        # for Path.link_to but then called Path.hardlink_to (added in 3.10),
        # which raised AttributeError on Python 3.8/3.9.
        os.link(src, dst)
    elif mode == "soft":
        # A relative link target is resolved against the link's own
        # directory, not the current working directory, so a relative src
        # would yield a dangling link. Store an absolute target instead.
        dst.symlink_to(Path(src).absolute())
    else:
        raise ValueError(f"Unknown link mode: {mode}")
def build_split(
    pairs: List[Tuple[str, str]],
    images_dir: Path,
    split_root: Path,
    link_mode: str
):
    """Populate one split directory (train or test) with per-class folders.

    For every (stem, label) pair the label is sanitized into a directory
    name under ``split_root``, the matching image is looked up in
    ``images_dir`` and placed into that directory using ``link_mode``.
    The split root is created even when ``pairs`` is empty.
    """
    split_root.mkdir(parents=True, exist_ok=True)
    for image_stem, raw_label in pairs:
        target_dir = split_root / sanitize(raw_label)
        # The class directory is created before the image lookup.
        target_dir.mkdir(parents=True, exist_ok=True)
        source_file = find_image(images_dir, image_stem)
        place(source_file, target_dir / source_file.name, link_mode)
def main():
    """Script entry point: validate the dataset layout and build both splits.

    Reads the trainval and test variant lists from ``<dataset_dir>/data``,
    prints a short summary, then fills ``<out_dir>/train`` and
    ``<out_dir>/test`` from ``<dataset_dir>/data/images``.

    Raises:
        FileNotFoundError: if the data directory, the images directory or
            one of the two list files is missing.
    """
    args = parse_args()
    dataset_dir = args.dataset_dir.resolve()
    # Without --out the output lands in an "aircraft" directory next to the dataset.
    out_dir = (
        args.out_dir.resolve()
        if args.out_dir is not None
        else dataset_dir.parent / "aircraft"
    )

    data_dir = dataset_dir / "data"
    images_dir = data_dir / "images"
    # Split name -> annotation list; insertion order (train, test) drives everything below.
    split_lists = {
        "train": data_dir / "images_variant_trainval.txt",
        "test": data_dir / "images_variant_test.txt",
    }

    # Basic validations: report the first required path that is missing.
    required_paths = (data_dir, images_dir, *split_lists.values())
    first_missing = next((path for path in required_paths if not path.exists()), None)
    if first_missing is not None:
        raise FileNotFoundError(f"Required path not found: {first_missing}")

    # Read both split files before touching the output directory.
    split_pairs = {name: read_list(path) for name, path in split_lists.items()}
    split_roots = {name: out_dir / name for name in split_lists}

    mode_label = "copy" if args.link == "none" else args.link + "-link"
    print(f"Dataset dir : {dataset_dir}")
    print(f"Images dir : {images_dir}")
    print(f"Output dir : {out_dir}")
    print(f"Train pairs : {len(split_pairs['train'])}")
    print(f"Test pairs : {len(split_pairs['test'])}")
    print(f"Mode : {mode_label}")

    # Build folder structure, train first and then test.
    for name, pairs in split_pairs.items():
        build_split(pairs, images_dir, split_roots[name], args.link)

    print("Done. Example tree:")
    for root in split_roots.values():
        print(str(root) + "/<FAMILY_NAME>/*.jpg")
# Run the reorganization only when executed as a script, not when imported.
if __name__ == "__main__":
    main()