-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathrun_gen_storage_plan.py
More file actions
118 lines (94 loc) · 4.13 KB
/
run_gen_storage_plan.py
File metadata and controls
118 lines (94 loc) · 4.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import argparse
import json
import os
import sys
from pathlib import Path
from main import run_conv_wrapper
# add parent to path
sys.path.append(os.path.join(os.getcwd(), ".."))
from dataset.dataset_tables_dict import get_benchmark_schema
from utils.cli_config import add_common_args, build_run_config
from utils.gen_common import parse_query_ids
def main(args):
    """Create the storage-plan conversation file and run it.

    Args:
        args: Parsed CLI namespace. The attributes read here are ``conv``,
            ``benchmark``, ``notify``, ``disable_repo_sync``,
            ``replay_cache``, ``auto_u`` and ``auto_finish``.

    Raises:
        ValueError: If ``args.conv`` does not start with ``"storageplan"``,
            or if no query ids could be derived from it.
    """
    # ===== CONFIGURATION =====
    short_name = args.conv
    benchmark = args.benchmark

    # extract queries from short name
    prefix = "storageplan"
    # Explicit raises instead of ``assert`` so the checks survive ``python -O``.
    if not short_name.startswith(prefix):
        raise ValueError(
            f"Conversation name {short_name!r} must start with {prefix!r}"
        )

    # Bind the name up front: without this, a short name lacking "v" would
    # hit an UnboundLocalError below instead of the intended error message.
    query_ids = None
    if "v" in short_name:
        query_ids = parse_query_ids(short_name, prefix, benchmark=benchmark)
    if query_ids is None:
        raise ValueError(f"Failed to parse query ids from {short_name}")

    max_scale_factor = 20
    # =========================

    config = build_run_config(
        benchmark=benchmark,
        conv_name=short_name,
        query_list=",".join(map(str, query_ids)),
        notify=args.notify,
        conv_mode="scripted",
        disable_repo_sync=args.disable_repo_sync,
        max_scale_factor=max_scale_factor,
        replay_cache=args.replay_cache,
        auto_u=args.auto_u,
        auto_finish=args.auto_finish,
    )

    # create conversation
    create_conversation(
        benchmark,
        short_name,
        schema=get_benchmark_schema(benchmark),
        conversation_dir=Path(config.artifacts_dir) / "conversations",
    )

    # run conversation
    run_conv_wrapper(config)
def create_conversation(
    benchmark,
    short_name,
    schema: str,
    conversation_dir: Path,
):
    """Write the single-prompt storage-plan conversation to a JSON file.

    The file is named ``{benchmark}_{short_name}.json`` inside
    ``conversation_dir`` and contains a JSON list with one prompt string.

    Args:
        benchmark: Benchmark name; used as the first part of the file name.
        short_name: Conversation short name; second part of the file name.
        schema: Schema text interpolated verbatim into the prompt.
        conversation_dir: Directory that receives the conversation file. It
            is created (including parents) if it does not exist yet.

    Raises:
        ValueError: If the target conversation file already exists.
    """
    # File name the prompt tells the model to read the queries from.
    queries_path = "queries.txt"

    # NOTE(review): the parenthetical expansion of "SoA/AoS" in the prompt
    # appears to list the two names in swapped order. The text is left
    # untouched on purpose because it is runtime input to the conversation.
    prompt = f"""Your task is to analyze the workload and produce a creative in-memory storage-layout summary for the tables accessed by the query. You have the flexibility to return detailed, free-form text that explores not only conventional storage-layout recommendations but also unconventional, novel, and even 'crazy' storage designs.
You are encouraged to include additional ideas, new partitioning strategies, speculative encoding techniques, or experimental ways of grouping and organizing columns or data.
For each accessed table, feel free to be inventive and elaborate on possibilities such as hybrid layouts, speculative SoA/AoS (Array of Structures/Structure of Arrays) approaches, novel column encodings, or adaptive partitioning.
Use this as an opportunity to push beyond current norms and propose storage techniques that might be futuristic or outlandish.
Output the storage layout for each table. Output only the final storage layout.
Important:
- store all the data, and store them in a way that it could be flattened back to the original data
- do not store data redundantly, but you can use compression or encoding, meta data, or special datastructures
- optimized for in-memory (single-node) analytical query processing
The queries are listed in the file: {queries_path}.
The schema is:
{schema}
Based on the given queries and schema, provide a detailed and creative storage layout summary for the tables accessed by the query. Feel free to explore unconventional and novel storage designs, including speculative encoding techniques or experimental ways of organizing data. Write it to the file: `storage_plan.txt`."""
    prompt_list = [prompt]

    conversation_dir = Path(conversation_dir)
    # A missing directory would otherwise surface as FileNotFoundError on open.
    conversation_dir.mkdir(parents=True, exist_ok=True)
    target_path = conversation_dir / f"{benchmark}_{short_name}.json"

    # Exclusive-create mode makes "check that it does not exist" and "create"
    # one atomic step, so an existing conversation can never be overwritten.
    try:
        with open(target_path, "x") as f:
            json.dump(prompt_list, f, indent=2)
    except FileExistsError as err:
        raise ValueError(
            f"Conversation file {target_path} already exists."
        ) from err
def build_parser(*, add_help: bool = True) -> argparse.ArgumentParser:
    """Build the command-line parser for this script.

    Registers the required ``--conv`` option and then lets
    ``add_common_args`` add the shared options enabled below.

    Args:
        add_help: Forwarded to ``argparse.ArgumentParser``.

    Returns:
        The configured parser.
    """
    cli = argparse.ArgumentParser(add_help=add_help)
    cli.add_argument(
        "--conv",
        required=True,
        type=str,
        help="Short name for the conversation",
    )

    # Shared option groups this script opts into, in the original order.
    enabled_groups = (
        "notify",
        "disable_repo_sync",
        "replay_cache",
        "benchmark",
        "auto_u",
        "auto_finish",
    )
    include_flags = {f"include_{group}": True for group in enabled_groups}
    add_common_args(cli, **include_flags)
    return cli
if __name__ == "__main__":
    # Script entry point: parse the command line and hand it to main().
    args = build_parser().parse_args()
    main(args)