ggsql/src/execute/schema.rs at ec946f7b259d562fcc5b2b3a2c5cc3043a887cea · posit-dev/ggsql · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
//! Schema extraction, type inference, and min/max range computation.
//!
//! This module provides functions for extracting column types and computing
//! min/max ranges from queries. It uses a split approach:
//! 1. fetch_schema_types() - get dtypes only (before casting)
//! 2. Apply casting to queries
//! 3. complete_schema_ranges() - get min/max from cast queries

use crate::plot::{AestheticValue, ColumnInfo, Layer, ParameterValue, Schema};
use crate::{naming, DataFrame, Result};
use polars::prelude::DataType;

/// Simple type info tuple: (name, dtype, is_discrete)
///
/// - `name`: the column name.
/// - `dtype`: the Polars data type of the column.
/// - `is_discrete`: whether the column is treated as discrete. Within this
///   module the flag is set for string, boolean and categorical dtypes.
pub type TypeInfo = (String, DataType, bool);

/// Build SQL query to compute min and max for all columns
///
/// Generates a query that returns two rows:
/// - Row 0: MIN of each column
/// - Row 1: MAX of each column
///
/// `source_query` is wrapped in a `__ggsql_source__` CTE and each entry of
/// `column_names` is emitted as a double-quoted identifier. Embedded `"`
/// characters in a column name are escaped by doubling them, so unusual
/// column names cannot break out of the quoted identifier.
///
/// `column_names` is expected to be non-empty; an empty slice yields an empty
/// select list, which is not valid SQL.
///
/// NOTE(review): the row order relies on the engine returning the `UNION ALL`
/// branches in the order they are written - confirm for each supported backend.
pub fn build_minmax_query(source_query: &str, column_names: &[&str]) -> String {
    // Renders `FUNC("col") AS "col", ...` for every requested column.
    let aggregate_list = |func: &str| -> String {
        column_names
            .iter()
            .map(|name| {
                // Standard SQL escapes a double quote inside a quoted
                // identifier by doubling it.
                let ident = name.replace('"', "\"\"");
                format!("{}(\"{}\") AS \"{}\"", func, ident, ident)
            })
            .collect::<Vec<String>>()
            .join(", ")
    };

    format!(
        "WITH __ggsql_source__ AS ({}) SELECT {} FROM __ggsql_source__ UNION ALL SELECT {} FROM __ggsql_source__",
        source_query,
        aggregate_list("MIN"),
        aggregate_list("MAX")
    )
}

/// Read a single cell from `df` and convert it to an `ArrayElement`.
///
/// Returns `None` when the column does not exist, when `row` is past the end
/// of the column, when the cell is null, or when the column's dtype is not
/// one of the handled types below.
///
/// All integer and float dtypes are widened to `f64` and returned as
/// `ArrayElement::Number`; booleans and strings map to their matching
/// variants; temporal dtypes are returned as their underlying physical
/// integer, also as `ArrayElement::Number`.
pub fn extract_series_value(
    df: &DataFrame,
    column: &str,
    row: usize,
) -> Option<crate::plot::ArrayElement> {
    use crate::plot::ArrayElement;

    let found = df.column(column).ok()?;
    let series = found.as_materialized_series();

    // Guard against out-of-range access before touching the typed arrays.
    if series.len() <= row {
        return None;
    }

    // Each arm bails out with `None` (via `?`) if the typed accessor fails
    // or the cell is null.
    let element = match series.dtype() {
        DataType::Int8 => ArrayElement::Number(series.i8().ok()?.get(row)? as f64),
        DataType::Int16 => ArrayElement::Number(series.i16().ok()?.get(row)? as f64),
        DataType::Int32 => ArrayElement::Number(series.i32().ok()?.get(row)? as f64),
        DataType::Int64 => ArrayElement::Number(series.i64().ok()?.get(row)? as f64),
        DataType::UInt8 => ArrayElement::Number(series.u8().ok()?.get(row)? as f64),
        DataType::UInt16 => ArrayElement::Number(series.u16().ok()?.get(row)? as f64),
        DataType::UInt32 => ArrayElement::Number(series.u32().ok()?.get(row)? as f64),
        DataType::UInt64 => ArrayElement::Number(series.u64().ok()?.get(row)? as f64),
        DataType::Float32 => ArrayElement::Number(series.f32().ok()?.get(row)? as f64),
        DataType::Float64 => ArrayElement::Number(series.f64().ok()?.get(row)?),
        DataType::Boolean => ArrayElement::Boolean(series.bool().ok()?.get(row)?),
        DataType::String => ArrayElement::String(series.str().ok()?.get(row)?.to_string()),
        // Numeric days since epoch (for range computation).
        DataType::Date => ArrayElement::Number(series.date().ok()?.physical().get(row)? as f64),
        // Physical epoch offset (for range computation); presumably
        // microseconds, but the value is in the column's own time unit -
        // TODO confirm the backend always yields microsecond datetimes.
        DataType::Datetime(_, _) => {
            ArrayElement::Number(series.datetime().ok()?.physical().get(row)? as f64)
        }
        // Numeric nanoseconds since midnight (for range computation).
        DataType::Time => ArrayElement::Number(series.time().ok()?.physical().get(row)? as f64),
        _ => return None,
    };

    Some(element)
}

/// Fetch only column types (no min/max) from a query.
///
/// Wraps `query` in `SELECT * FROM (...) AS <alias> LIMIT 1` so the schema can
/// be read while minimally reading data, then returns one
/// `(name, dtype, is_discrete)` tuple per column of the result.
///
/// A column is flagged as discrete when its dtype is string, boolean or
/// categorical.
///
/// This is the first phase of the split schema extraction approach:
/// 1. fetch_schema_types() - get dtypes only (before casting)
/// 2. Apply casting to queries
/// 3. complete_schema_ranges() - get min/max from cast queries
///
/// # Errors
///
/// Propagates any error returned by `execute_query`.
pub fn fetch_schema_types<F>(query: &str, execute_query: &F) -> Result<Vec<TypeInfo>>
where
    F: Fn(&str) -> Result<DataFrame>,
{
    let schema_query = format!(
        "SELECT * FROM ({}) AS {} LIMIT 1",
        query,
        naming::SCHEMA_ALIAS
    );
    let schema_df = execute_query(&schema_query)?;

    // Build each tuple directly in `TypeInfo` field order.
    let type_info: Vec<TypeInfo> = schema_df
        .get_columns()
        .iter()
        .map(|col| {
            let dtype = col.dtype().clone();
            let is_discrete =
                matches!(dtype, DataType::String | DataType::Boolean) || dtype.is_categorical();
            (col.name().to_string(), dtype, is_discrete)
        })
        .collect();

    Ok(type_info)
}

/// Turn pre-computed type info into a full schema by adding min/max ranges.
///
/// Runs a single min/max query (see `build_minmax_query`) over `query`, which
/// may already have casts applied, and pairs each entry of `type_info` with
/// the values found in row 0 (min) and row 1 (max) of the result. A value that
/// cannot be extracted is stored as `None`.
///
/// Returns an empty schema without executing anything when `type_info` is
/// empty. Errors from `execute_query` are propagated.
pub fn complete_schema_ranges<F>(
    query: &str,
    type_info: &[TypeInfo],
    execute_query: &F,
) -> Result<Schema>
where
    F: Fn(&str) -> Result<DataFrame>,
{
    if type_info.is_empty() {
        return Ok(Vec::new());
    }

    // Collect the column names the range query has to aggregate.
    let mut names: Vec<&str> = Vec::with_capacity(type_info.len());
    for (name, _, _) in type_info {
        names.push(name.as_str());
    }

    let range_sql = build_minmax_query(query, &names);
    let range_df = execute_query(&range_sql)?;

    // Row 0 carries the minimums, row 1 the maximums.
    let mut schema: Schema = Vec::with_capacity(type_info.len());
    for (name, dtype, is_discrete) in type_info {
        schema.push(ColumnInfo {
            name: name.clone(),
            dtype: dtype.clone(),
            is_discrete: *is_discrete,
            min: extract_series_value(&range_df, name, 0),
            max: extract_series_value(&range_df, name, 1),
        });
    }

    Ok(schema)
}

/// Build a range-less schema from type info.
///
/// Every entry is copied into a `ColumnInfo` whose `min` and `max` are `None`;
/// useful when a `Schema` is required before ranges have been computed.
pub fn type_info_to_schema(type_info: &[TypeInfo]) -> Schema {
    let mut schema: Schema = Vec::with_capacity(type_info.len());
    for (name, dtype, is_discrete) in type_info {
        schema.push(ColumnInfo {
            name: name.clone(),
            dtype: dtype.clone(),
            is_discrete: *is_discrete,
            min: None,
            max: None,
        });
    }
    schema
}

/// Register literal (constant) aesthetic mappings in each layer's type info.
///
/// A mapping such as `'blue' AS fill` produces a column whose type is known
/// from the AST alone, so no database round-trip is needed: strings become
/// discrete `String` columns, numbers continuous `Float64` columns and
/// booleans discrete `Boolean` columns. The entry is named with the
/// aesthetic's prefixed column name and is only appended when no entry with
/// that name exists yet.
///
/// `layers` and `layer_type_info` are walked in lockstep; surplus elements on
/// either side are ignored.
///
/// Intended to run after global mappings are merged and color is split, so
/// all literal mappings are already in place.
///
/// # Panics
///
/// Panics if a literal mapping holds an array or null, which the grammar is
/// expected to rule out.
pub fn add_literal_columns_to_type_info(layers: &[Layer], layer_type_info: &mut [Vec<TypeInfo>]) {
    for (layer, type_info) in layers.iter().zip(layer_type_info.iter_mut()) {
        for (aesthetic, value) in &layer.mappings.aesthetics {
            // Only literal mappings contribute synthetic columns.
            let lit = match value {
                AestheticValue::Literal(lit) => lit,
                _ => continue,
            };

            let (dtype, is_discrete) = match lit {
                ParameterValue::String(_) => (DataType::String, true),
                ParameterValue::Number(_) => (DataType::Float64, false),
                ParameterValue::Boolean(_) => (DataType::Boolean, true),
                ParameterValue::Array(_) | ParameterValue::Null => unreachable!(
                    "Grammar prevents arrays and null in literal aesthetic mappings"
                ),
            };

            let col_name = naming::aesthetic_column(aesthetic);

            // Keep any entry that is already registered under this name.
            let already_known = type_info.iter().any(|(name, _, _)| name == &col_name);
            if !already_known {
                type_info.push((col_name, dtype, is_discrete));
            }
        }
    }
}

/// Derive a schema keyed by prefixed aesthetic column names for one layer.
///
/// For every aesthetic mapping on `layer` one `ColumnInfo` is produced, named
/// with the prefixed aesthetic column name (e.g., `__ggsql_aes_x__`):
///
/// - Column mappings copy dtype, discreteness and min/max from the matching
///   entry in `schema`. The lookup tries the prefixed aesthetic name first and
///   falls back to the original column name. When neither is present the
///   column is recorded with an unknown dtype, non-discrete, and no range.
/// - Literal mappings get a dtype derived from the literal (`String`,
///   `Float64` or `Boolean`) and no range; strings and booleans are discrete.
///
/// Columns listed in `layer.partition_by` are then appended unchanged from
/// `schema`, unless an entry with the same name already exists or the column
/// is missing from `schema`.
///
/// Stat transforms use this schema to look up column types by the prefixed
/// names that appear in the query after `build_layer_select_list`.
///
/// # Panics
///
/// Panics if a literal mapping holds an array or null, which the grammar is
/// expected to rule out.
pub fn build_aesthetic_schema(layer: &Layer, schema: &Schema) -> Schema {
    let mut result: Schema = Vec::new();

    for (aesthetic, value) in &layer.mappings.aesthetics {
        let aes_col_name = naming::aesthetic_column(aesthetic);

        let info = match value {
            AestheticValue::Column { name, .. } => {
                // Prefer the aesthetic-prefixed name; fall back to the original
                // column name for schemas that still carry unprefixed names.
                let source = match schema.iter().find(|c| c.name == aes_col_name) {
                    Some(hit) => Some(hit),
                    None => schema.iter().find(|c| c.name == *name),
                };

                match source {
                    Some(original_col) => ColumnInfo {
                        name: aes_col_name,
                        dtype: original_col.dtype.clone(),
                        is_discrete: original_col.is_discrete,
                        min: original_col.min.clone(),
                        max: original_col.max.clone(),
                    },
                    // Column not in schema - record it with an unknown type.
                    None => ColumnInfo {
                        name: aes_col_name,
                        dtype: DataType::Unknown(Default::default()),
                        is_discrete: false,
                        min: None,
                        max: None,
                    },
                }
            }
            AestheticValue::Literal(lit) => {
                // Literals become columns whose type follows the literal kind.
                let (dtype, is_discrete) = match lit {
                    ParameterValue::String(_) => (DataType::String, true),
                    ParameterValue::Number(_) => (DataType::Float64, false),
                    ParameterValue::Boolean(_) => (DataType::Boolean, true),
                    ParameterValue::Array(_) | ParameterValue::Null => unreachable!(
                        "Grammar prevents arrays and null in literal aesthetic mappings"
                    ),
                };
                ColumnInfo {
                    name: aes_col_name,
                    dtype,
                    is_discrete,
                    min: None,
                    max: None,
                }
            }
        };

        result.push(info);
    }

    // Carry partition_by columns over with their original types.
    for col in &layer.partition_by {
        let already_present = result.iter().any(|c| c.name == *col);
        if already_present {
            continue;
        }
        if let Some(original_col) = schema.iter().find(|c| c.name == *col) {
            result.push(original_col.clone());
        }
    }

    result
}