Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions internal/pkg/pipeline/task/converter/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ Convert a single line to the SSTable which could be stored on s3 or via file. It
| `sheets` | array | all sheets | Optional array of sheet names to process. If not specified, all sheets are processed |
| `skip_rows` | int | `0` | Number of rows to skip from the beginning of each sheet (e.g., for header rows) |
| `skip_rows_by_sheet` | map[string]int | - | Per-sheet row skip overrides. Keys are sheet names, values are rows to skip. Takes precedence over `skip_rows` |
| `sanitize_headers` | bool | `false` | If true, normalizes header row values to lowercase, with runs of non-alphanumeric characters replaced by underscores. The first unskipped row is assumed to be the header row |

**Important:** The XLSX converter emits **one record per sheet**. Each record contains the sheet's data in CSV format, with the sheet name available in the record context under the key `xlsx_sheet_name`.

Expand Down Expand Up @@ -143,6 +144,18 @@ tasks:
# Each record will have xlsx_sheet_name in context
```

### Excel to CSV with sanitized headers:
```yaml
tasks:
- name: read_excel
type: file
path: report.xlsx
- name: convert_excel
type: converter
format: xlsx
sanitize_headers: true # "First Name" becomes "first_name", "Sales (USD)" becomes "sales_usd_"
```

### Excel to CSV with row skipping:
```yaml
tasks:
Expand Down
8 changes: 8 additions & 0 deletions internal/pkg/pipeline/task/converter/converter.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,19 @@ package converter

import (
"fmt"
"regexp"
"strings"

"github.com/patterninc/caterpillar/internal/pkg/pipeline/record"
"github.com/patterninc/caterpillar/internal/pkg/pipeline/task"
)

// columnNameRegex matches every maximal run of characters that are not
// ASCII letters or digits; each such run is collapsed into one underscore.
var columnNameRegex = regexp.MustCompile(`[^a-zA-Z0-9]+`)

// sanitizeColumnName normalizes a raw header cell into a column name safe
// for downstream use: runs of non-alphanumeric characters are replaced by a
// single underscore and the result is lowercased. Leading/trailing runs also
// become underscores, e.g. "Sales (USD)" -> "sales_usd_".
func sanitizeColumnName(name string) string {
	underscored := columnNameRegex.ReplaceAllString(name, "_")
	return strings.ToLower(underscored)
}

type converterOutput struct {
Data []byte
Metadata map[string]string
Expand Down
6 changes: 1 addition & 5 deletions internal/pkg/pipeline/task/converter/csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"encoding/json"
"fmt"
"log"
"regexp"
"strconv"
"strings"
)
Expand All @@ -21,9 +20,6 @@ type csv struct {
Columns []*csvColumn `yaml:"columns" json:"columns"`
}

// Pre-compile regex for column name sanitization
var columnNameRegex = regexp.MustCompile(`[^a-zA-Z0-9]+`)

func (c *csv) convert(data []byte, _ string) ([]converterOutput, error) {
// Initialize columns if not provided
if len(c.Columns) == 0 {
Expand Down Expand Up @@ -93,7 +89,7 @@ func (c *csv) initializeColumns(data []byte) error {
if c.SkipFirst {
// Use first row as column headers
for i, name := range firstRow {
sanitizedName := strings.ToLower(columnNameRegex.ReplaceAllString(name, "_"))
sanitizedName := sanitizeColumnName(name)
c.Columns[i] = &csvColumn{Name: sanitizedName}
}
// Keep SkipFirst as true so the convert function knows to skip this row
Expand Down
13 changes: 11 additions & 2 deletions internal/pkg/pipeline/task/converter/xlsx.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ type xlsx struct {
Sheets []string `yaml:"sheets,omitempty" json:"sheets,omitempty"`
SkipRows int `yaml:"skip_rows,omitempty" json:"skip_rows,omitempty"`
SkipRowsBySheet map[string]int `yaml:"skip_rows_by_sheet,omitempty" json:"skip_rows_by_sheet,omitempty"`
SanitizeHeaders bool `yaml:"sanitize_headers,omitempty" json:"sanitize_headers,omitempty"`
}

func (x *xlsx) convert(data []byte, _ string) ([]converterOutput, error) {
Expand All @@ -39,7 +40,7 @@ func (x *xlsx) convert(data []byte, _ string) ([]converterOutput, error) {
outputs := make([]converterOutput, 0, len(sheets))

for _, sheet := range sheets {
output, err := readSheet(reader, sheet, x.getRowsToSkip(sheet))
output, err := readSheet(reader, sheet, x.getRowsToSkip(sheet), x.SanitizeHeaders)
if err != nil {
return nil, err
}
Expand All @@ -50,7 +51,7 @@ func (x *xlsx) convert(data []byte, _ string) ([]converterOutput, error) {
return outputs, nil
}

func readSheet(reader *excelize.File, sheet string, rowsToSkip int) (converterOutput, error) {
func readSheet(reader *excelize.File, sheet string, rowsToSkip int, sanitizeHeaders bool) (converterOutput, error) {
// Create buffer for this sheet
var buff bytes.Buffer
writer := csvEncoder.NewWriter(&buff)
Expand All @@ -64,6 +65,7 @@ func readSheet(reader *excelize.File, sheet string, rowsToSkip int) (converterOu

// Write rows to buffer
i := 0
isHeaderRow := true
for rows.Next() {
if i < rowsToSkip {
i++
Expand All @@ -75,6 +77,13 @@ func readSheet(reader *excelize.File, sheet string, rowsToSkip int) (converterOu
return converterOutput{}, err
}

if sanitizeHeaders && isHeaderRow {
for j, col := range cols {
cols[j] = sanitizeColumnName(col)
}
isHeaderRow = false
Comment thread
divyanshu-tiwari marked this conversation as resolved.
}

if err := writer.Write(cols); err != nil {
return converterOutput{}, err
}
Expand Down
Loading