Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions internal/pkg/pipeline/task/converter/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ Convert a single line to the SSTable which could be stored on s3 or via file. It
| `sheets` | array | all sheets | Optional array of sheet names to process. If not specified, all sheets are processed |
| `skip_rows` | int | `0` | Number of rows to skip from the beginning of each sheet (e.g., for header rows) |
| `skip_rows_by_sheet` | map[string]int | - | Per-sheet row skip overrides. Keys are sheet names, values are rows to skip. Takes precedence over `skip_rows` |
| `sanitize_headers` | bool | `false` | If true, normalizes header row values to lowercase, with runs of non-alphanumeric characters replaced by underscores. The first unskipped row is assumed to be the header row |

**Important:** The XLSX converter emits **one record per sheet**. Each record contains the sheet's data in CSV format, with the sheet name available in the record context under the key `xlsx_sheet_name`.

Expand Down Expand Up @@ -143,6 +144,18 @@ tasks:
# Each record will have xlsx_sheet_name in context
```

### Excel to CSV with sanitized headers:
```yaml
tasks:
- name: read_excel
type: file
path: report.xlsx
- name: convert_excel
type: converter
format: xlsx
sanitize_headers: true # "First Name" becomes "first_name", "Sales (USD)" becomes "sales_usd_"
```

### Excel to CSV with row skipping:
```yaml
tasks:
Expand Down
8 changes: 8 additions & 0 deletions internal/pkg/pipeline/task/converter/converter.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,19 @@ package converter

import (
"fmt"
"regexp"
"strings"

"github.com/patterninc/caterpillar/internal/pkg/pipeline/record"
"github.com/patterninc/caterpillar/internal/pkg/pipeline/task"
)

// columnNameRegex matches every maximal run of characters that are not
// ASCII letters or digits; each such run is collapsed into one underscore.
var columnNameRegex = regexp.MustCompile(`[^a-zA-Z0-9]+`)

// sanitizeColumnName normalizes a raw header cell into a column name safe
// for downstream use: runs of non-alphanumeric characters are replaced by a
// single underscore and the result is lowercased. Leading/trailing runs also
// become underscores, e.g. "Sales (USD)" -> "sales_usd_".
func sanitizeColumnName(name string) string {
	underscored := columnNameRegex.ReplaceAllString(name, "_")
	return strings.ToLower(underscored)
}

type converterOutput struct {
Data []byte
Metadata map[string]string
Expand Down
6 changes: 1 addition & 5 deletions internal/pkg/pipeline/task/converter/csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"encoding/json"
"fmt"
"log"
"regexp"
"strconv"
"strings"
)
Expand All @@ -21,9 +20,6 @@ type csv struct {
Columns []*csvColumn `yaml:"columns" json:"columns"`
}

// Pre-compile regex for column name sanitization
var columnNameRegex = regexp.MustCompile(`[^a-zA-Z0-9]+`)

func (c *csv) convert(data []byte, _ string) ([]converterOutput, error) {
// Initialize columns if not provided
if len(c.Columns) == 0 {
Expand Down Expand Up @@ -93,7 +89,7 @@ func (c *csv) initializeColumns(data []byte) error {
if c.SkipFirst {
// Use first row as column headers
for i, name := range firstRow {
sanitizedName := strings.ToLower(columnNameRegex.ReplaceAllString(name, "_"))
sanitizedName := sanitizeColumnName(name)
c.Columns[i] = &csvColumn{Name: sanitizedName}
}
// Keep SkipFirst as true so the convert function knows to skip this row
Expand Down
13 changes: 11 additions & 2 deletions internal/pkg/pipeline/task/converter/xlsx.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ type xlsx struct {
Sheets []string `yaml:"sheets,omitempty" json:"sheets,omitempty"`
SkipRows int `yaml:"skip_rows,omitempty" json:"skip_rows,omitempty"`
SkipRowsBySheet map[string]int `yaml:"skip_rows_by_sheet,omitempty" json:"skip_rows_by_sheet,omitempty"`
SanitizeHeaders bool `yaml:"sanitize_headers,omitempty" json:"sanitize_headers,omitempty"`
}

func (x *xlsx) convert(data []byte, _ string) ([]converterOutput, error) {
Expand All @@ -39,7 +40,7 @@ func (x *xlsx) convert(data []byte, _ string) ([]converterOutput, error) {
outputs := make([]converterOutput, 0, len(sheets))

for _, sheet := range sheets {
output, err := readSheet(reader, sheet, x.getRowsToSkip(sheet))
output, err := readSheet(reader, sheet, x.getRowsToSkip(sheet), x.SanitizeHeaders)
if err != nil {
return nil, err
}
Expand All @@ -50,7 +51,7 @@ func (x *xlsx) convert(data []byte, _ string) ([]converterOutput, error) {
return outputs, nil
}

func readSheet(reader *excelize.File, sheet string, rowsToSkip int) (converterOutput, error) {
func readSheet(reader *excelize.File, sheet string, rowsToSkip int, sanitizeHeaders bool) (converterOutput, error) {
// Create buffer for this sheet
var buff bytes.Buffer
writer := csvEncoder.NewWriter(&buff)
Expand All @@ -64,6 +65,7 @@ func readSheet(reader *excelize.File, sheet string, rowsToSkip int) (converterOu

// Write rows to buffer
i := 0
isHeaderRow := true
for rows.Next() {
if i < rowsToSkip {
i++
Expand All @@ -75,6 +77,13 @@ func readSheet(reader *excelize.File, sheet string, rowsToSkip int) (converterOu
return converterOutput{}, err
}

if sanitizeHeaders && isHeaderRow {
for j, col := range cols {
cols[j] = sanitizeColumnName(col)
}
isHeaderRow = false
Comment thread
divyanshu-tiwari marked this conversation as resolved.
}

if err := writer.Write(cols); err != nil {
return converterOutput{}, err
}
Expand Down
Loading