fileprep/prep.go at main · nao1215/fileprep · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
package fileprep

import (
	"regexp"
	"strconv"
	"strings"
	"unicode"

	"golang.org/x/text/unicode/norm"
)

// String constants for coercion results and URL schemes.
const (
	// boolTrueValue is what coercePreprocessor emits for truthy "bool" input;
	// it is also one of the accepted truthy spellings.
	boolTrueValue   = "true"
	// boolFalseValue is what coercePreprocessor emits for falsy "bool" input;
	// it is also one of the accepted falsy spellings.
	boolFalseValue  = "false"
	// httpsSchemeType is the scheme for which fixSchemePreprocessor rewrites
	// an existing "http://" prefix instead of leaving it alone.
	httpsSchemeType = "https"
)

// Preprocessor defines the interface for preprocessing values.
// A Preprocessor turns one string value into another and can report its own
// name. Several of them can be chained with the preprocessors slice type.
type Preprocessor interface {
	// Process applies preprocessing to the value and returns the result.
	Process(value string) string
	// Name returns the name of the preprocessor for error reporting.
	Name() string
}

// trimPreprocessor removes leading and trailing whitespace.
// It carries no configuration.
type trimPreprocessor struct{}

// newTrimPreprocessor creates a new trim preprocessor.
func newTrimPreprocessor() *trimPreprocessor {
	return &trimPreprocessor{}
}

// Process returns value with leading and trailing white space removed.
// It delegates to strings.TrimSpace, so "white space" means every character
// Unicode defines as white space, not only ASCII space, tab, CR and LF.
func (p *trimPreprocessor) Process(value string) string {
	return strings.TrimSpace(value)
}

// Name implements Preprocessor. It returns trimTagValue, the name reported
// for this preprocessor.
func (p *trimPreprocessor) Name() string {
	return trimTagValue
}

// ltrimPreprocessor removes leading whitespace.
// It carries no configuration.
type ltrimPreprocessor struct{}

// newLtrimPreprocessor creates a new left trim preprocessor.
func newLtrimPreprocessor() *ltrimPreprocessor {
	return &ltrimPreprocessor{}
}

// Process returns value with all leading white space removed; trailing white
// space is left in place.
//
// White space is whatever unicode.IsSpace reports as space. That is the same
// definition strings.TrimSpace applies, so trimming the left side here removes
// exactly what trimPreprocessor would remove from the left (including \v, \f,
// NBSP and the ideographic space), not just " \t\n\r".
func (p *ltrimPreprocessor) Process(value string) string {
	return strings.TrimLeftFunc(value, unicode.IsSpace)
}

// Name implements Preprocessor. It returns ltrimTagValue, the name reported
// for this preprocessor.
func (p *ltrimPreprocessor) Name() string {
	return ltrimTagValue
}

// rtrimPreprocessor removes trailing whitespace.
// It carries no configuration.
type rtrimPreprocessor struct{}

// newRtrimPreprocessor creates a new right trim preprocessor.
func newRtrimPreprocessor() *rtrimPreprocessor {
	return &rtrimPreprocessor{}
}

// Process returns value with all trailing white space removed; leading white
// space is left in place.
//
// White space is whatever unicode.IsSpace reports as space. That is the same
// definition strings.TrimSpace applies, so trimming the right side here
// removes exactly what trimPreprocessor would remove from the right (including
// \v, \f, NBSP and the ideographic space), not just " \t\n\r".
func (p *rtrimPreprocessor) Process(value string) string {
	return strings.TrimRightFunc(value, unicode.IsSpace)
}

// Name implements Preprocessor. It returns rtrimTagValue, the name reported
// for this preprocessor.
func (p *rtrimPreprocessor) Name() string {
	return rtrimTagValue
}

// lowercasePreprocessor converts value to lowercase.
// It carries no configuration.
type lowercasePreprocessor struct{}

// newLowercasePreprocessor creates a new lowercase preprocessor.
func newLowercasePreprocessor() *lowercasePreprocessor {
	return &lowercasePreprocessor{}
}

// Process returns value with its letters mapped to lower case by
// strings.ToLower. Characters without a lower-case form are left as they are.
func (p *lowercasePreprocessor) Process(value string) string {
	return strings.ToLower(value)
}

// Name implements Preprocessor. It returns lowercaseTagValue, the name
// reported for this preprocessor.
func (p *lowercasePreprocessor) Name() string {
	return lowercaseTagValue
}

// uppercasePreprocessor converts value to uppercase.
// It carries no configuration.
type uppercasePreprocessor struct{}

// newUppercasePreprocessor creates a new uppercase preprocessor.
func newUppercasePreprocessor() *uppercasePreprocessor {
	return &uppercasePreprocessor{}
}

// Process returns value with its letters mapped to upper case by
// strings.ToUpper. Characters without an upper-case form are left as they are.
func (p *uppercasePreprocessor) Process(value string) string {
	return strings.ToUpper(value)
}

// Name implements Preprocessor. It returns uppercaseTagValue, the name
// reported for this preprocessor.
func (p *uppercasePreprocessor) Name() string {
	return uppercaseTagValue
}

// defaultPreprocessor substitutes a fixed fallback for blank input.
type defaultPreprocessor struct {
	// defaultValue is returned in place of a blank input.
	defaultValue string
}

// newDefaultPreprocessor builds a defaultPreprocessor that falls back to
// defaultValue.
func newDefaultPreprocessor(defaultValue string) *defaultPreprocessor {
	dp := new(defaultPreprocessor)
	dp.defaultValue = defaultValue
	return dp
}

// Process hands value back untouched when it contains anything other than
// white space. An empty or white-space-only value is replaced by the
// configured default.
func (p *defaultPreprocessor) Process(value string) string {
	if len(strings.TrimSpace(value)) > 0 {
		return value
	}
	return p.defaultValue
}

// Name implements Preprocessor. It returns defaultTagValue, the name reported
// for this preprocessor.
func (p *defaultPreprocessor) Name() string {
	return defaultTagValue
}

// preprocessors is an ordered chain of Preprocessor values.
type preprocessors []Preprocessor

// Process threads value through every preprocessor in slice order, feeding
// each one the output of its predecessor, and returns the final output.
// A nil or empty chain returns value unchanged.
func (ps preprocessors) Process(value string) string {
	current := value
	for i := 0; i < len(ps); i++ {
		current = ps[i].Process(current)
	}
	return current
}

// =============================================================================
// String Transformation Preprocessors
// =============================================================================

// replacePreprocessor replaces occurrences of old string with new string.
type replacePreprocessor struct {
	// oldStr is the literal substring to search for (not a pattern).
	oldStr string
	// newStr is the text substituted for each occurrence of oldStr.
	newStr string
}

// newReplacePreprocessor creates a new replace preprocessor that substitutes
// newStr for every occurrence of oldStr.
func newReplacePreprocessor(oldStr, newStr string) *replacePreprocessor {
	return &replacePreprocessor{oldStr: oldStr, newStr: newStr}
}

// Process replaces all non-overlapping occurrences of oldStr in value with
// newStr and returns the result.
//
// NOTE: the work is done by strings.ReplaceAll, which treats an empty oldStr
// as matching at the start of the string and after every UTF-8 sequence, so
// an empty oldStr inserts newStr between all characters.
func (p *replacePreprocessor) Process(value string) string {
	return strings.ReplaceAll(value, p.oldStr, p.newStr)
}

// Name implements Preprocessor. It returns replaceTagValue, the name reported
// for this preprocessor.
func (p *replacePreprocessor) Name() string {
	return replaceTagValue
}

// prefixPreprocessor prepends a string to the value.
type prefixPreprocessor struct {
	// prefix is the text placed in front of every value.
	prefix string
}

// newPrefixPreprocessor creates a new prefix preprocessor that prepends prefix.
func newPrefixPreprocessor(prefix string) *prefixPreprocessor {
	return &prefixPreprocessor{prefix: prefix}
}

// Process returns prefix followed by value. The prefix is added
// unconditionally: there is no check for an empty value or for a value that
// already starts with the prefix.
func (p *prefixPreprocessor) Process(value string) string {
	return p.prefix + value
}

// Name implements Preprocessor. It returns prefixTagValue, the name reported
// for this preprocessor.
func (p *prefixPreprocessor) Name() string {
	return prefixTagValue
}

// suffixPreprocessor appends a string to the value.
type suffixPreprocessor struct {
	// suffix is the text placed after every value.
	suffix string
}

// newSuffixPreprocessor creates a new suffix preprocessor that appends suffix.
func newSuffixPreprocessor(suffix string) *suffixPreprocessor {
	return &suffixPreprocessor{suffix: suffix}
}

// Process returns value followed by suffix. The suffix is added
// unconditionally: there is no check for an empty value or for a value that
// already ends with the suffix.
func (p *suffixPreprocessor) Process(value string) string {
	return value + p.suffix
}

// Name implements Preprocessor. It returns suffixTagValue, the name reported
// for this preprocessor.
func (p *suffixPreprocessor) Name() string {
	return suffixTagValue
}

// truncatePreprocessor limits the value to a maximum number of characters.
type truncatePreprocessor struct {
	// maxLen is the maximum number of runes (not bytes) to keep.
	maxLen int
}

// newTruncatePreprocessor creates a new truncate preprocessor that keeps at
// most maxLen runes. A negative maxLen is accepted and behaves like zero.
func newTruncatePreprocessor(maxLen int) *truncatePreprocessor {
	return &truncatePreprocessor{maxLen: maxLen}
}

// Process returns value cut down to at most maxLen runes. Values that are
// already short enough are returned unchanged.
//
// Length is counted in runes so multi-byte characters are never split.
// A negative maxLen is clamped to zero (yielding ""); without the clamp it
// would be used as a slice bound and panic.
func (p *truncatePreprocessor) Process(value string) string {
	limit := max(p.maxLen, 0)
	runes := []rune(value)
	if len(runes) <= limit {
		return value
	}
	return string(runes[:limit])
}

// Name implements Preprocessor. It returns truncateTagValue, the name reported
// for this preprocessor.
func (p *truncatePreprocessor) Name() string {
	return truncateTagValue
}

// stripHTMLTagPattern matches anything shaped like an HTML tag: a '<', any run
// of characters other than '>', and the closing '>'. It is compiled once and
// shared by every stripHTMLPreprocessor; a *regexp.Regexp is safe for
// concurrent use.
var stripHTMLTagPattern = regexp.MustCompile(`<[^>]*>`)

// stripHTMLPreprocessor removes HTML tags from the value.
type stripHTMLPreprocessor struct {
	// re is the tag-matching pattern. A nil re (zero-value struct) makes
	// Process fall back to stripHTMLTagPattern.
	re *regexp.Regexp
}

// newStripHTMLPreprocessor creates a new strip HTML preprocessor backed by the
// shared tag pattern, so constructing one does not recompile the regexp.
func newStripHTMLPreprocessor() *stripHTMLPreprocessor {
	return &stripHTMLPreprocessor{
		re: stripHTMLTagPattern,
	}
}

// Process deletes every "<...>" span from value and returns what is left.
//
// This is pattern matching, not HTML parsing: entities such as "&amp;" are
// left as they are, and plain text like "a < b and c > d" loses the part
// between the angle brackets.
func (p *stripHTMLPreprocessor) Process(value string) string {
	re := p.re
	if re == nil {
		// Zero-value preprocessor: use the shared pattern instead of panicking.
		re = stripHTMLTagPattern
	}
	return re.ReplaceAllString(value, "")
}

// Name implements Preprocessor. It returns stripHTMLTagValue, the name
// reported for this preprocessor.
func (p *stripHTMLPreprocessor) Name() string {
	return stripHTMLTagValue
}

// stripNewlinePreprocessor deletes carriage returns and line feeds from the
// value.
type stripNewlinePreprocessor struct{}

// newStripNewlinePreprocessor returns a ready-to-use stripNewlinePreprocessor.
func newStripNewlinePreprocessor() *stripNewlinePreprocessor {
	return new(stripNewlinePreprocessor)
}

// Process returns value with every '\r' and '\n' removed.
// A value containing neither character is returned as-is without building a
// new string; otherwise the result is assembled in a single pre-sized buffer.
func (p *stripNewlinePreprocessor) Process(value string) string {
	// Fast path: nothing to strip.
	if strings.IndexAny(value, "\r\n") < 0 {
		return value
	}

	var out strings.Builder
	out.Grow(len(value))
	for _, ch := range value {
		switch ch {
		case '\r', '\n':
			// dropped
		default:
			out.WriteRune(ch)
		}
	}
	return out.String()
}

// Name implements Preprocessor. It returns stripNewlineTagValue, the name
// reported for this preprocessor.
func (p *stripNewlinePreprocessor) Name() string {
	return stripNewlineTagValue
}

// collapseSpacePreprocessor squeezes runs of ASCII whitespace down to a single
// space.
type collapseSpacePreprocessor struct{}

// newCollapseSpacePreprocessor returns a ready-to-use
// collapseSpacePreprocessor.
func newCollapseSpacePreprocessor() *collapseSpacePreprocessor {
	return new(collapseSpacePreprocessor)
}

// Process rewrites each maximal run of ' ', '\t', '\n' and '\r' in value as
// exactly one ' '. Runs at the start or end are collapsed too, not removed,
// and other Unicode space characters are treated as ordinary text.
// It is a single hand-written pass; no regexp is involved.
func (p *collapseSpacePreprocessor) Process(value string) string {
	if len(value) == 0 {
		return value
	}

	var out strings.Builder
	out.Grow(len(value))

	prevWasSpace := false
	for _, ch := range value {
		switch ch {
		case ' ', '\t', '\n', '\r':
			// Only the first whitespace character of a run emits a space.
			if prevWasSpace {
				continue
			}
			out.WriteByte(' ')
			prevWasSpace = true
		default:
			out.WriteRune(ch)
			prevWasSpace = false
		}
	}

	return out.String()
}

// Name implements Preprocessor. It returns collapseSpaceTagValue, the name
// reported for this preprocessor.
func (p *collapseSpacePreprocessor) Name() string {
	return collapseSpaceTagValue
}

// =============================================================================
// Character Filtering Preprocessors
// =============================================================================

// removeDigitsPreprocessor drops every digit from the value.
type removeDigitsPreprocessor struct{}

// newRemoveDigitsPreprocessor returns a ready-to-use removeDigitsPreprocessor.
func newRemoveDigitsPreprocessor() *removeDigitsPreprocessor {
	return new(removeDigitsPreprocessor)
}

// Process copies value rune by rune, skipping each rune for which
// unicode.IsDigit reports true. That covers decimal digits of any script
// (for example full-width digits), not only ASCII 0-9.
func (p *removeDigitsPreprocessor) Process(value string) string {
	var kept strings.Builder
	kept.Grow(len(value))
	for _, ch := range value {
		if unicode.IsDigit(ch) {
			continue
		}
		kept.WriteRune(ch)
	}
	return kept.String()
}

// Name implements Preprocessor. It returns removeDigitsTagValue, the name
// reported for this preprocessor.
func (p *removeDigitsPreprocessor) Name() string {
	return removeDigitsTagValue
}

// removeAlphaPreprocessor drops every letter from the value.
type removeAlphaPreprocessor struct{}

// newRemoveAlphaPreprocessor returns a ready-to-use removeAlphaPreprocessor.
func newRemoveAlphaPreprocessor() *removeAlphaPreprocessor {
	return new(removeAlphaPreprocessor)
}

// Process copies value rune by rune, skipping each rune for which
// unicode.IsLetter reports true. Letters of every script are removed, not only
// ASCII a-z/A-Z; digits, punctuation and spaces are kept.
func (p *removeAlphaPreprocessor) Process(value string) string {
	var kept strings.Builder
	kept.Grow(len(value))
	for _, ch := range value {
		if unicode.IsLetter(ch) {
			continue
		}
		kept.WriteRune(ch)
	}
	return kept.String()
}

// Name implements Preprocessor. It returns removeAlphaTagValue, the name
// reported for this preprocessor.
func (p *removeAlphaPreprocessor) Name() string {
	return removeAlphaTagValue
}

// keepDigitsPreprocessor discards everything except digits.
type keepDigitsPreprocessor struct{}

// newKeepDigitsPreprocessor returns a ready-to-use keepDigitsPreprocessor.
func newKeepDigitsPreprocessor() *keepDigitsPreprocessor {
	return new(keepDigitsPreprocessor)
}

// Process copies only the runes of value for which unicode.IsDigit reports
// true, preserving their order. Decimal digits of any script qualify; signs,
// decimal points and separators are discarded.
func (p *keepDigitsPreprocessor) Process(value string) string {
	var kept strings.Builder
	kept.Grow(len(value))
	for _, ch := range value {
		if !unicode.IsDigit(ch) {
			continue
		}
		kept.WriteRune(ch)
	}
	return kept.String()
}

// Name implements Preprocessor. It returns keepDigitsTagValue, the name
// reported for this preprocessor.
func (p *keepDigitsPreprocessor) Name() string {
	return keepDigitsTagValue
}

// keepAlphaPreprocessor discards everything except letters.
type keepAlphaPreprocessor struct{}

// newKeepAlphaPreprocessor returns a ready-to-use keepAlphaPreprocessor.
func newKeepAlphaPreprocessor() *keepAlphaPreprocessor {
	return new(keepAlphaPreprocessor)
}

// Process copies only the runes of value for which unicode.IsLetter reports
// true, preserving their order. Letters of every script qualify; digits,
// spaces and punctuation are discarded.
func (p *keepAlphaPreprocessor) Process(value string) string {
	var kept strings.Builder
	kept.Grow(len(value))
	for _, ch := range value {
		if !unicode.IsLetter(ch) {
			continue
		}
		kept.WriteRune(ch)
	}
	return kept.String()
}

// Name implements Preprocessor. It returns keepAlphaTagValue, the name
// reported for this preprocessor.
func (p *keepAlphaPreprocessor) Name() string {
	return keepAlphaTagValue
}

// trimSetPreprocessor removes specified characters from both ends.
type trimSetPreprocessor struct {
	// cutset is the set of characters to strip; each character counts on its
	// own, the string is not matched as a whole.
	cutset string
}

// newTrimSetPreprocessor creates a new trim set preprocessor that strips the
// characters in cutset.
func newTrimSetPreprocessor(cutset string) *trimSetPreprocessor {
	return &trimSetPreprocessor{cutset: cutset}
}

// Process returns value with every leading and trailing character that
// appears in cutset removed. Characters in the middle of value are untouched.
//
// It delegates to strings.Trim, so a cutset of "ab" strips any mix of 'a' and
// 'b' from the ends rather than the literal substring "ab".
func (p *trimSetPreprocessor) Process(value string) string {
	return strings.Trim(value, p.cutset)
}

// Name implements Preprocessor. It returns trimSetTagValue, the name reported
// for this preprocessor.
func (p *trimSetPreprocessor) Name() string {
	return trimSetTagValue
}

// =============================================================================
// Padding Preprocessors
// =============================================================================

// padLeftPreprocessor pads the value on the left up to a target width.
type padLeftPreprocessor struct {
	// length is the target width in runes.
	length int
	// padChar is the rune repeated to fill the missing width.
	padChar rune
}

// newPadLeftPreprocessor builds a padLeftPreprocessor that pads to length
// runes using padChar.
func newPadLeftPreprocessor(length int, padChar rune) *padLeftPreprocessor {
	pp := new(padLeftPreprocessor)
	pp.length = length
	pp.padChar = padChar
	return pp
}

// Process prefixes value with as many copies of padChar as it takes to reach
// length runes. Width is measured in runes, not bytes. A value that is
// already length runes or longer is returned unchanged (never truncated).
func (p *padLeftPreprocessor) Process(value string) string {
	width := 0
	for range value {
		width++
	}

	missing := p.length - width
	if missing <= 0 {
		return value
	}
	return strings.Repeat(string(p.padChar), missing) + value
}

// Name implements Preprocessor. It returns padLeftTagValue, the name reported
// for this preprocessor.
func (p *padLeftPreprocessor) Name() string {
	return padLeftTagValue
}

// padRightPreprocessor pads the value on the right up to a target width.
type padRightPreprocessor struct {
	// length is the target width in runes.
	length int
	// padChar is the rune repeated to fill the missing width.
	padChar rune
}

// newPadRightPreprocessor builds a padRightPreprocessor that pads to length
// runes using padChar.
func newPadRightPreprocessor(length int, padChar rune) *padRightPreprocessor {
	pp := new(padRightPreprocessor)
	pp.length = length
	pp.padChar = padChar
	return pp
}

// Process appends as many copies of padChar to value as it takes to reach
// length runes. Width is measured in runes, not bytes. A value that is
// already length runes or longer is returned unchanged (never truncated).
func (p *padRightPreprocessor) Process(value string) string {
	width := 0
	for range value {
		width++
	}

	missing := p.length - width
	if missing <= 0 {
		return value
	}
	return value + strings.Repeat(string(p.padChar), missing)
}

// Name implements Preprocessor. It returns padRightTagValue, the name reported
// for this preprocessor.
func (p *padRightPreprocessor) Name() string {
	return padRightTagValue
}

// =============================================================================
// Advanced Preprocessors
// =============================================================================

// normalizeUnicodePreprocessor normalizes Unicode to NFC form.
// It carries no configuration.
type normalizeUnicodePreprocessor struct{}

// newNormalizeUnicodePreprocessor creates a new Unicode normalization preprocessor.
func newNormalizeUnicodePreprocessor() *normalizeUnicodePreprocessor {
	return &normalizeUnicodePreprocessor{}
}

// Process returns value converted to Unicode Normalization Form C by
// norm.NFC.String (golang.org/x/text/unicode/norm).
func (p *normalizeUnicodePreprocessor) Process(value string) string {
	return norm.NFC.String(value)
}

// Name implements Preprocessor. It returns normalizeUnicodeTagValue, the name
// reported for this preprocessor.
func (p *normalizeUnicodePreprocessor) Name() string {
	return normalizeUnicodeTagValue
}

// nullifyPreprocessor maps one designated marker string to the empty string.
type nullifyPreprocessor struct {
	// nullValue is the marker that stands for "no value".
	nullValue string
}

// newNullifyPreprocessor builds a nullifyPreprocessor that blanks out
// nullValue.
func newNullifyPreprocessor(nullValue string) *nullifyPreprocessor {
	np := new(nullifyPreprocessor)
	np.nullValue = nullValue
	return np
}

// Process returns "" when value is exactly equal to the configured marker and
// returns value unchanged otherwise. The comparison is an exact,
// case-sensitive string match with no trimming.
func (p *nullifyPreprocessor) Process(value string) string {
	if value != p.nullValue {
		return value
	}
	return ""
}

// Name implements Preprocessor. It returns nullifyTagValue, the name reported
// for this preprocessor.
func (p *nullifyPreprocessor) Name() string {
	return nullifyTagValue
}

// coercePreprocessor performs light type coercion formatting.
type coercePreprocessor struct {
	// targetType selects the canonical form: "int", "float" or "bool".
	// Any other value makes Process a no-op.
	targetType string
}

// newCoercePreprocessor creates a new coerce preprocessor for targetType.
func newCoercePreprocessor(targetType string) *coercePreprocessor {
	return &coercePreprocessor{targetType: targetType}
}

// Process rewrites value into the canonical text form of the target type.
// Surrounding white space is ignored when parsing. Whenever value is blank or
// cannot be interpreted as the target type, the original value is returned
// exactly as given (untrimmed).
//
//   - "int":   decimal integers are re-emitted in canonical form ("+007" -> "7").
//     Other numeric text is parsed as a float and truncated toward
//     zero ("123.0" -> "123", "1e3" -> "1000").
//   - "float": emitted in plain decimal notation with the minimum digits needed.
//   - "bool":  true/1/yes/on -> "true", false/0/no/off -> "false",
//     case-insensitively.
func (p *coercePreprocessor) Process(value string) string {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return value
	}

	switch p.targetType {
	case "int":
		// Parse as an integer first: going through float64 silently corrupts
		// integers above 2^53 (e.g. 9007199254740993 -> 9007199254740992).
		if i, err := strconv.ParseInt(trimmed, 10, 64); err == nil {
			return strconv.FormatInt(i, 10)
		}

		// Fall back to float parsing to handle "123.0" -> "123".
		const (
			minInt64AsFloat = -(1 << 63) // -2^63, smallest int64
			endInt64AsFloat = 1 << 63    // 2^63, first value that does not fit
		)
		f, err := strconv.ParseFloat(trimmed, 64)
		// Converting NaN, ±Inf or an out-of-range float to int64 gives an
		// implementation-dependent result, so only convert values that fit.
		// NaN fails both comparisons and is left untouched.
		if err == nil && f >= minInt64AsFloat && f < endInt64AsFloat {
			return strconv.FormatInt(int64(f), 10)
		}
	case "float":
		if f, err := strconv.ParseFloat(trimmed, 64); err == nil {
			return strconv.FormatFloat(f, 'f', -1, 64)
		}
	case "bool":
		lower := strings.ToLower(trimmed)
		switch lower {
		case boolTrueValue, "1", "yes", "on":
			return boolTrueValue
		case boolFalseValue, "0", "no", "off":
			return boolFalseValue
		}
	}
	return value
}

// Name implements Preprocessor. It returns coerceTagValue, the name reported
// for this preprocessor.
func (p *coercePreprocessor) Name() string {
	return coerceTagValue
}

// fixSchemePreprocessor adds or corrects URL scheme.
type fixSchemePreprocessor struct {
	// scheme is the scheme name without "://", for example "https".
	scheme string
}

// newFixSchemePreprocessor creates a new fix scheme preprocessor that applies
// scheme.
func newFixSchemePreprocessor(scheme string) *fixSchemePreprocessor {
	return &fixSchemePreprocessor{scheme: scheme}
}

// Process adds scheme if missing, or replaces http with https if scheme is
// "https".
//
// Surrounding white space is trimmed from the result. A blank value is
// returned exactly as given. The outcome depends on how the trimmed value
// starts:
//
//   - "http://":  rewritten to "https://" when scheme is "https", otherwise kept.
//   - "https://": kept as-is (never downgraded).
//   - otherwise:  scheme + "://" is prepended.
//
// Existing schemes are recognized case-insensitively, so "HTTP://host" is
// treated like "http://host" rather than getting a second scheme prepended.
func (p *fixSchemePreprocessor) Process(value string) string {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return value
	}

	const (
		httpPrefix  = "http://"
		httpsPrefix = "https://"
	)

	// startsWithFold reports whether trimmed begins with prefix, ignoring
	// case. The prefixes are pure ASCII, so slicing by byte length is safe.
	startsWithFold := func(prefix string) bool {
		return len(trimmed) >= len(prefix) &&
			strings.EqualFold(trimmed[:len(prefix)], prefix)
	}

	switch {
	case startsWithFold(httpPrefix):
		if p.scheme == httpsSchemeType {
			return httpsSchemeType + "://" + trimmed[len(httpPrefix):]
		}
		return trimmed
	case startsWithFold(httpsPrefix):
		return trimmed
	}

	// Add scheme if missing.
	return p.scheme + "://" + trimmed
}

// Name implements Preprocessor. It returns fixSchemeTagValue, the name
// reported for this preprocessor.
func (p *fixSchemePreprocessor) Name() string {
	return fixSchemeTagValue
}

// regexReplacePreprocessor performs regex-based replacement.
type regexReplacePreprocessor struct {
	// re is the compiled pattern; nil makes Process a no-op.
	re *regexp.Regexp
	// replacement is the substitution template; "$1"-style references to
	// capture groups are expanded by ReplaceAllString.
	replacement string
}

// newRegexReplacePreprocessor creates a new regex replace preprocessor.
// Returns nil if the pattern is invalid.
//
// NOTE: the nil result is a typed nil pointer. Storing it in a Preprocessor
// interface value yields a non-nil interface, so check the pointer for nil
// before converting it.
func newRegexReplacePreprocessor(pattern, replacement string) *regexReplacePreprocessor {
	re, err := regexp.Compile(pattern)
	if err != nil {
		return nil
	}
	return &regexReplacePreprocessor{re: re, replacement: replacement}
}

// Process replaces every match of the pattern in value with the replacement
// and returns the result.
//
// It returns value unchanged when called on a nil receiver (what the
// constructor returns for an invalid pattern) or on a preprocessor with no
// compiled pattern, instead of panicking.
func (p *regexReplacePreprocessor) Process(value string) string {
	if p == nil || p.re == nil {
		return value
	}
	return p.re.ReplaceAllString(value, p.replacement)
}

// Name implements Preprocessor. It returns regexReplaceTagValue, the name
// reported for this preprocessor.
func (p *regexReplacePreprocessor) Name() string {
	return regexReplaceTagValue
}