From 3543680e8f96b5d071f62aa74c23f420ac2c7a6e Mon Sep 17 00:00:00 2001 From: lmasroca Date: Mon, 1 Jun 2026 17:25:36 -0300 Subject: [PATCH 1/5] Java regex added char class union (nesting), intersection and subtraction. --- .../org/evomaster/core/parser/RegexJava.g4 | 17 ++-- .../core/parser/GeneRegexJavaVisitor.kt | 40 ++++++++- .../search/gene/regex/CharacterRangeRxGene.kt | 2 +- .../core/utils/MultiCharacterRange.kt | 89 +++++++++++++++++++ 4 files changed, 139 insertions(+), 9 deletions(-) diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 index 6eeee8a209..ccc6a08147 100644 --- a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 +++ b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 @@ -193,11 +193,18 @@ patternCharacter characterClass - //TODO check if lookahead needed, or implicit in rule order resoution - //[ [lookahead ∉ {^}] ClassRanges ] - : BRACKET_open CARET classRanges BRACKET_close - | BRACKET_open classRanges BRACKET_close - ; + : BRACKET_open CARET classContents BRACKET_close + | BRACKET_open classContents BRACKET_close + ; + +classContents + : classUnion ('&&' classUnion)* + ; + +classUnion + : characterClass+ // one or more nested classes = UNION + | classRanges // bare ranges + ; classRanges : diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt index 8f3d784922..dfecde83db 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt @@ -3,6 +3,7 @@ package org.evomaster.core.parser import org.evomaster.core.search.gene.Gene import org.evomaster.core.search.gene.regex.* import org.evomaster.core.utils.CharacterRange +import org.evomaster.core.utils.MultiCharacterRange import org.evomaster.core.utils.ParsedFlagExpression import org.evomaster.core.utils.RegexFlags @@ -398,11 +399,44 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor(){ val negated = ctx.CARET() != null - val ranges = ctx.classRanges().accept(this).data as List + val innerMultiCharRanges = ctx.classContents().accept(this).data as MultiCharacterRange - val gene = CharacterRangeRxGene(negated, ranges, currentFlags) + val multiCharRanges = MultiCharacterRange(negated, innerMultiCharRanges) - return VisitResult(gene) + return if (ctx.parent is RegexJavaParser.AtomContext){ + // top level character class, create gene + VisitResult(CharacterRangeRxGene(multiCharRanges, currentFlags)) + } else { + // nested char class, set MultiCharacterRange as data + VisitResult(data = multiCharRanges) + } + } + + override fun visitClassContents(ctx: RegexJavaParser.ClassContentsContext): VisitResult { + + // intersect the unions of ranges + val mcr = ctx.classUnion() + .map { it.accept(this).data as MultiCharacterRange } + .reduce { acc, item -> MultiCharacterRange.intersect(acc, item) } + + return VisitResult(data=mcr) + } + + override fun visitClassUnion(ctx: RegexJavaParser.ClassUnionContext): VisitResult { + + return if (ctx.characterClass().isNotEmpty()) { + // union of char classes + val mcr = ctx.characterClass() + .map { it.accept(this).data as MultiCharacterRange } + .reduce { acc, item -> MultiCharacterRange.union(acc, item) } + + VisitResult(data=mcr) + } else { + // single classRanges + val ranges = ctx.classRanges().accept(this).data as List + + VisitResult(data=MultiCharacterRange(false, ranges)) + } } override fun visitClassRanges(ctx: RegexJavaParser.ClassRangesContext): VisitResult { diff --git a/core/src/main/kotlin/org/evomaster/core/search/gene/regex/CharacterRangeRxGene.kt b/core/src/main/kotlin/org/evomaster/core/search/gene/regex/CharacterRangeRxGene.kt index a921d8497f..c1401621bb 100644 --- a/core/src/main/kotlin/org/evomaster/core/search/gene/regex/CharacterRangeRxGene.kt +++ b/core/src/main/kotlin/org/evomaster/core/search/gene/regex/CharacterRangeRxGene.kt @@ -15,7 +15,7 @@ import org.evomaster.core.utils.MultiCharacterRange import org.evomaster.core.utils.RegexFlags import org.slf4j.LoggerFactory -class CharacterRangeRxGene private constructor( +class CharacterRangeRxGene( /** * this represents the valid ranges for a character class, removing overlaps and applying negation */ diff --git a/core/src/main/kotlin/org/evomaster/core/utils/MultiCharacterRange.kt b/core/src/main/kotlin/org/evomaster/core/utils/MultiCharacterRange.kt index 8ea9050bae..c13c99f08f 100644 --- a/core/src/main/kotlin/org/evomaster/core/utils/MultiCharacterRange.kt +++ b/core/src/main/kotlin/org/evomaster/core/utils/MultiCharacterRange.kt @@ -16,6 +16,14 @@ class MultiCharacterRange internal constructor(val ranges: List) return MultiCharacterRange(negated, characters.map { CharacterRange(it, it) }) } + operator fun invoke(negated: Boolean, multiCharRange: MultiCharacterRange): MultiCharacterRange { + return if (negated) { + MultiCharacterRange(true, multiCharRange.ranges) + } else { + multiCharRange + } + } + operator fun invoke(negated: Boolean, ranges: List): MultiCharacterRange { if (ranges.isEmpty()) { throw IllegalArgumentException("No defined ranges") @@ -93,6 +101,87 @@ class MultiCharacterRange internal constructor(val ranges: List) } }.toMutableList() } + + /** + * Create an intersection from two [org.evomaster.core.utils.MultiCharacterRange] instances + * Used to allow character class intersections (e.g.: `[a-z0-9&&[0-9A-Z]]`). + */ + fun intersect(a: MultiCharacterRange, b: MultiCharacterRange): MultiCharacterRange { + val result = mutableListOf() + + var idxA = 0 + var idxB = 0 + + val lenA = a.size + val lenB = b.size + + while (idxA < lenA && idxB < lenB) { + val start = maxOf(a[idxA].start, b[idxB].start) + val end = minOf(a[idxA].end, b[idxB].end) + + if (start <= end) { + result.add(CharacterRange(start, end)) + } + + if ( a[idxA].end < b[idxB].end ) { + idxA++ + } else { + idxB++ + } + } + + return MultiCharacterRange(result) + } + + /** + * Creates a union from two [MultiCharacterRange] instances, merging overlapping + * and adjacent ranges into a single normalized [MultiCharacterRange]. + * Used to allow character class unions (e.g.: `[[a-c][x-z]]`). + */ + fun union(a: MultiCharacterRange, b: MultiCharacterRange): MultiCharacterRange { + val result = mutableListOf() + var idxA = 0 + var idxB = 0 + + while (idxA < a.size && idxB < b.size) { + // pick the range with the smaller start + val (start, end) = if (a[idxA].start <= b[idxB].start) { + a[idxA].start to a[idxA].end.also { idxA++ } + } else { + b[idxB].start to b[idxB].end.also { idxB++ } + } + + // merge with last range in result if overlapping or adjacent + if (result.isNotEmpty() && start.code <= result.last().end.code + 1) { + val last = result.removeLast() + result.add(CharacterRange(last.start, maxOf(last.end, end))) + } else { + result.add(CharacterRange(start, end)) + } + } + + // append remaining ranges from whichever list isn't exhausted + while (idxA < a.size) { + val curr = a[idxA++] + if (result.isNotEmpty() && curr.start.code <= result.last().end.code + 1) { + val last = result.removeLast() + result.add(CharacterRange(last.start, maxOf(last.end, curr.end))) + } else { + result.add(curr) + } + } + while (idxB < b.size) { + val curr = b[idxB++] + if (result.isNotEmpty() && curr.start.code <= result.last().end.code + 1) { + val last = result.removeLast() + result.add(CharacterRange(last.start, maxOf(last.end, curr.end))) + } else { + result.add(curr) + } + } + + return MultiCharacterRange(result) + } } /** From a179a170ef6bc14afec16a4d3defb6f16bd95198 Mon Sep 17 00:00:00 2001 From: lmasroca Date: Mon, 1 Jun 2026 17:26:30 -0300 Subject: [PATCH 2/5] Added some tests. --- .../core/parser/GeneRegexJavaVisitorTest.kt | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt index 73edc55d60..d1dc1cc4aa 100644 --- a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt +++ b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt @@ -251,4 +251,26 @@ class GeneRegexJavaVisitorTest : GeneRegexEcma262VisitorTest() { override fun testJSExclusiveEscapes() { // JS exclusive } + + @Test + fun testCharClassIntersectionSubtractionAndNesting(){ + checkSameAsJava("[abc-e[f-h]ij-l[m]n]") + checkSameAsJava("[a&&a][a&&a&&a]") + checkSameAsJava("[a-z&&[aeiou]]") + checkSameAsJava("[a-z&&[^aeiou]]") + checkSameAsJava("[a-z&&[a-p]&&[f-z]]") + checkSameAsJava("[ac-e&&[a-d]]") + checkSameAsJava("[\\w&&[a-z]]") + checkSameAsJava("[a-z&&[b-y]]") + checkSameAsJava("[a-z0-9&&[A-Z0-9]&&[2B4C]]") + checkSameAsJava("[[a-c][x-z]&&[b-y]]") + checkSameAsJava("[a-c&&[b-d]e-g]") + checkSameAsJava("[^a-z&&[^aeiou]]") + checkSameAsJava("[\\s&&[^\\n]]") + checkSameAsJava("[a-c&&[c-e]]") + checkSameAsJava("[a-z&&[a-z]]") + checkSameAsJava("[a-ce-g&&[b-f]]") + checkSameAsJava("[[a-z&&[a-p]]&&[f-z]]") + checkSameAsJava("[a[b[c[d&&[\\w]]]][0-7&&\\d&&[0-5]&&1-5]]") + } } \ No newline at end of file From 7d63c811c73250e51d406d4ca7c49487d5e0ee80 Mon Sep 17 00:00:00 2001 From: lmasroca Date: Mon, 1 Jun 2026 17:28:21 -0300 Subject: [PATCH 3/5] Small fix. --- core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 | 1 + .../kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt | 1 + 2 files changed, 2 insertions(+) diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 index ccc6a08147..641078d923 100644 --- a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 +++ b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 @@ -189,6 +189,7 @@ patternCharacter | BRACE_close | BRACKET_close | COLON + | '&&' ; diff --git a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt index d1dc1cc4aa..489657b294 100644 --- a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt +++ b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt @@ -272,5 +272,6 @@ class GeneRegexJavaVisitorTest : GeneRegexEcma262VisitorTest() { checkSameAsJava("[a-ce-g&&[b-f]]") checkSameAsJava("[[a-z&&[a-p]]&&[f-z]]") checkSameAsJava("[a[b[c[d&&[\\w]]]][0-7&&\\d&&[0-5]&&1-5]]") + checkSameAsJava("&&") } } \ No newline at end of file From 73d896e9fa558b373629a7b1745c624542ca75db Mon Sep 17 00:00:00 2001 From: lmasroca Date: Wed, 3 Jun 2026 14:03:29 -0300 Subject: [PATCH 4/5] Java regex made "&&" an explicit lexer token. --- .../main/antlr4/org/evomaster/core/parser/RegexJava.g4 | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 index 641078d923..7abdcbdeae 100644 --- a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 +++ b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 @@ -189,7 +189,7 @@ patternCharacter | BRACE_close | BRACKET_close | COLON - | '&&' + | DOUBLE_AMPERSAND ; @@ -199,7 +199,7 @@ characterClass ; classContents - : classUnion ('&&' classUnion)* + : classUnion (DOUBLE_AMPERSAND classUnion)* ; classUnion @@ -269,6 +269,10 @@ atomEscape //------ LEXER ------------------------------ // Lexer rules have first letter in upper-case +DOUBLE_AMPERSAND + : '&&' + ; + DecimalDigit : [0-9] ; From ab11660245aa5efead31e5916b4f7b5de305b29c Mon Sep 17 00:00:00 2001 From: lmasroca Date: Thu, 4 Jun 2026 13:19:40 -0300 Subject: [PATCH 5/5] Added a comment. --- core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 index 7abdcbdeae..7af80c6166 100644 --- a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 +++ b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 @@ -189,7 +189,7 @@ patternCharacter | BRACE_close | BRACKET_close | COLON - | DOUBLE_AMPERSAND + | DOUBLE_AMPERSAND // char class intersection not supported by default in JS, only supported if "v" flag is turned on. ;