diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4
index 6eeee8a209..7af80c6166 100644
--- a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4
+++ b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4
@@ -189,15 +189,23 @@ patternCharacter
     | BRACE_close
     | BRACKET_close
     | COLON
+    | DOUBLE_AMPERSAND // char class intersection not supported by default in JS, only supported if "v" flag is turned on.
     ;
 
 
 characterClass
-    //TODO check if lookahead needed, or implicit in rule order resoution
-    //[ [lookahead ∉ {^}] ClassRanges ]
-    : BRACKET_open CARET classRanges BRACKET_close
-    | BRACKET_open classRanges BRACKET_close
-    ;
+    : BRACKET_open CARET classContents BRACKET_close
+    | BRACKET_open classContents BRACKET_close
+    ;
+
+classContents
+    : classUnion (DOUBLE_AMPERSAND classUnion)*
+    ;
+
+classUnion
+    : characterClass+ // one or more nested classes = UNION
+    | classRanges // bare ranges
+    ;
 
 classRanges
     :
@@ -261,6 +269,10 @@ atomEscape
 //------ LEXER ------------------------------
 // Lexer rules have first letter in upper-case
 
+DOUBLE_AMPERSAND
+    : '&&'
+    ;
+
 DecimalDigit
     : [0-9]
     ;
diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt
index 8f3d784922..dfecde83db 100644
--- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt
+++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt
@@ -3,6 +3,7 @@ package org.evomaster.core.parser
 import org.evomaster.core.search.gene.Gene
 import org.evomaster.core.search.gene.regex.*
 import org.evomaster.core.utils.CharacterRange
+import org.evomaster.core.utils.MultiCharacterRange
 import org.evomaster.core.utils.ParsedFlagExpression
 import org.evomaster.core.utils.RegexFlags
 
@@ -398,11 +399,54 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
 
         val negated = ctx.CARET() != null
 
-        val ranges = ctx.classRanges().accept(this).data as List<CharacterRange>
+        // contents come back already reduced to one set of ranges (unions and intersections applied)
+        val innerMultiCharRanges = ctx.classContents().accept(this).data as MultiCharacterRange
 
-        val gene = CharacterRangeRxGene(negated, ranges, currentFlags)
+        val multiCharRanges = MultiCharacterRange(negated, innerMultiCharRanges)
 
-        return VisitResult(gene)
+        return if (ctx.parent is RegexJavaParser.AtomContext){
+            // top level character class, create gene
+            VisitResult(CharacterRangeRxGene(multiCharRanges, currentFlags))
+        } else {
+            // nested char class, set MultiCharacterRange as data
+            VisitResult(data = multiCharRanges)
+        }
+    }
+
+    /**
+     * Visits the contents of a character class: one or more unions separated by `&&`.
+     * The result data is the [MultiCharacterRange] obtained by intersecting all of them, left to right.
+     */
+    override fun visitClassContents(ctx: RegexJavaParser.ClassContentsContext): VisitResult {
+
+        // intersect the unions of ranges
+        val mcr = ctx.classUnion()
+            .map { it.accept(this).data as MultiCharacterRange }
+            .reduce { acc, item -> MultiCharacterRange.intersect(acc, item) }
+
+        return VisitResult(data = mcr)
+    }
+
+    /**
+     * Visits one operand of an intersection: either a sequence of nested character classes,
+     * which are merged into their union, or bare class ranges.
+     * The result data is a [MultiCharacterRange] in both cases.
+     */
+    override fun visitClassUnion(ctx: RegexJavaParser.ClassUnionContext): VisitResult {
+
+        return if (ctx.characterClass().isNotEmpty()) {
+            // union of char classes
+            val mcr = ctx.characterClass()
+                .map { it.accept(this).data as MultiCharacterRange }
+                .reduce { acc, item -> MultiCharacterRange.union(acc, item) }
+
+            VisitResult(data = mcr)
+        } else {
+            // single classRanges
+            val ranges = ctx.classRanges().accept(this).data as List<CharacterRange>
+
+            VisitResult(data = MultiCharacterRange(false, ranges))
+        }
     }
 
     override fun visitClassRanges(ctx: RegexJavaParser.ClassRangesContext): VisitResult {
diff --git a/core/src/main/kotlin/org/evomaster/core/search/gene/regex/CharacterRangeRxGene.kt b/core/src/main/kotlin/org/evomaster/core/search/gene/regex/CharacterRangeRxGene.kt
index a921d8497f..c1401621bb 100644
--- a/core/src/main/kotlin/org/evomaster/core/search/gene/regex/CharacterRangeRxGene.kt
+++ b/core/src/main/kotlin/org/evomaster/core/search/gene/regex/CharacterRangeRxGene.kt
@@ -15,7 +15,7 @@ import org.evomaster.core.utils.MultiCharacterRange
 import org.evomaster.core.utils.RegexFlags
 import org.slf4j.LoggerFactory
 
-class CharacterRangeRxGene private constructor(
+class CharacterRangeRxGene(
     /**
      * this represents the valid ranges for a character class, removing overlaps and applying negation
      */
diff --git a/core/src/main/kotlin/org/evomaster/core/utils/MultiCharacterRange.kt b/core/src/main/kotlin/org/evomaster/core/utils/MultiCharacterRange.kt
index 8ea9050bae..c13c99f08f 100644
--- a/core/src/main/kotlin/org/evomaster/core/utils/MultiCharacterRange.kt
+++ b/core/src/main/kotlin/org/evomaster/core/utils/MultiCharacterRange.kt
@@ -16,6 +16,24 @@ class MultiCharacterRange internal constructor(val ranges: List<CharacterRange>)
             return MultiCharacterRange(negated, characters.map { CharacterRange(it, it) })
         }
 
+        /**
+         * Wraps an already built [MultiCharacterRange], optionally negating it.
+         *
+         * When [negated] is false, [multiCharRange] is returned unchanged. Otherwise its ranges are
+         * handed to the list-based factory with negation requested.
+         *
+         * NOTE(review): the list-based factory rejects an empty list ("No defined ranges"), and
+         * [intersect] can yield no ranges, so negating an empty intersection (e.g. `[^a&&b]`)
+         * would throw here — confirm whether that is the intended behaviour.
+         */
+        operator fun invoke(negated: Boolean, multiCharRange: MultiCharacterRange): MultiCharacterRange {
+            return if (negated) {
+                MultiCharacterRange(true, multiCharRange.ranges)
+            } else {
+                multiCharRange
+            }
+        }
+
         operator fun invoke(negated: Boolean, ranges: List<CharacterRange>): MultiCharacterRange {
             if (ranges.isEmpty()) {
                 throw IllegalArgumentException("No defined ranges")
@@ -93,6 +111,86 @@ class MultiCharacterRange internal constructor(val ranges: List<CharacterRange>)
                 }
             }.toMutableList()
         }
+
+        /**
+         * Creates an intersection from two [MultiCharacterRange] instances.
+         * Used to allow character class intersections (e.g.: `[a-z0-9&&[0-9A-Z]]`).
+         *
+         * NOTE(review): the single forward sweep presumes that the ranges of both inputs are
+         * sorted and free of overlaps — confirm every construction path guarantees it.
+         *
+         * @return the ranges shared by [a] and [b]; holds no ranges at all when they are disjoint
+         */
+        fun intersect(a: MultiCharacterRange, b: MultiCharacterRange): MultiCharacterRange {
+            val result = mutableListOf<CharacterRange>()
+
+            var idxA = 0
+            var idxB = 0
+
+            while (idxA < a.size && idxB < b.size) {
+                val rangeA = a[idxA]
+                val rangeB = b[idxB]
+
+                val start = maxOf(rangeA.start, rangeB.start)
+                val end = minOf(rangeA.end, rangeB.end)
+
+                if (start <= end) {
+                    result.add(CharacterRange(start, end))
+                }
+
+                // advance past whichever of the two current ranges ends first
+                if (rangeA.end < rangeB.end) {
+                    idxA++
+                } else {
+                    idxB++
+                }
+            }
+
+            return MultiCharacterRange(result)
+        }
+
+        /**
+         * Creates a union from two [MultiCharacterRange] instances, merging overlapping
+         * and adjacent ranges into a single normalized [MultiCharacterRange].
+         * Used to allow character class unions (e.g.: `[[a-c][x-z]]`).
+         *
+         * NOTE(review): like [intersect], this presumes both inputs hold their ranges sorted by start.
+         */
+        fun union(a: MultiCharacterRange, b: MultiCharacterRange): MultiCharacterRange {
+            val result = mutableListOf<CharacterRange>()
+
+            // adds [next], or widens the last collected range when [next] overlaps or touches it
+            fun append(next: CharacterRange) {
+                val last = result.lastOrNull()
+                if (last != null && next.start.code <= last.end.code + 1) {
+                    result[result.lastIndex] = CharacterRange(last.start, maxOf(last.end, next.end))
+                } else {
+                    result.add(next)
+                }
+            }
+
+            var idxA = 0
+            var idxB = 0
+
+            // take, at each step, the range with the smaller start
+            while (idxA < a.size && idxB < b.size) {
+                if (a[idxA].start <= b[idxB].start) {
+                    append(a[idxA++])
+                } else {
+                    append(b[idxB++])
+                }
+            }
+
+            // drain whichever input still has ranges left
+            while (idxA < a.size) {
+                append(a[idxA++])
+            }
+            while (idxB < b.size) {
+                append(b[idxB++])
+            }
+
+            return MultiCharacterRange(result)
+        }
     }
 
     /**
diff --git a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt
index 73edc55d60..489657b294 100644
--- a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt
+++ b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt
@@ -251,4 +251,27 @@ class GeneRegexJavaVisitorTest : GeneRegexEcma262VisitorTest() {
     override fun testJSExclusiveEscapes() {
         // JS exclusive
     }
+
+    @Test
+    fun testCharClassIntersectionSubtractionAndNesting(){
+        checkSameAsJava("[abc-e[f-h]ij-l[m]n]")
+        checkSameAsJava("[a&&a][a&&a&&a]")
+        checkSameAsJava("[a-z&&[aeiou]]")
+        checkSameAsJava("[a-z&&[^aeiou]]")
+        checkSameAsJava("[a-z&&[a-p]&&[f-z]]")
+        checkSameAsJava("[ac-e&&[a-d]]")
+        checkSameAsJava("[\\w&&[a-z]]")
+        checkSameAsJava("[a-z&&[b-y]]")
+        checkSameAsJava("[a-z0-9&&[A-Z0-9]&&[2B4C]]")
+        checkSameAsJava("[[a-c][x-z]&&[b-y]]")
+        checkSameAsJava("[a-c&&[b-d]e-g]")
+        checkSameAsJava("[^a-z&&[^aeiou]]")
+        checkSameAsJava("[\\s&&[^\\n]]")
+        checkSameAsJava("[a-c&&[c-e]]")
+        checkSameAsJava("[a-z&&[a-z]]")
+        checkSameAsJava("[a-ce-g&&[b-f]]")
+        checkSameAsJava("[[a-z&&[a-p]]&&[f-z]]")
+        checkSameAsJava("[a[b[c[d&&[\\w]]]][0-7&&\\d&&[0-5]&&1-5]]")
+        checkSameAsJava("&&")
+    }
 }
\ No newline at end of file