Skip to content

Commit e2f28a6

Browse files
authored
Merge pull request #3989 from ruby/pm_regexp_classify_property-perf
Improve pm_regexp_classify_property perf
2 parents fb8f6fa + 3bdd792 commit e2f28a6

1 file changed

Lines changed: 54 additions & 30 deletions

File tree

src/regexp.c

Lines changed: 54 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -430,15 +430,6 @@ typedef enum {
430430
PM_REGEXP_PROPERTY_UNICODE
431431
} pm_regexp_property_type_t;
432432

433-
/**
434-
* Check if a property name matches a NUL-terminated target string
435-
* (case-insensitive, exact length match).
436-
*/
437-
static inline bool
438-
pm_regexp_property_name_matches(const uint8_t *name, size_t length, const char *target) {
439-
return strlen(target) == length && pm_strncasecmp(name, (const uint8_t *) target, length) == 0;
440-
}
441-
442433
/**
443434
* Classify a property name. The name may start with '^' for negation, which
444435
* is skipped before matching.
@@ -451,30 +442,63 @@ pm_regexp_classify_property(const uint8_t *name, size_t length) {
451442
length--;
452443
}
453444

454-
// POSIX properties — valid in all encodings.
455-
static const char *const posix_properties[] = {
456-
"Alnum", "Alpha", "ASCII", "Blank", "Cntrl", "Digit", "Graph",
457-
"Lower", "Print", "Punct", "Space", "Upper", "XDigit", "Word",
458-
NULL
459-
};
445+
#define PM_REGEXP_CASECMP(str_) (pm_strncasecmp(name, (const uint8_t *) (str_), length) == 0)
460446

461-
for (const char *const *property = posix_properties; *property != NULL; property++) {
462-
if (pm_regexp_property_name_matches(name, length, *property)) {
463-
return PM_REGEXP_PROPERTY_POSIX;
464-
}
447+
switch (length) {
448+
case 3:
449+
if (PM_REGEXP_CASECMP("Han")) return PM_REGEXP_PROPERTY_SCRIPT;
450+
break;
451+
case 4:
452+
if (PM_REGEXP_CASECMP("Word")) return PM_REGEXP_PROPERTY_POSIX;
453+
break;
454+
case 5:
455+
/* Most properties are length 5, so dispatch on first character. */
456+
switch (name[0] | 0x20) {
457+
case 'a':
458+
if (PM_REGEXP_CASECMP("Alnum")) return PM_REGEXP_PROPERTY_POSIX;
459+
if (PM_REGEXP_CASECMP("Alpha")) return PM_REGEXP_PROPERTY_POSIX;
460+
if (PM_REGEXP_CASECMP("ASCII")) return PM_REGEXP_PROPERTY_POSIX;
461+
break;
462+
case 'b':
463+
if (PM_REGEXP_CASECMP("Blank")) return PM_REGEXP_PROPERTY_POSIX;
464+
break;
465+
case 'c':
466+
if (PM_REGEXP_CASECMP("Cntrl")) return PM_REGEXP_PROPERTY_POSIX;
467+
break;
468+
case 'd':
469+
if (PM_REGEXP_CASECMP("Digit")) return PM_REGEXP_PROPERTY_POSIX;
470+
break;
471+
case 'g':
472+
if (PM_REGEXP_CASECMP("Graph")) return PM_REGEXP_PROPERTY_POSIX;
473+
if (PM_REGEXP_CASECMP("Greek")) return PM_REGEXP_PROPERTY_SCRIPT;
474+
break;
475+
case 'l':
476+
if (PM_REGEXP_CASECMP("Lower")) return PM_REGEXP_PROPERTY_POSIX;
477+
if (PM_REGEXP_CASECMP("Latin")) return PM_REGEXP_PROPERTY_SCRIPT;
478+
break;
479+
case 'p':
480+
if (PM_REGEXP_CASECMP("Print")) return PM_REGEXP_PROPERTY_POSIX;
481+
if (PM_REGEXP_CASECMP("Punct")) return PM_REGEXP_PROPERTY_POSIX;
482+
break;
483+
case 's':
484+
if (PM_REGEXP_CASECMP("Space")) return PM_REGEXP_PROPERTY_POSIX;
485+
break;
486+
case 'u':
487+
if (PM_REGEXP_CASECMP("Upper")) return PM_REGEXP_PROPERTY_POSIX;
488+
break;
489+
}
490+
break;
491+
case 6:
492+
if (PM_REGEXP_CASECMP("XDigit")) return PM_REGEXP_PROPERTY_POSIX;
493+
break;
494+
case 8:
495+
if (PM_REGEXP_CASECMP("Hiragana")) return PM_REGEXP_PROPERTY_SCRIPT;
496+
if (PM_REGEXP_CASECMP("Katakana")) return PM_REGEXP_PROPERTY_SCRIPT;
497+
if (PM_REGEXP_CASECMP("Cyrillic")) return PM_REGEXP_PROPERTY_SCRIPT;
498+
break;
465499
}
466500

467-
// Script properties — valid in /e, /s, /u but not /n.
468-
static const char *const script_properties[] = {
469-
"Hiragana", "Katakana", "Han", "Latin", "Greek", "Cyrillic",
470-
NULL
471-
};
472-
473-
for (const char *const *property = script_properties; *property != NULL; property++) {
474-
if (pm_regexp_property_name_matches(name, length, *property)) {
475-
return PM_REGEXP_PROPERTY_SCRIPT;
476-
}
477-
}
501+
#undef PM_REGEXP_CASECMP
478502

479503
// Everything else is Unicode-only (general categories, other scripts, etc.).
480504
return PM_REGEXP_PROPERTY_UNICODE;

0 commit comments

Comments
 (0)