diff options
author | 2022-10-24 19:02:56 -0700 | |
---|---|---|
committer | 2022-10-24 19:02:56 -0700 | |
commit | 1b50ecc52b55df0c00f991c8206d4ced84ad89b8 (patch) | |
tree | 1cfaa1e41110a47b26059b9a68e3f391b7ff5ef3 /src/bun.js | |
parent | f8ea534f3b135d6dfd20ea6b09f163198503bae0 (diff) | |
download | bun-1b50ecc52b55df0c00f991c8206d4ced84ad89b8.tar.gz bun-1b50ecc52b55df0c00f991c8206d4ced84ad89b8.tar.zst bun-1b50ecc52b55df0c00f991c8206d4ced84ad89b8.zip |
oniguruma regex lookbehind and multibyte hex fix (#1363)
* handle multibyte hex characters
* non extended strings used for toString() and source
* add hasIndices flags
* more tests for lookbehinds, unicode, and hex characters
* handled case when hex doesnt have enough digit, more tests
* fix adding characters out of bounds
* backslash in character class
* compile() returns object
* escape special characters in oniguruma character class
Diffstat (limited to 'src/bun.js')
-rw-r--r-- | src/bun.js/bindings/OnigurumaRegExp.cpp | 130 |
1 files changed, 115 insertions, 15 deletions
diff --git a/src/bun.js/bindings/OnigurumaRegExp.cpp b/src/bun.js/bindings/OnigurumaRegExp.cpp index b49717b05..72eaa2b31 100644 --- a/src/bun.js/bindings/OnigurumaRegExp.cpp +++ b/src/bun.js/bindings/OnigurumaRegExp.cpp @@ -32,6 +32,14 @@ static WTF::String to16Bit(JSC::JSString* str, JSC::JSGlobalObject *globalObjec return WTF::String::make16BitFrom8BitSource(value.characters8(), value.length()); } +static WTF::String to16Bit(WTF::String str) { + if (str.is8Bit()) { + return WTF::String::make16BitFrom8BitSource(str.characters8(), str.length()); + } + + return str; +} + static WTF::String to16Bit(JSValue jsValue, JSC::JSGlobalObject *globalObject, ASCIILiteral defaultValue) { if (!jsValue || jsValue.isUndefinedOrNull()) { @@ -46,6 +54,67 @@ static WTF::String to16Bit(JSValue jsValue, JSC::JSGlobalObject *globalObject, return to16Bit(jsString, globalObject); } +static WTF::String extendMultibyteHexCharacters(const WTF::String &string) { + WTF::StringBuilder sb; + uint32_t length = string.length(); + const UChar *characters = string.characters16(); + bool inCharacterClass = false; + + for (int i = 0; i < length; i++) { + while (characters[i] == '\\') { + if (i + 1 < length && characters[i + 1] == 'x') { + if (i + 2 < length && isxdigit(characters[i+ 2])) { + if (i + 3 < length && isxdigit(characters[i+ 3])) { + sb.append(string.substring(i, 4)); + sb.append("\\x00"_s); + i += 4; + } else { + // skip '\' + sb.append(string.substring(i + 1, 2)); + i += 3; + } + } else { + break; + } + } else { + break; + } + } + + if (i >= length) { + break; + } + + if (inCharacterClass) { + // we know ']' will be escaped so there isn't a need to scan for the closing bracket + if (characters[i] == '[' || characters[i] == ']' || characters[i] == '^' || characters[i] == '-' || characters[i] == ')' || characters[i] == '(') { + if (characters[i- 1] != '\\') { + // character class intersections not supported, assume end of character class + if (characters[i] == ']') { + inCharacterClass = false; + } else { + sb.append('\\'); + } + } + } + } else { + if (characters[i] == '[') { + if (i - 1 >= 0) { + if (characters[i- 1] != '\\') { + inCharacterClass = true; + } + } else { + inCharacterClass = true; + } + } + } + + sb.append(characters[i]); + } + + return to16Bit(sb.toString()); +} + static inline bool is16BitLineTerminator(UChar c) { return c == '\r' || c == '\n' || (c & ~1) == 0x2028; @@ -406,17 +475,21 @@ JSC_DEFINE_HOST_FUNCTION(onigurumaRegExpProtoFuncCompile, (JSGlobalObject *globa JSValue arg0 = callFrame->argument(0); JSValue arg1 = callFrame->argument(1); + WTF::String patternStringExtended; if (auto* regExpObject = jsDynamicCast<OnigurumaRegEx*>(arg0)) { if (!arg1.isUndefined()) { throwScope.throwException(globalObject, createTypeError(globalObject, makeString("Cannot supply flags when constructing one RegExp from another."_s))); return JSValue::encode({}); } thisRegExp->setPatternString(regExpObject->patternString()); + patternStringExtended = extendMultibyteHexCharacters(thisRegExp->patternString()); thisRegExp->setFlagsString(regExpObject->flagsString()); } else { WTF::String newPatternString = to16Bit(arg0, globalObject, "(?:)"_s); RETURN_IF_EXCEPTION(scope, {}); + patternStringExtended = extendMultibyteHexCharacters(newPatternString); + WTF::String newFlagsString = to16Bit(arg1, globalObject, ""_s); RETURN_IF_EXCEPTION(scope, {}); @@ -431,8 +504,10 @@ JSC_DEFINE_HOST_FUNCTION(onigurumaRegExpProtoFuncCompile, (JSGlobalObject *globa thisRegExp->setFlagsString(newFlagsString); } - OnigEncoding encoding = ONIG_ENCODING_UTF16_LE; - onig_initialize(&encoding, 1); + OnigEncoding encodings[] = { + ONIG_ENCODING_UTF16_LE, + }; + onig_initialize(encodings, 1); OnigOptionType options = 0; if (thisRegExp->flagsString().contains('i')) { @@ -447,15 +522,26 @@ JSC_DEFINE_HOST_FUNCTION(onigurumaRegExpProtoFuncCompile, (JSGlobalObject *globa options |= ONIG_OPTION_MULTILINE; } + OnigSyntaxType* syntax = ONIG_SYNTAX_DEFAULT; + onig_set_syntax_op(syntax, onig_get_syntax_op(syntax) | ONIG_SYN_OP_ESC_X_HEX2); + onig_set_syntax_op(syntax, onig_get_syntax_op(syntax) | ONIG_SYN_OP_ESC_X_BRACE_HEX8); + onig_set_syntax_op2(syntax, onig_get_syntax_op2(syntax) | ONIG_SYN_OP2_ESC_U_HEX4); + onig_set_syntax_behavior(syntax, onig_get_syntax_behavior(syntax) | ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC); + onig_set_syntax_behavior(syntax, onig_get_syntax_behavior(syntax) | ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC); + onig_set_syntax_behavior(syntax, onig_get_syntax_behavior(syntax) & ~ONIG_SYN_BACKSLASH_ESCAPE_IN_CC); + + OnigEncodingType* encoding = ONIG_ENCODING_UTF16_LE; OnigErrorInfo errorInfo = { 0 }; regex_t* onigRegExp = NULL; - int errorCode = onig_new( + int errorCode = 0; + + errorCode = onig_new( &onigRegExp, - reinterpret_cast<const OnigUChar*>(thisRegExp->patternString().characters16()), - reinterpret_cast<const OnigUChar*>(thisRegExp->patternString().characters16() + thisRegExp->patternString().length()), + reinterpret_cast<const OnigUChar*>(patternStringExtended.characters16()), + reinterpret_cast<const OnigUChar*>(patternStringExtended.characters16() + patternStringExtended.length()), options, - ONIG_ENCODING_UTF16_LE, - ONIG_SYNTAX_DEFAULT, + encoding, + syntax, &errorInfo ); @@ -480,7 +566,7 @@ JSC_DEFINE_HOST_FUNCTION(onigurumaRegExpProtoFuncCompile, (JSGlobalObject *globa thisRegExp->m_onigurumaRegExp = onigRegExp; thisRegExp->m_lastIndex = 0; - return JSValue::encode(jsUndefined()); + return JSValue::encode(thisRegExp); } JSC_DEFINE_HOST_FUNCTION(onigurumaRegExpProtoFuncTest, (JSGlobalObject *globalObject, JSC::CallFrame *callFrame)) @@ -717,6 +803,8 @@ static JSC::EncodedJSValue constructOrCall(Zig::GlobalObject *globalObject, JSVa WTF::String patternString = to16Bit(arg0, globalObject, "(?:)"_s); RETURN_IF_EXCEPTION(scope, {}); + WTF::String patternStringExtended = extendMultibyteHexCharacters(patternString); + WTF::String flagsString = to16Bit(arg1, globalObject, ""_s); RETURN_IF_EXCEPTION(scope, {}); @@ -727,8 +815,10 @@ static JSC::EncodedJSValue constructOrCall(Zig::GlobalObject *globalObject, JSVa flagsString = sortRegExpFlags(flagsString); - OnigEncoding encoding = ONIG_ENCODING_UTF16_LE; - onig_initialize(&encoding, 1); + OnigEncoding encodings[] = { + ONIG_ENCODING_UTF16_LE, + }; + onig_initialize(encodings, 1); OnigOptionType options = 0; if (flagsString.contains('i')) { @@ -743,15 +833,25 @@ static JSC::EncodedJSValue constructOrCall(Zig::GlobalObject *globalObject, JSVa options |= ONIG_OPTION_MULTILINE; } + OnigSyntaxType* syntax = ONIG_SYNTAX_ONIGURUMA; + onig_set_syntax_op(syntax, onig_get_syntax_op(syntax) | ONIG_SYN_OP_ESC_X_HEX2); + onig_set_syntax_op(syntax, onig_get_syntax_op(syntax) | ONIG_SYN_OP_ESC_X_BRACE_HEX8); + onig_set_syntax_op2(syntax, onig_get_syntax_op2(syntax) | ONIG_SYN_OP2_ESC_U_HEX4); + onig_set_syntax_behavior(syntax, onig_get_syntax_behavior(syntax) | ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC); + onig_set_syntax_behavior(syntax, onig_get_syntax_behavior(syntax) | ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC); + + OnigEncodingType* encoding = encodings[0]; OnigErrorInfo errorInfo = { 0 }; regex_t* onigRegExp = NULL; - int errorCode = onig_new( + int errorCode = 0; + + errorCode = onig_new( &onigRegExp, - reinterpret_cast<const OnigUChar*>(patternString.characters16()), - reinterpret_cast<const OnigUChar*>(patternString.characters16() + patternString.length()), + reinterpret_cast<const OnigUChar*>(patternStringExtended.characters16()), + reinterpret_cast<const OnigUChar*>(patternStringExtended.characters16() + patternStringExtended.length()), options, - ONIG_ENCODING_UTF16_LE, - ONIG_SYNTAX_DEFAULT, + encoding, + syntax, &errorInfo ); |