| | |
// Every Unicode code point, U+0000 through U+10FFFF. Negated character classes
// and negated property escapes are computed by cloning this set and removing
// the matched code points from the clone.
const UNICODE_SET = regenerate().addRange(0x000000, 0x10FFFF);
// Without the `u` flag, the range stops at 0xFFFF.
// https://mths.be/es6#sec-pattern-semantics
const BMP_SET = regenerate().addRange(0x0000, 0xFFFF);
| | | |
| | | // Prepare a Regenerate set containing all code points that are supposed to be |
| | | // matched by `/./u`. https://mths.be/es6#sec-atom |
| | |
| | | const category = unicodeMatchPropertyValue(property, value); |
| | | return getUnicodePropertyValueSet(property, category); |
| | | } catch (exception) {} |
| | | // It’s not a `General_Category` value, so check if it’s a binary |
| | | // property. Note: `unicodeMatchProperty` throws on invalid properties. |
| | | // It’s not a `General_Category` value, so check if it’s a property |
| | | // of strings. |
| | | try { |
| | | return getUnicodePropertyValueSet('Property_of_Strings', value); |
| | | } catch (exception) {} |
| | | // Lastly, check if it’s a binary property of single code points. |
| | | // Note: `unicodeMatchProperty` throws on invalid properties. |
| | | const property = unicodeMatchProperty(value); |
| | | return getUnicodePropertyValueSet(property); |
| | | }; |
| | |
| | | set = getUnicodePropertyValueSet(property, value); |
| | | } |
| | | if (isNegative) { |
| | | return UNICODE_SET.clone().remove(set); |
| | | if (set.strings) { |
| | | throw new Error('Cannot negate Unicode property of strings'); |
| | | } |
| | | return { |
| | | characters: UNICODE_SET.clone().remove(set.characters), |
| | | strings: new Set() |
| | | }; |
| | | } |
| | | return set.clone(); |
| | | return { |
| | | characters: set.characters.clone(), |
| | | strings: new Set(set.strings || []) |
| | | }; |
| | | }; |
| | | |
/**
 * Wraps the set for a `\p{…}` / `\P{…}` escape in a character-class data
 * record (the shape produced by `getCharacterClassEmptyData`).
 *
 * @param {string} property - The property expression from the escape.
 * @param {boolean} isNegative - Whether the escape is the negated `\P{…}` form.
 * @returns {object} Data record whose `singleChars` is the property's code
 *   point set; `longStrings` / `maybeIncludesStrings` are only filled in when
 *   the property also matches strings.
 */
const getUnicodePropertyEscapeCharacterClassData = (property, isNegative) => {
  const { characters, strings } = getUnicodePropertyEscapeSet(property, isNegative);
  const record = getCharacterClassEmptyData();
  const overrides = { singleChars: characters };
  if (strings.size > 0) {
    // Only properties of strings contribute string alternatives.
    overrides.longStrings = strings;
    overrides.maybeIncludesStrings = true;
  }
  return Object.assign(record, overrides);
};
| | | |
| | | // Given a range of code points, add any case-folded code points in that range |
| | |
| | | const folded = caseFold(min); |
| | | if (folded) { |
| | | $this.add(folded); |
| | | } |
| | | } while (++min <= max); |
| | | return $this; |
| | | }; |
| | | regenerate.prototype.iuRemoveRange = function(min, max) { |
| | | const $this = this; |
| | | do { |
| | | const folded = caseFold(min); |
| | | if (folded) { |
| | | $this.remove(folded); |
| | | } |
| | | } while (++min <= max); |
| | | return $this; |
| | |
| | | return iuMappings.get(codePoint) || false; |
| | | }; |
| | | |
| | | const processCharacterClass = (characterClassItem, regenerateOptions) => { |
| | | const set = regenerate(); |
/**
 * Builds the set of callbacks used to fold one class item into the running
 * character-class data record for a given set operation.
 *
 * Every returned handler object exposes the same five callbacks, each taking
 * the accumulated `data` record as first argument:
 *   - `single(data, codePoint)`   – a single code point
 *   - `regSet(data, set)`         – a Regenerate set
 *   - `range(data, start, end)`   – an inclusive code point range
 *   - `iuRange(data, start, end)` – the case-folded counterparts of a range
 *   - `nested(data, nestedData)`  – another data record (nested class,
 *                                   property escape or class strings)
 *
 * The `intersection` and `subtraction` handlers consult `data.first`: the
 * first operand seeds the accumulator, later operands are combined with it.
 *
 * @param {'union'|'union-negative'|'intersection'|'subtraction'} action
 * @returns {object} The handler callbacks for `action`.
 * @throws {Error} If `action` is not one of the known set actions.
 */
const buildHandler = (action) => {
  switch (action) {
    case 'union':
      return {
        single: (data, cp) => {
          data.singleChars.add(cp);
        },
        regSet: (data, set2) => {
          data.singleChars.add(set2);
        },
        range: (data, start, end) => {
          data.singleChars.addRange(start, end);
        },
        iuRange: (data, start, end) => {
          data.singleChars.iuAddRange(start, end);
        },
        nested: (data, nestedData) => {
          data.singleChars.add(nestedData.singleChars);
          for (const str of nestedData.longStrings) data.longStrings.add(str);
          if (nestedData.maybeIncludesStrings) data.maybeIncludesStrings = true;
        }
      };
    case 'union-negative': {
      // Union with the complement of the operand: start from every code
      // point, drop the operand, then add back what was already collected.
      const regSet = (data, set2) => {
        data.singleChars = UNICODE_SET.clone().remove(set2).add(data.singleChars);
      };
      return {
        single: (data, cp) => {
          const unicode = UNICODE_SET.clone();
          // If `cp` is already matched, adding "everything but `cp`" yields
          // the full set; otherwise the result is exactly "everything but `cp`".
          data.singleChars = data.singleChars.contains(cp) ? unicode : unicode.remove(cp);
        },
        regSet: regSet,
        range: (data, start, end) => {
          data.singleChars = UNICODE_SET.clone().removeRange(start, end).add(data.singleChars);
        },
        iuRange: (data, start, end) => {
          data.singleChars = UNICODE_SET.clone().iuRemoveRange(start, end).add(data.singleChars);
        },
        nested: (data, nestedData) => {
          regSet(data, nestedData.singleChars);
          // A negated operand that may match strings cannot be complemented.
          if (nestedData.maybeIncludesStrings) throw new Error('ASSERTION ERROR');
        }
      };
    }
    case 'intersection': {
      const regSet = (data, set2) => {
        if (data.first) data.singleChars = set2;
        else data.singleChars.intersection(set2);
      };
      return {
        // The code-point-only operands below can never match a multi-character
        // string, so intersecting with them discards all collected strings.
        single: (data, cp) => {
          data.singleChars = data.first || data.singleChars.contains(cp) ? regenerate(cp) : regenerate();
          data.longStrings.clear();
          data.maybeIncludesStrings = false;
        },
        regSet: (data, set) => {
          regSet(data, set);
          data.longStrings.clear();
          data.maybeIncludesStrings = false;
        },
        range: (data, start, end) => {
          if (data.first) data.singleChars.addRange(start, end);
          else data.singleChars.intersection(regenerate().addRange(start, end));
          data.longStrings.clear();
          data.maybeIncludesStrings = false;
        },
        iuRange: (data, start, end) => {
          if (data.first) data.singleChars.iuAddRange(start, end);
          else data.singleChars.intersection(regenerate().iuAddRange(start, end));
          data.longStrings.clear();
          data.maybeIncludesStrings = false;
        },
        nested: (data, nestedData) => {
          regSet(data, nestedData.singleChars);

          if (data.first) {
            data.longStrings = nestedData.longStrings;
            data.maybeIncludesStrings = nestedData.maybeIncludesStrings;
          } else {
            // Keep only the strings present in both operands.
            for (const str of data.longStrings) {
              if (!nestedData.longStrings.has(str)) data.longStrings.delete(str);
            }
            if (!nestedData.maybeIncludesStrings) data.maybeIncludesStrings = false;
          }
        }
      };
    }
    case 'subtraction': {
      const regSet = (data, set2) => {
        if (data.first) data.singleChars.add(set2);
        else data.singleChars.remove(set2);
      };
      return {
        single: (data, cp) => {
          if (data.first) data.singleChars.add(cp);
          else data.singleChars.remove(cp);
        },
        regSet: regSet,
        range: (data, start, end) => {
          if (data.first) data.singleChars.addRange(start, end);
          else data.singleChars.removeRange(start, end);
        },
        iuRange: (data, start, end) => {
          if (data.first) data.singleChars.iuAddRange(start, end);
          else data.singleChars.iuRemoveRange(start, end);
        },
        nested: (data, nestedData) => {
          regSet(data, nestedData.singleChars);

          if (data.first) {
            data.longStrings = nestedData.longStrings;
            data.maybeIncludesStrings = nestedData.maybeIncludesStrings;
          } else {
            // Drop every string that the subtracted operand also matches.
            for (const str of data.longStrings) {
              if (nestedData.longStrings.has(str)) data.longStrings.delete(str);
            }
          }
        }
      };
    }
    // The `default` clause is only here as a safeguard; it should never be
    // reached. Code coverage tools should ignore it.
    /* istanbul ignore next */
    default:
      // Report the offending `action`; no character class item is in scope here.
      throw new Error(`Unknown set action: ${ action }`);
  }
};
| | | |
/**
 * Creates a fresh, empty character-class data record.
 *
 * @returns {object} A new record with an empty code point set and an empty
 *   string set; `transformed` starts out as `config.transform.unicodeFlag`.
 */
const getCharacterClassEmptyData = () => {
  const emptyData = {
    // Whether the class has to be rewritten in the output pattern.
    transformed: config.transform.unicodeFlag,
    // Code points matched by the class.
    singleChars: regenerate(),
    // Pattern fragments for multi-character strings matched by the class.
    longStrings: new Set(),
    hasEmptyString: false,
    // True until the first class item has been folded in.
    first: true,
    maybeIncludesStrings: false
  };
  return emptyData;
};
| | | |
/**
 * Returns the code point together with its case-folded counterpart when the
 * pattern is case-insensitive and the unicode flag is being transformed.
 *
 * @param {number} codePoint
 * @returns {number[]} `[codePoint, folded]` when a fold applies and exists,
 *   otherwise `[codePoint]`.
 */
const maybeFold = (codePoint) => {
  const foldingApplies = config.flags.ignoreCase && config.transform.unicodeFlag;
  // `caseFold` is only consulted when folding applies.
  const folded = foldingApplies ? caseFold(codePoint) : false;
  return folded ? [codePoint, folded] : [codePoint];
};
| | | |
/**
 * Computes the character-class data for a class-strings item.
 *
 * Single-character strings are added to `singleChars` (with their case fold
 * when applicable); longer strings are rendered to pattern fragments and
 * collected in `longStrings`.
 *
 * @param {object} classStrings - Item whose `strings` each carry a
 *   `characters` array of nodes with a `codePoint`.
 * @param {object} regenerateOptions - Options forwarded to `toString` of the
 *   per-character Regenerate sets.
 * @returns {object} The populated character-class data record.
 */
const computeClassStrings = (classStrings, regenerateOptions) => {
  const result = getCharacterClassEmptyData();

  // Renders one multi-character string as a pattern fragment.
  const stringify = (characters) => {
    if (!(config.flags.ignoreCase && config.transform.unicodeFlag)) {
      return characters.map((ch) => generate(ch)).join('');
    }
    // Case-insensitive: each character becomes a small set holding the
    // character and, if it has one, its case-folded counterpart.
    let fragment = '';
    for (const ch of characters) {
      const alternatives = regenerate(ch.codePoint);
      const folded = caseFold(ch.codePoint);
      if (folded) alternatives.add(folded);
      fragment += alternatives.toString(regenerateOptions);
    }
    return fragment;
  };

  for (const { characters } of classStrings.strings) {
    if (characters.length === 1) {
      for (const cp of maybeFold(characters[0].codePoint)) {
        result.singleChars.add(cp);
      }
      continue;
    }
    result.longStrings.add(stringify(characters));
    result.maybeIncludesStrings = true;
  }

  return result;
};
| | | |
/**
 * Computes the character-class data (code points plus string alternatives)
 * for a character class node, recursing into nested classes.
 *
 * The class `kind` selects which handlers fold positive and negated nested
 * operands into the accumulator:
 *   - union:        positive → union,        negated → union-negative
 *   - intersection: positive → intersection, negated → subtraction
 *   - subtraction:  positive → subtraction,  negated → intersection
 *
 * @param {object} characterClassItem - Character class node with `kind`,
 *   `body` and `negative`.
 * @param {object} regenerateOptions - Options forwarded when class strings
 *   are rendered.
 * @returns {object} The accumulated character-class data record.
 * @throws {SyntaxError} If the class is negated but may match strings.
 * @throws {Error} On an unknown class kind or an unknown body item type.
 */
const computeCharacterClass = (characterClassItem, regenerateOptions) => {
  const data = getCharacterClassEmptyData();

  let handlePositive;
  let handleNegative;

  switch (characterClassItem.kind) {
    case 'union':
      handlePositive = buildHandler('union');
      handleNegative = buildHandler('union-negative');
      break;
    case 'intersection':
      handlePositive = buildHandler('intersection');
      handleNegative = buildHandler('subtraction');
      break;
    case 'subtraction':
      handlePositive = buildHandler('subtraction');
      handleNegative = buildHandler('intersection');
      break;
    // The `default` clause is only here as a safeguard; it should never be
    // reached. Code coverage tools should ignore it.
    /* istanbul ignore next */
    default:
      throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`);
  }

  for (const item of characterClassItem.body) {
    switch (item.type) {
      case 'value':
        maybeFold(item.codePoint).forEach((cp) => {
          handlePositive.single(data, cp);
        });
        break;
      case 'characterClassRange': {
        const min = item.min.codePoint;
        const max = item.max.codePoint;
        handlePositive.range(data, min, max);
        if (config.flags.ignoreCase && config.transform.unicodeFlag) {
          handlePositive.iuRange(data, min, max);
        }
        break;
      }
      case 'characterClassEscape':
        handlePositive.regSet(data, getCharacterClassEscapeSet(
          item.value,
          config.flags.unicode,
          config.flags.ignoreCase
        ));
        break;
      case 'unicodePropertyEscape': {
        const nestedData = getUnicodePropertyEscapeCharacterClassData(item.value, item.negative);
        handlePositive.nested(data, nestedData);
        // The class must be rewritten if property escapes are transformed,
        // or if the `v` flag is transformed and the property matches strings.
        data.transformed =
          data.transformed ||
          config.transform.unicodePropertyEscapes ||
          (config.transform.unicodeSetsFlag && nestedData.maybeIncludesStrings);
        break;
      }
      case 'characterClass': {
        const handler = item.negative ? handleNegative : handlePositive;
        const res = computeCharacterClass(item, regenerateOptions);
        handler.nested(data, res);
        data.transformed = true;
        break;
      }
      case 'classStrings':
        handlePositive.nested(data, computeClassStrings(item, regenerateOptions));
        data.transformed = true;
        break;
      // The `default` clause is only here as a safeguard; it should never be
      // reached. Code coverage tools should ignore it.
      /* istanbul ignore next */
      default:
        throw new Error(`Unknown term type: ${ item.type }`);
    }

    // Intersection/subtraction handlers treat only the first operand as the seed.
    data.first = false;
  }

  if (characterClassItem.negative && data.maybeIncludesStrings) {
    throw new SyntaxError('Cannot negate set containing strings');
  }

  return data;
};
| | | |
/**
 * Rewrites a character class node in place when its computed data says a
 * transformation is needed.
 *
 * @param {object} characterClassItem - The node to rewrite; its `negative`
 *   flag selects the negated output form.
 * @param {object} regenerateOptions - Options forwarded to `toString` of the
 *   code point set.
 * @param {object} [computed] - Precomputed class data; defaults to
 *   `computeCharacterClass(characterClassItem, regenerateOptions)`.
 * @returns {object} `characterClassItem`, updated when `computed.transformed`
 *   is truthy.
 */
const processCharacterClass = (
  characterClassItem,
  regenerateOptions,
  computed = computeCharacterClass(characterClassItem, regenerateOptions)
) => {
  const isNegated = characterClassItem.negative;
  const { singleChars, transformed, longStrings } = computed;
  if (!transformed) {
    return characterClassItem;
  }

  const charsPattern = singleChars.toString(regenerateOptions);

  if (isNegated) {
    if (config.useUnicodeFlag) {
      // Strip the surrounding brackets, if any, so the set can be re-wrapped
      // as a negated class.
      const inner = charsPattern[0] === '[' ? charsPattern.slice(1, -1) : charsPattern;
      update(characterClassItem, `[^${inner}]`);
    } else {
      update(characterClassItem, `(?!${charsPattern})[\\s\\S]`);
    }
    return characterClassItem;
  }

  // Longest alternatives first, so that a longer string wins over its prefix.
  const alternatives = Array.from(longStrings).sort((a, b) => b.length - a.length);
  if (charsPattern !== '[]' || longStrings.size === 0) {
    // The code point set goes last, but before the empty string if present.
    const insertAt = longStrings.has('') ? alternatives.length - 1 : alternatives.length;
    alternatives.splice(insertAt, 0, charsPattern);
  }
  update(characterClassItem, alternatives.join('|'));

  return characterClassItem;
};
| | |
| | | const processTerm = (item, regenerateOptions, groups) => { |
| | | switch (item.type) { |
| | | case 'dot': |
| | | if (config.useDotAllFlag) { |
| | | break; |
| | | } else if (config.unicode) { |
| | | if (config.transform.unicodeFlag) { |
| | | update( |
| | | item, |
| | | getUnicodeDotSet(config.dotAll).toString(regenerateOptions) |
| | | getUnicodeDotSet(config.flags.dotAll).toString(regenerateOptions) |
| | | ); |
| | | } else if (config.dotAll) { |
| | | } else if (config.transform.dotAllFlag) { |
| | | // TODO: consider changing this at the regenerate level. |
| | | update(item, '[\\s\\S]'); |
| | | } |
| | |
| | | item = processCharacterClass(item, regenerateOptions); |
| | | break; |
| | | case 'unicodePropertyEscape': |
| | | if (config.unicodePropertyEscape) { |
| | | const data = getUnicodePropertyEscapeCharacterClassData(item.value, item.negative); |
| | | if (data.maybeIncludesStrings) { |
| | | if (!config.flags.unicodeSets) { |
| | | throw new Error( |
| | | 'Properties of strings are only supported when using the unicodeSets (v) flag.' |
| | | ); |
| | | } |
| | | if (config.transform.unicodeSetsFlag) { |
| | | data.transformed = true; |
| | | item = processCharacterClass(item, regenerateOptions, data); |
| | | } |
| | | } else if (config.transform.unicodePropertyEscapes) { |
| | | update( |
| | | item, |
| | | getUnicodePropertyEscapeSet(item.value, item.negative) |
| | | .toString(regenerateOptions) |
| | | data.singleChars.toString(regenerateOptions) |
| | | ); |
| | | } |
| | | break; |
| | | case 'characterClassEscape': |
| | | update( |
| | | item, |
| | | getCharacterClassEscapeSet( |
| | | item.value, |
| | | config.unicode, |
| | | config.ignoreCase |
| | | ).toString(regenerateOptions) |
| | | ); |
| | | if (config.transform.unicodeFlag) { |
| | | update( |
| | | item, |
| | | getCharacterClassEscapeSet( |
| | | item.value, |
| | | /* config.transform.unicodeFlag implies config.flags.unicode */ true, |
| | | config.flags.ignoreCase |
| | | ).toString(regenerateOptions) |
| | | ); |
| | | } |
| | | break; |
| | | case 'group': |
| | | if (item.behavior == 'normal') { |
| | | groups.lastIndex++; |
| | | } |
| | | if (item.name && config.namedGroup) { |
| | | if (item.name && config.transform.namedGroups) { |
| | | const name = item.name.value; |
| | | |
| | | if (groups.names[name]) { |
| | |
| | | case 'value': |
| | | const codePoint = item.codePoint; |
| | | const set = regenerate(codePoint); |
| | | if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) { |
| | | if (config.flags.ignoreCase && config.transform.unicodeFlag) { |
| | | const folded = caseFold(codePoint); |
| | | if (folded) { |
| | | set.add(folded); |
| | |
| | | }; |
| | | |
// Module-level state describing the pattern currently being rewritten.
// It is overwritten on each call to `rewritePattern`.
const config = {
  // Which flags are present on the input RegExp.
  'flags': {
    'ignoreCase': false,
    'unicode': false,
    'unicodeSets': false,
    'dotAll': false,
  },
  // Which features have to be compiled away in the output pattern.
  'transform': {
    'dotAllFlag': false,
    'unicodeFlag': false,
    'unicodeSetsFlag': false,
    'unicodePropertyEscapes': false,
    'namedGroups': false,
  },
  // True when the pattern is in unicode mode (`u` or `v`) and the output keeps
  // the unicode flag instead of transforming it away.
  get useUnicodeFlag() {
    return (this.flags.unicode || this.flags.unicodeSets) && !this.transform.unicodeFlag;
  }
};
| | | |
/**
 * Validates the user-supplied options object.
 *
 * @param {object} [options] - Options to check; a falsy value is accepted.
 * @throws {Error} If a key is unknown or its value is not allowed for that key.
 */
const validateOptions = (options) => {
  if (!options) {
    return;
  }

  // `null`, `undefined` and `false` all mean "feature not requested".
  const isUnset = (value) => value == null || value === false;

  Object.keys(options).forEach((key) => {
    const value = options[key];
    const isTransformOnlyKey =
      key === 'dotAllFlag' ||
      key === 'unicodeFlag' ||
      key === 'unicodePropertyEscapes' ||
      key === 'namedGroups';

    if (isTransformOnlyKey) {
      if (!isUnset(value) && value !== 'transform') {
        throw new Error(`.${key} must be false (default) or 'transform'.`);
      }
    } else if (key === 'unicodeSetsFlag') {
      if (!isUnset(value) && value !== 'parse' && value !== 'transform') {
        throw new Error(`.${key} must be false (default), 'parse' or 'transform'.`);
      }
    } else if (key === 'onNamedGroup') {
      if (value != null && typeof value !== 'function') {
        throw new Error('.onNamedGroup must be a function.');
      }
    } else {
      throw new Error(`.${key} is not a valid regexpu-core option.`);
    }
  });
};
| | | |
// Reports whether `flags` contains `flag`; a missing or empty flags value
// never contains any flag.
const hasFlag = (flags, flag) => {
  if (!flags) {
    return false;
  }
  return flags.includes(flag);
};
// Reports whether the option `name` is set to 'transform'; missing options
// never request a transform.
const transform = (options, name) => {
  if (!options) {
    return false;
  }
  return options[name] === 'transform';
};
| | | |
| | | const rewritePattern = (pattern, flags, options) => { |
| | | validateOptions(options); |
| | | |
| | | config.flags.unicode = hasFlag(flags, 'u'); |
| | | config.flags.unicodeSets = hasFlag(flags, 'v'); |
| | | config.flags.ignoreCase = hasFlag(flags, 'i'); |
| | | config.flags.dotAll = hasFlag(flags, 's'); |
| | | |
| | | config.transform.dotAllFlag = config.flags.dotAll && transform(options, 'dotAllFlag'); |
| | | config.transform.unicodeFlag = (config.flags.unicode || config.flags.unicodeSets) && transform(options, 'unicodeFlag'); |
| | | config.transform.unicodeSetsFlag = config.flags.unicodeSets && transform(options, 'unicodeSetsFlag'); |
| | | |
| | | // unicodeFlag: 'transform' implies unicodePropertyEscapes: 'transform' |
| | | config.transform.unicodePropertyEscapes = config.flags.unicode && ( |
| | | transform(options, 'unicodeFlag') || transform(options, 'unicodePropertyEscapes') |
| | | ); |
| | | config.transform.namedGroups = transform(options, 'namedGroups'); |
| | | |
| | | const regjsparserFeatures = { |
| | | 'unicodeSet': Boolean(options && options.unicodeSetsFlag), |
| | | |
| | | // Enable every stable RegExp feature by default |
| | | 'unicodePropertyEscape': true, |
| | | 'namedGroups': true, |
| | | 'lookbehind': true, |
| | | }; |
| | | |
| | | const regenerateOptions = { |
| | | 'hasUnicodeFlag': config.useUnicodeFlag, |
| | | 'bmpOnly': !config.unicode |
| | | 'bmpOnly': !config.flags.unicode |
| | | }; |
| | | |
| | | const groups = { |
| | | 'onNamedGroup': options && options.onNamedGroup, |
| | | 'lastIndex': 0, |
| | | 'names': Object.create(null), // { [name]: index } |
| | | 'unmatchedReferences': Object.create(null) // { [name]: Array<reference> } |
| | | }; |
| | | |
| | | const tree = parse(pattern, flags, regjsparserFeatures); |
| | | // Note: `processTerm` mutates `tree` and `groups`. |
| | | processTerm(tree, regenerateOptions, groups); |