diff --git a/src/1p-filters.html b/src/1p-filters.html index e17733d53..ecd186277 100644 --- a/src/1p-filters.html +++ b/src/1p-filters.html @@ -49,6 +49,7 @@ + diff --git a/src/about.html b/src/about.html index 71a321ec7..02d03547e 100644 --- a/src/about.html +++ b/src/about.html @@ -39,6 +39,7 @@
Inter font family by Rasmus Andersson
FontAwesome font family by Dave Gandy
An implementation of Myers' diff algorithm by Arpad Borsos
+
Regular Expression Analyzer by Nikos M.

diff --git a/src/asset-viewer.html b/src/asset-viewer.html index dfcf92c32..d94fc0bdc 100644 --- a/src/asset-viewer.html +++ b/src/asset-viewer.html @@ -33,6 +33,7 @@ + diff --git a/src/background.html b/src/background.html index 108ebd89c..7d183dfa4 100644 --- a/src/background.html +++ b/src/background.html @@ -9,6 +9,7 @@ + diff --git a/src/js/codemirror/ubo-static-filtering.js b/src/js/codemirror/ubo-static-filtering.js index f4f07f5ce..0d38c14ed 100644 --- a/src/js/codemirror/ubo-static-filtering.js +++ b/src/js/codemirror/ubo-static-filtering.js @@ -274,7 +274,9 @@ CodeMirror.defineMode('ubo-static-filtering', function() { if ( parser.patternIsRegex() ) { stream.pos = parser.slices[parser.optionsAnchorSpan.i+1]; parserSlot = parser.optionsAnchorSpan.i; - return 'variable notice'; + return parser.patternIsTokenizable() + ? 'variable notice' + : 'variable warning'; } if ( (parser.slices[parserSlot] & (parser.BITAsterisk | parser.BITCaret)) !== 0 ) { stream.pos += parser.slices[parserSlot+2]; diff --git a/src/js/static-filtering-parser.js b/src/js/static-filtering-parser.js index 3d4a4cb45..d3c81d46e 100644 --- a/src/js/static-filtering-parser.js +++ b/src/js/static-filtering-parser.js @@ -1003,6 +1003,18 @@ const Parser = class { return (this.flavorBits & BITFlavorNetRegex) !== 0; } + patternIsTokenizable() { + // TODO: not necessarily true, this needs more work. + if ( this.patternIsRegex === false ) { return true; } + const s = Parser.tokenizableStrFromRegex(this.getNetPattern()); + try { + return /(? { + + const firstCharCodeClass = s => { + return /^[\x01%0-9A-Za-z]/.test(s) ? 1 : 0; + }; + + const lastCharCodeClass = s => { + return /[\x01%0-9A-Za-z]$/.test(s) ? 1 : 0; + }; + + const toTokenizableString = node => { + switch ( node.type ) { + case 1: /* T_SEQUENCE, 'Sequence' */ { + let s = ''; + for ( let i = 0; i < node.val.length; i++ ) { + s += toTokenizableString(node.val[i]); + } + return s; + } + case 2: /* T_ALTERNATION,'Alternation' */ + case 8: /* T_CHARGROUP, 'CharacterGroup' */ { + let firstChar = 0; + let lastChar = 0; + for ( let i = 0; i < node.val.length; i++ ) { + const s = toTokenizableString(node.val[i]); + if ( firstChar === 0 && firstCharCodeClass(s) === 1 ) { + firstChar = 1; + } + if ( lastChar === 0 && lastCharCodeClass(s) === 1 ) { + lastChar = 1; + } + if ( firstChar === 1 && lastChar === 1 ) { break; } + } + return String.fromCharCode(firstChar, lastChar); + } + case 4: /* T_GROUP, 'Group' */ { + if ( node.flags.NegativeLookAhead === 1 ) { return '\x01'; } + if ( node.flags.NegativeLookBehind === 1 ) { return '\x01'; } + return toTokenizableString(node.val); + } + case 16: /* T_QUANTIFIER, 'Quantifier' */ { + const s = toTokenizableString(node.val); + const first = firstCharCodeClass(s); + const last = lastCharCodeClass(s); + if ( node.flags.min === 0 && first === 0 && last === 0 ) { + return ''; + } + return String.fromCharCode(first, last); + } + case 64: /* T_HEXCHAR, 'HexChar' */ { + return String.fromCharCode(parseInt(node.val.slice(1), 16)); + } + case 128: /* T_SPECIAL, 'Special' */ { + const flags = node.flags; + if ( flags.MatchEnd === 1 ) { return '\x00'; } + if ( flags.MatchStart === 1 ) { return '\x00'; } + if ( flags.MatchWordBoundary === 1 ) { return '\x00'; } + return '\x01'; + } + case 256: /* T_CHARS, 'Characters' */ { + for ( let i = 0; i < node.val.length; i++ ) { + if ( firstCharCodeClass(node.val[i]) === 1 ) { + return '\x01'; + } + } + return '\x00'; + } + // Ranges are assumed to always involve token-related characters. + case 512: /* T_CHARRANGE, 'CharacterRange' */ { + return '\x01'; + } + case 1024: /* T_STRING, 'String' */ { + return node.val; + } + case 2048: /* T_COMMENT, 'Comment' */ { + return ''; + } + default: + break; + } + return '\x01'; + }; + + return function(reStr) { + if ( + self.Regex instanceof Object === false || + self.Regex.Analyzer instanceof Object === false + ) { + return ''; + } + try { + return toTokenizableString(self.Regex.Analyzer(reStr, false).tree()); + } catch(ex) { + } + return ''; + }; +})(); + +/******************************************************************************/ + if ( typeof vAPI === 'object' && vAPI !== null ) { vAPI.StaticFilteringParser = Parser; } else { diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js index 335f52f15..1edc3619c 100644 --- a/src/js/static-net-filtering.js +++ b/src/js/static-net-filtering.js @@ -2622,15 +2622,9 @@ const FilterParser = class { if ( other !== undefined ) { return Object.assign(this, other); } - this.cantWebsocket = vAPI.cantWebsocket; this.noTokenHash = urlTokenizer.noTokenHash; - this.reIsolateHostname = /^(\*?\.)?([^\x00-\x24\x26-\x2C\x2F\x3A-\x5E\x60\x7B-\x7F]+)(.*)/; this.reBadCSP = /(?:=|;)\s*report-(?:to|uri)\b/; this.reToken = /[%0-9A-Za-z]+/g; - this.reRegexTokenAbort = /[\(\)\[\]]/; - this.reRegexBadPrefix = /(^|[^\\]\.|\\[%SDWsdw]|[^\\][()*+?[\\\]{}])$/; - this.reRegexBadSuffix = /^([^\\]\.|\\[%SDWsdw]|[()*+?[\]{}]|$)/; - this.reGoodToken = /[%0-9a-z]{1,}/g; this.domainOptList = []; this.tokenIdToNormalizedType = new Map([ [ parser.OPTTokenCname, bitFromType('cname') ], @@ -3175,32 +3169,22 @@ const FilterParser = class { // not `bads`. extractTokenFromRegex() { this.reToken.lastIndex = 0; - const pattern = this.pattern; + const pattern = + vAPI.StaticFilteringParser.tokenizableStrFromRegex(this.pattern); let bestToken; let bestBadness = 0x7FFFFFFF; for (;;) { const matches = this.reToken.exec(pattern); if ( matches === null ) { break; } - let token = matches[0]; - let prefix = pattern.slice(0, matches.index); - let suffix = pattern.slice(this.reToken.lastIndex); - if ( - this.reRegexTokenAbort.test(prefix) && - this.reRegexTokenAbort.test(suffix) - ) { + const { 0: token, index } = matches; + if ( index === 0 || pattern.charAt(index - 1) === '\x01' ) { continue; } - if ( token.charCodeAt(0) === 0x62 /* 'b' */ ) { - const match = /\\+$/.exec(prefix); - if ( match !== null && (match[0].length & 1) !== 0 ) { - prefix += 'b'; - token = token.slice(1); - } - } + const { lastIndex } = this.reToken; if ( - this.reRegexBadPrefix.test(prefix) || ( - token.length < this.maxTokenLen && - this.reRegexBadSuffix.test(suffix) + token.length < this.maxTokenLen && ( + lastIndex === pattern.length || + pattern.charAt(lastIndex) === '\x01' ) ) { continue; diff --git a/src/lib/regexanalyzer/README.md b/src/lib/regexanalyzer/README.md new file mode 100644 index 000000000..8b9c2706a --- /dev/null +++ b/src/lib/regexanalyzer/README.md @@ -0,0 +1,14 @@ +https://github.com/foo123/RegexAnalyzer/issues/1#issuecomment-750039255 + +> The (implied) license is as free as it can get. You can modify it and use +> it anywhere you want if it suits you. +> +> An attribution to original author would be appreciated but even this is not +> mandatory. +> +> Copy Left + +References: + +- https://en.wikipedia.org/wiki/Copyleft +- http://gplv3.fsf.org/wiki/index.php/Compatible_licenses diff --git a/src/lib/regexanalyzer/regex.js b/src/lib/regexanalyzer/regex.js new file mode 100644 index 000000000..788f03ee8 --- /dev/null +++ b/src/lib/regexanalyzer/regex.js @@ -0,0 +1,2156 @@ +/** +* +* Regex +* @version: 1.1.0 +* +* A simple & generic Regular Expression Analyzer & Composer for PHP, Python, Node.js / Browser / XPCOM Javascript +* https://github.com/foo123/RegexAnalyzer +* +**/ +!function( root, name, factory ){ +"use strict"; +if ( ('undefined'!==typeof Components)&&('object'===typeof Components.classes)&&('object'===typeof Components.classesByID)&&Components.utils&&('function'===typeof Components.utils['import']) ) /* XPCOM */ + (root.$deps = root.$deps||{}) && (root.EXPORTED_SYMBOLS = [name]) && (root[name] = root.$deps[name] = factory.call(root)); +else if ( ('object'===typeof module)&&module.exports ) /* CommonJS */ + (module.$deps = module.$deps||{}) && (module.exports = module.$deps[name] = factory.call(root)); +else if ( ('undefined'!==typeof System)&&('function'===typeof System.register)&&('function'===typeof System['import']) ) /* ES6 module */ + System.register(name,[],function($__export){$__export(name, factory.call(root));}); +else if ( ('function'===typeof define)&&define.amd&&('function'===typeof require)&&('function'===typeof require.specified)&&require.specified(name) /*&& !require.defined(name)*/ ) /* AMD */ + define(name,['module'],function(module){factory.moduleUri = module.uri; return factory.call(root);}); +else if ( !(name in root) ) /* Browser/WebWorker/.. */ + (root[name] = factory.call(root)||1)&&('function'===typeof(define))&&define.amd&&define(function(){return root[name];} ); +}( /* current root */ 'undefined' !== typeof self ? self : this, + /* module name */ "Regex", + /* module factory */ function ModuleFactory__Regex( undef ){ +"use strict"; +var __version__ = "1.1.0", + + PROTO = 'prototype', OP = Object[PROTO], AP = Array[PROTO], + Keys = Object.keys, to_string = OP.toString, HAS = OP.hasOwnProperty, + fromCharCode = String.fromCharCode, CHAR = 'charAt', CHARCODE = 'charCodeAt', toJSON = JSON.stringify, + INF = Infinity, ESC = '\\', + specialChars = { + "." : "MatchAnyChar", + "|" : "MatchEither", + "?" : "MatchZeroOrOne", + "*" : "MatchZeroOrMore", + "+" : "MatchOneOrMore", + "^" : "MatchStart", + "$" : "MatchEnd", + "{" : "StartRepeats", + "}" : "EndRepeats", + "(" : "StartGroup", + ")" : "EndGroup", + "[" : "StartCharGroup", + "]" : "EndCharGroup" + }, + /* + http://www.javascriptkit.com/javatutors/redev2.shtml + + \f matches form-feed. + \r matches carriage return. + \n matches linefeed. + \t matches horizontal tab. + \v matches vertical tab. + \0 matches NUL character. + [\b] matches backspace. + \s matches whitespace (short for [\f\n\r\t\v\u00A0\u2028\u2029]). + \S matches anything but a whitespace (short for [^\f\n\r\t\v\u00A0\u2028\u2029]). + \w matches any alphanumerical character (word characters) including underscore (short for [a-zA-Z0-9_]). + \W matches any non-word characters (short for [^a-zA-Z0-9_]). + \d matches any digit (short for [0-9]). + \D matches any non-digit (short for [^0-9]). + \b matches a word boundary (the position between a word and a space). + \B matches a non-word boundary (short for [^\b]). + \cX matches a control character. E.g: \cm matches control-M. + \xhh matches the character with two characters of hexadecimal code hh. + \uhhhh matches the Unicode character with four characters of hexadecimal code hhhh. + */ + specialCharsEscaped = { + "\\" : "ESC", + "/" : "/", + "0" : "NULChar", + "f" : "FormFeed", + "n" : "LineFeed", + "r" : "CarriageReturn", + "t" : "HorizontalTab", + "v" : "VerticalTab", + "b" : "MatchWordBoundary", + "B" : "MatchNonWordBoundary", + "s" : "MatchSpaceChar", + "S" : "MatchNonSpaceChar", + "w" : "MatchWordChar", + "W" : "MatchNonWordChar", + "d" : "MatchDigitChar", + "D" : "MatchNonDigitChar" + }, + T_SEQUENCE = 1, + T_ALTERNATION = 2, + T_GROUP = 4, + T_CHARGROUP = 8, + T_QUANTIFIER = 16, + T_UNICODECHAR = 32, + T_HEXCHAR = 64, + T_SPECIAL = 128, + T_CHARS = 256, + T_CHARRANGE = 512, + T_STRING = 1024, + T_COMMENT = 2048 +; + +function is_array( x ) +{ + return (x instanceof Array) || ('[object Array]' === to_string.call(x)); +} +function is_string( x ) +{ + return (x instanceof String) || ('[object String]' === to_string.call(x)); +} +function is_regexp( x ) +{ + return (x instanceof RegExp) || ('[object RegExp]' === to_string.call(x)); +} +function array( x ) +{ + return is_array(x) ? x : [x]; +} +function clone( obj, cloned ) +{ + cloned = cloned || {}; + for (var p in obj) if ( HAS.call(obj,p) ) cloned[p] = obj[p]; + return cloned; +} +function RE_OBJ( re ) +{ + var self = this; + self.re = re; + self.len = re.length; + self.pos = 0; + self.index = 0; + self.groupIndex = 0; + self.group = {}; + self.inGroup = 0; +} +RE_OBJ[PROTO] = { + constructor: RE_OBJ + ,re: null + ,len: null + ,pos: null + ,index: null + ,groupIndex: null + ,inGroup: null + ,groups: null + ,dispose: function( ) { + var self = this; + self.re = null; + self.len = null; + self.pos = null; + self.index = null; + self.groupIndex = null; + self.group = null; + self.inGroup = null; + } +}; +function Node( type, value, flags ) +{ + var self = this; + if ( !(self instanceof Node) ) return new Node(type, value, flags); + self.type = type; + self.val = value; + self.flags = flags || {}; + switch(type) + { + case T_SEQUENCE: + self.typeName = "Sequence"; break; + case T_ALTERNATION: + self.typeName = "Alternation"; break; + case T_GROUP: + self.typeName = "Group"; break; + case T_CHARGROUP: + self.typeName = "CharacterGroup"; break; + case T_CHARS: + self.typeName = "Characters"; break; + case T_CHARRANGE: + self.typeName = "CharacterRange"; break; + case T_STRING: + self.typeName = "String"; break; + case T_QUANTIFIER: + self.typeName = "Quantifier"; break; + case T_UNICODECHAR: + self.typeName = "UnicodeChar"; break; + case T_HEXCHAR: + self.typeName = "HexChar"; break; + case T_SPECIAL: + self.typeName = "Special"; break; + case T_COMMENT: + self.typeName = "Comment"; break; + default: + self.typeName = "unspecified"; break; + } +}; +Node.toObjectStatic = function toObject( v ) { + if (v instanceof Node) + { + return v.flags && Object.keys(v.flags).length ? { + type: v.typeName, + value: toObject(v.val), + flags: v.flags + } : { + type: v.typeName, + value: toObject(v.val) + }; + } + else if (is_array(v)) + { + return v.map(toObject); + } + return v; +}; +Node[PROTO] = { + constructor: Node + ,type: null + ,typeName: null + ,val: null + ,flags: null + ,dispose: function( ) { + var self = this; + self.val = null; + self.flags = null; + self.type = null; + self.typeName = null; + return self; + } + ,toObject: function( ) { + return Node.toObjectStatic(this); + } +}; + +var rnd = function( a, b ){ return Math.round((b-a)*Math.random()+a); }, + RE = function( re, fl ){ return new RegExp(re, fl||''); }, + slice = function( a ) { return AP.slice.apply(a, AP.slice.call(arguments, 1)); }, + flatten = function( a ) { + var r = [], i = 0; + while (i < a.length) r = r.concat(a[i++]); + return r; + }, + getArgs = function( args, asArray ) { + /*var a = slice(args); + if ( asArray && a[0] && + ( a[0] instanceof Array || '[object Array]' == to_string.call(a[0]) ) + ) + a = a[0];*/ + return flatten( slice( args ) ); //a; + }, + esc_re = function( s, esc, chargroup ) { + var es = '', l = s.length, i=0, c; + //escaped_re = /([.*+?^${}()|[\]\/\\\-])/g + if ( chargroup ) + { + while( i < l ) + { + c = s[CHAR](i++); + es += (/*('?' === c) || ('*' === c) || ('+' === c) ||*/ + ('-' === c) || /*('.' === c) ||*/ ('^' === c) || ('$' === c) || ('|' === c) || + ('{' === c) || ('}' === c) || ('(' === c) || (')' === c) || + ('[' === c) || (']' === c) || ('/' === c) || (esc === c) ? esc : '') + c; + } + } + else + { + while( i < l ) + { + c = s[CHAR](i++); + es += (('?' === c) || ('*' === c) || ('+' === c) || + /*('-' === c) ||*/ ('.' === c) || ('^' === c) || ('$' === c) || ('|' === c) || + ('{' === c) || ('}' === c) || ('(' === c) || (')' === c) || + ('[' === c) || (']' === c) || ('/' === c) || (esc === c) ? esc : '') + c; + } + } + return es; + }, + pad = function( s, n, z ) { + var ps = String(s); + z = z || '0'; + while ( ps.length < n ) ps = z + ps; + return ps; + }, + char_code = function( c ) { return c[CHARCODE](0); }, + char_code_range = function( s ) { return [s[CHARCODE](0), s[CHARCODE](s.length-1)]; }, + //char_codes = function( s_or_a ) { return (s_or_a.substr ? s_or_a.split("") : s_or_a).map( char_code ); }, + // http://stackoverflow.com/questions/12376870/create-an-array-of-characters-from-specified-range + character_range = function(first, last) { + if ( first && is_array(first) ) { last = first[1]; first = first[0]; } + var ch, chars, start = first[CHARCODE](0), end = last[CHARCODE](0); + + if ( end === start ) return [ fromCharCode( start ) ]; + + chars = []; + for (ch = start; ch <= end; ++ch) chars.push( fromCharCode( ch ) ); + return chars; + }, + concat = function(p1, p2) { + if ( p2 ) + { + var p, l; + if ( is_array(p2) ) + { + for (p=0,l=p2.length; p= minlen ? l : false; + }, + match_char_range = function( RANGE, s, pos, minlen, maxlen ) { + pos = pos || 0; + minlen = minlen || 1; + maxlen = maxlen || INF; + var lp = pos, l = 0, sl = s.length, ch; + while ( (lp < sl) && (l <= maxlen) && ((ch=s[CHARCODE](lp)) >= RANGE[0] && ch <= RANGE[1]) ) + { + lp++; l++; + } + return l >= minlen ? l : false; + }, + match_char_ranges = function( RANGES, s, pos, minlen, maxlen ) { + pos = pos || 0; + minlen = minlen || 1; + maxlen = maxlen || INF; + var lp = pos, l = 0, sl = s.length, ch, + i, Rl = RANGES.length, RANGE, found = true; + while ( (lp < sl) && (l <= maxlen) && found ) + { + ch = s[CHARCODE](lp); found = false; + for (i=0; i= RANGE[0] && ch <= RANGE[1] ) + { + lp++; l++; found = true; + break; + } + } + } + return l >= minlen ? l : false; + }, + + punct = function( ){ + return PUNCTS[CHAR](rnd(0, PUNCTS.length-1)); + }, + space = function( positive ){ + return false !== positive + ? SPACES[CHAR](rnd(0, SPACES.length-1)) + : (punct()+digit()+alpha())[CHAR](rnd(0,2)) + ; + }, + digit = function( positive ){ + return false !== positive + ? DIGITS[CHAR](rnd(0, DIGITS.length-1)) + : (punct()+space()+alpha())[CHAR](rnd(0,2)) + ; + }, + alpha = function( positive ){ + return false !== positive + ? ALPHAS[CHAR](rnd(0, ALPHAS.length-1)) + : (punct()+space()+digit())[CHAR](rnd(0,2)) + ; + }, + word = function( positive ){ + return false !== positive + ? (ALPHAS+DIGITS)[CHAR](rnd(0, ALPHAS.length+DIGITS.length-1)) + : (punct()+space())[CHAR](rnd(0,1)) + ; + }, + any = function( ){ + return ALL[CHAR](rnd(0, ALL.length-1)); + }, + character = function( chars, positive ){ + if ( false !== positive ) return chars.length ? chars[rnd(0, chars.length-1)] : ''; + var choices = ALL_ARY.filter(function(c){ return 0 > chars.indexOf(c); }); + return choices.length ? choices[rnd(0, choices.length-1)] : ''; + }, + random_upper_or_lower = function( c ) { return 0.5 < Math.random() ? c.toLowerCase( ) : c.toUpperCase( ); }, + case_insensitive = function( chars, asArray ) { + if ( asArray ) + { + if ( chars[CHAR] ) chars = chars.split(''); + chars = chars.map( random_upper_or_lower ); + //if ( !asArray ) chars = chars.join(''); + return chars; + } + else + { + return random_upper_or_lower( chars ); + } + }, + + walk = function walk( ret, node, state ) { + if ( (null == node) || !state ) return ret; + + var i, l, r, type = node instanceof Node ? node.type : null; + + // walk the tree + if ( null === type ) + { + // custom, let reduce handle it + ret = state.reduce( ret, node, state ); + } + + else if ( state.IGNORE & type ) + { + /* nothing */ + } + + else if ( state.MAP & type ) + { + r = state.map( ret, node, state ); + if ( null != state.ret ) + { + ret = state.reduce( ret, node, state ); + state.ret = null; + } + else if ( null != r ) + { + r = array(r); + for(i=0,l=r?r.length:0; i= state.maxLength ) + { + numrepeats = node.flags.min; + } + else + { + mmin = node.flags.min; + mmax = -1 === node.flags.max ? (mmin+1+2*state.maxLength) : node.flags.max; + numrepeats = rnd(mmin, mmax); + } + if ( numrepeats ) + { + repeats = new Array(numrepeats); + for(var i=0; i max ) + { + max = cur; + } + } + } + if ( l ) state.ret = max; + return null; + } + else if ( T_CHARGROUP === type ) + { + return node.val.length ? node.val[0] : null; + } + else if ( T_QUANTIFIER === type ) + { + max = walk(0, node.val, state); + if ( -1 === max ) + { + state.ret = -1; + } + else if ( 0 < max ) + { + if ( -1 === node.flags.max ) + { + state.ret = -1; + } + else if ( 0 < node.flags.max ) + { + state.ret = node.flags.max*max; + } + else + { + state.ret = max; + } + } + return null; + } + else if ( (T_GROUP === type) && node.flags.GroupIndex ) + { + var max = walk(0, node.val, state); + state.group[node.flags.GroupIndex] = max; + state.ret = max; + return null; + } + else + { + return node.val; + } + }, + map_1st = function map_1st( ret, node, state ) { + var type = node.type; + if ( T_SEQUENCE === type ) + { + var seq=[], i=0, l=node.val.length, n; + for(i=0; i 2) && ('x' === s[CHAR](0)) ) + { + if ( match_char_ranges(HEXDIGITS_RANGES, s, 1, 2, 2) ) return [m=s.slice(0,3), m.slice(1)]; + } + return false; + }, + match_unicode = function( s ) { + var m = false; + if ( (s.length > 4) && ('u' === s[CHAR](0)) ) + { + if ( match_char_ranges(HEXDIGITS_RANGES, s, 1, 4, 4) ) return [m=s.slice(0,5), m.slice(1)]; + } + return false; + }, + match_repeats = function( s ) { + var l, sl = s.length, pos = 0, m = false, hasComma = false; + if ( (sl > 2) && ('{' === s[CHAR](pos)) ) + { + m = ['', '', null]; + pos++; + if ( l=match_chars(SPACES, s, pos) ) pos += l; + if ( l=match_char_range(DIGITS_RANGE, s, pos) ) + { + m[1] = s.slice(pos, pos+l); + pos += l; + } + else + { + return false; + } + if ( l=match_chars(SPACES, s, pos) ) pos += l; + if ( (pos < sl) && (',' === s[CHAR](pos)) ) {pos += 1; hasComma = true;} + if ( l=match_chars(SPACES, s, pos) ) pos += l; + if ( l=match_char_range(DIGITS_RANGE, s, pos) ) + { + m[2] = s.slice(pos, pos+l); + pos += l; + } + if ( l=match_chars(SPACES, s, pos) ) pos += l; + if ( (pos < sl) && ('}' === s[CHAR](pos)) ) + { + pos++; + m[0] = s.slice(0, pos); + if ( !hasComma ) m[2] = m[1]; + return m; + } + else + { + return false; + } + } + return false; + }, + chargroup = function chargroup( re_obj ) { + var sequence = [], chars = [], allchars = [], flags = {}, flag, ch, lre, + prevch, range, isRange = false, m, isUnicode, isHex, escaped = false; + + if ( '^' === re_obj.re[CHAR]( re_obj.pos ) ) + { + flags[ "NegativeMatch" ] = 1; + re_obj.pos++; + } + + lre = re_obj.len; + while ( re_obj.pos < lre ) + { + isUnicode = false; + isHex = false; + m = null; + prevch = ch; + ch = re_obj.re[CHAR]( re_obj.pos++ ); + + escaped = ESC === ch; + if ( escaped ) ch = re_obj.re[CHAR]( re_obj.pos++ ); + + if ( escaped ) + { + // unicode character + if ( 'u' === ch ) + { + m = match_unicode( re_obj.re.substr( re_obj.pos-1 ) ); + re_obj.pos += m[0].length-1; + ch = Node(T_UNICODECHAR, m[0], {"Char": fromCharCode(parseInt(m[1], 16)), "Code": m[1]}); + isUnicode = true; isHex = false; + } + + // hex character + else if ( 'x' === ch ) + { + m = match_hex( re_obj.re.substr( re_obj.pos-1 ) ); + re_obj.pos += m[0].length-1; + ch = Node(T_HEXCHAR, m[0], {"Char": fromCharCode(parseInt(m[1], 16)), "Code": m[1]}); + isUnicode = true; isHex = true; + } + } + + if ( isRange ) + { + if ( chars.length ) + { + allchars = allchars.concat( chars ); + chars = []; + } + range[1] = ch; + isRange = false; + sequence.push( Node(T_CHARRANGE, range) ); + } + else + { + if ( escaped ) + { + if ( isUnicode ) + { + if ( chars.length ) + { + allchars = allchars.concat( chars ); + chars = []; + } + sequence.push( ch ); + } + + else if ( HAS.call(specialCharsEscaped,ch) && ('/' !== ch) ) + { + if ( chars.length ) + { + allchars = allchars.concat( chars ); + chars = []; + } + flag = {}; + flag[ specialCharsEscaped[ch] ] = 1; + sequence.push( Node(T_SPECIAL, ch, flag) ); + } + + else + { + chars.push( ch ); + } + } + + else + { + // end of char group + if ( ']' === ch ) + { + if ( chars.length ) + { + allchars = allchars.concat( chars ); + chars = []; + } + // map all chars into one node + if ( allchars.length ) sequence.push( Node(T_CHARS, allchars) ); + return Node(T_CHARGROUP, sequence, flags); + } + + else if ( '-' === ch ) + { + range = [prevch, '']; + if ( prevch instanceof Node ) sequence.pop(); else chars.pop(); + isRange = true; + } + + else + { + chars.push( ch ); + } + } + } + } + if ( chars.length ) + { + allchars = allchars.concat( chars ); + chars = []; + } + // map all chars into one node + if ( allchars.length ) sequence.push( Node(T_CHARS, allchars) ); + return Node(T_CHARGROUP, sequence, flags); + }, + + analyze_re = function analyze_re( re_obj ) { + var lre, ch, m, word = '', wordlen = 0, + alternation = [], sequence = [], flags = {}, + flag, escaped = false, pre, pre3, captured; + + if ( re_obj.inGroup > 0 ) + { + pre = re_obj.re.substr(re_obj.pos, 2); + pre3 = re_obj.re.substr(re_obj.pos, 3); + captured = 1; + + if ( "?P=" === pre3 ) + { + flags[ "BackReference" ] = 1; + flags[ "GroupName" ] = ''; + re_obj.pos += 3; + lre = re_obj.len; + while ( re_obj.pos < lre ) + { + ch = re_obj.re[CHAR]( re_obj.pos++ ); + if ( ")" === ch ) break; + flags[ "GroupName" ] += ch; + } + flags[ "GroupIndex" ] = HAS.call(re_obj.group,flags[ "GroupName" ]) ? re_obj.group[flags[ "GroupName" ]] : null; + return Node(T_SPECIAL, String(flags[ "GroupIndex" ]), flags); + } + + else if ( "?#" === pre ) + { + flags[ "Comment" ] = 1; + re_obj.pos += 2; + word = ''; + lre = re_obj.len; + while ( re_obj.pos < lre ) + { + ch = re_obj.re[CHAR]( re_obj.pos++ ); + if ( ")" === ch ) break; + word += ch; + } + return Node(T_COMMENT, word); + } + + else if ( "?:" === pre ) + { + flags[ "NotCaptured" ] = 1; + re_obj.pos += 2; + captured = 0; + } + + else if ( "?=" === pre ) + { + flags[ "LookAhead" ] = 1; + re_obj.pos += 2; + captured = 0; + } + + else if ( "?!" === pre ) + { + flags[ "NegativeLookAhead" ] = 1; + re_obj.pos += 2; + captured = 0; + } + + else if ( "?<=" === pre3 ) + { + flags[ "LookBehind" ] = 1; + re_obj.pos += 3; + captured = 0; + } + + else if ( "?" === ch ) break; + flags[ "GroupName" ] += ch; + } + } + + ++re_obj.index; + if ( captured ) + { + ++re_obj.groupIndex; + flags[ "GroupIndex" ] = re_obj.groupIndex; + re_obj.group[flags[ "GroupIndex" ]] = flags[ "GroupIndex" ]; + if ( flags[ "GroupName" ] ) re_obj.group[flags[ "GroupName" ]] = flags[ "GroupIndex" ]; + } + } + + lre = re_obj.len; + while ( re_obj.pos < lre ) + { + ch = re_obj.re[CHAR]( re_obj.pos++ ); + + // \\abc + escaped = ESC === ch; + if ( escaped ) ch = re_obj.re[CHAR]( re_obj.pos++ ); + + if ( escaped ) + { + // unicode character + if ( 'u' === ch ) + { + if ( wordlen ) + { + sequence.push( Node(T_STRING, word) ); + word = ''; + wordlen = 0; + } + m = match_unicode( re_obj.re.substr( re_obj.pos-1 ) ); + re_obj.pos += m[0].length-1; + sequence.push( Node(T_UNICODECHAR, m[0], {"Char": fromCharCode(parseInt(m[1], 16)), "Code": m[1]}) ); + } + + // hex character + else if ( 'x' === ch ) + { + if ( wordlen ) + { + sequence.push( Node(T_STRING, word) ); + word = ''; + wordlen = 0; + } + m = match_hex( re_obj.re.substr( re_obj.pos-1 ) ); + re_obj.pos += m[0].length-1; + sequence.push( Node(T_HEXCHAR, m[0], {"Char": fromCharCode(parseInt(m[1], 16)), "Code": m[1]}) ); + } + + else if ( HAS.call(specialCharsEscaped,ch) && ('/' !== ch) ) + { + if ( wordlen ) + { + sequence.push( Node(T_STRING, word) ); + word = ''; + wordlen = 0; + } + flag = {}; + flag[ specialCharsEscaped[ch] ] = 1; + sequence.push( Node(T_SPECIAL, ch, flag) ); + } + + else if ( ('1' <= ch) && ('9' >= ch) ) + { + if ( wordlen ) + { + sequence.push( Node(T_STRING, word) ); + word = ''; + wordlen = 0; + } + word = ch; + while (re_obj.pos < lre) + { + ch = re_obj.re[CHAR]( re_obj.pos ); + if ( ('0' <= ch) && ('9' >= ch) ) { word += ch; re_obj.pos++; } + else break; + } + flag = {}; + flag[ 'BackReference' ] = 1; + flag[ 'GroupIndex' ] = parseInt(word, 10); + sequence.push( Node(T_SPECIAL, word, flag) ); + word = ''; + } + + else + { + word += ch; + wordlen += 1; + } + } + + else + { + // group end + if ( (re_obj.inGroup > 0) && (')' === ch) ) + { + if ( wordlen ) + { + sequence.push( Node(T_STRING, word) ); + word = ''; + wordlen = 0; + } + if ( alternation.length ) + { + alternation.push( Node(T_SEQUENCE, sequence) ); + sequence = []; + flag = {}; + flag[ specialChars['|'] ] = 1; + return Node(T_GROUP, Node(T_ALTERNATION, alternation, flag), flags); + } + else + { + return Node(T_GROUP, Node(T_SEQUENCE, sequence), flags); + } + } + + // parse alternation + else if ( '|' === ch ) + { + if ( wordlen ) + { + sequence.push( Node(T_STRING, word) ); + word = ''; + wordlen = 0; + } + alternation.push( Node(T_SEQUENCE, sequence) ); + sequence = []; + } + + // parse character group + else if ( '[' === ch ) + { + if ( wordlen ) + { + sequence.push( Node(T_STRING, word) ); + word = ''; + wordlen = 0; + } + sequence.push( chargroup( re_obj ) ); + } + + // parse sub-group + else if ( '(' === ch ) + { + if ( wordlen ) + { + sequence.push( Node(T_STRING, word) ); + word = ''; + wordlen = 0; + } + re_obj.inGroup += 1; + sequence.push( analyze_re( re_obj ) ); + re_obj.inGroup -= 1; + } + + // parse num repeats + else if ( '{' === ch ) + { + if ( wordlen ) + { + sequence.push( Node(T_STRING, word) ); + word = ''; + wordlen = 0; + } + m = match_repeats( re_obj.re.substr( re_obj.pos-1 ) ); + re_obj.pos += m[0].length-1; + flag = { val: m[0], "MatchMinimum": m[1], "MatchMaximum": m[2] || "unlimited", "min": parseInt(m[1],10), "max": m[2] ? parseInt(m[2],10) : -1 }; + flag[ specialChars[ch] ] = 1; + if ( (re_obj.pos < lre) && ('?' === re_obj.re[CHAR](re_obj.pos)) ) + { + flag[ "isGreedy" ] = 0; + re_obj.pos++; + } + else + { + flag[ "isGreedy" ] = 1; + } + var prev = sequence.pop(); + if ( (T_STRING === prev.type) && (prev.val.length > 1) ) + { + sequence.push( Node(T_STRING, prev.val.slice(0, -1)) ); + prev.val = prev.val.slice(-1); + } + sequence.push( Node(T_QUANTIFIER, prev, flag) ); + } + + // quantifiers + else if ( ('*' === ch) || ('+' === ch) || ('?' === ch) ) + { + if ( wordlen ) + { + sequence.push( Node(T_STRING, word) ); + word = ''; + wordlen = 0; + } + flag = {}; + flag[ specialChars[ch] ] = 1; + flag["min"] = '+' === ch ? 1 : 0; + flag["max"] = '?' === ch ? 1 : -1; + if ( (re_obj.pos < lre) && ('?' === re_obj.re[CHAR](re_obj.pos)) ) + { + flag[ "isGreedy" ] = 0; + re_obj.pos++; + } + else + { + flag[ "isGreedy" ] = 1; + } + var prev = sequence.pop(); + if ( (T_STRING === prev.type) && (prev.val.length > 1) ) + { + sequence.push( Node(T_STRING, prev.val.slice(0, -1)) ); + prev.val = prev.val.slice(-1); + } + sequence.push( Node(T_QUANTIFIER, prev, flag) ); + } + + // special characters like ^, $, ., etc.. + else if ( HAS.call(specialChars,ch) ) + { + if ( wordlen ) + { + sequence.push( Node(T_STRING, word) ); + word = ''; + wordlen = 0; + } + flag = {}; + flag[ specialChars[ch] ] = 1; + sequence.push( Node(T_SPECIAL, ch, flag) ); + } + + else + { + word += ch; + wordlen += 1; + } + } + } + + if ( wordlen ) + { + sequence.push( Node(T_STRING, word) ); + word = ''; + wordlen = 0; + } + + if ( alternation.length ) + { + alternation.push( Node(T_SEQUENCE, sequence) ); + sequence = []; + flag = {}; + flags[ specialChars['|'] ] = 1; + return Node(T_ALTERNATION, alternation, flag); + } + return Node(T_SEQUENCE, sequence); + } +; + +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions +// https://docs.python.org/3/library/re.html +// http://php.net/manual/en/reference.pcre.pattern.syntax.php +// A simple regular expression analyzer +function Analyzer( re, delim ) +{ + if ( !(this instanceof Analyzer) ) return new Analyzer(re, delim); + if ( re ) this.input( re, delim ); +} +Analyzer.VERSION = __version__; +Analyzer[PROTO] = { + + constructor: Analyzer, + + ast: null, + re: null, + fl: null, + src: null, + grp: null, + min: null, + max: null, + ch: null, + + dispose: function( ) { + var self = this; + self.ast = null; + self.re = null; + self.fl = null; + self.src = null; + self.grp = null; + self.min = null; + self.max = null; + self.ch = null; + return self; + }, + + reset: function( ) { + var self = this; + self.ast = null; + self.src = null; + self.grp = null; + self.min = null; + self.max = null; + self.ch = null; + return self; + }, + + input: function( re, delim ) { + var self = this; + if ( !arguments.length ) return self.re; + if ( re ) + { + delim = false === delim ? false : (delim || '/'); + var l, ch, fl = {}; + re = re.toString( ); + l = re.length; + + if ( delim ) + { + // parse re flags, if any + while ( 0 < l ) + { + ch = re[CHAR](l-1); + if ( delim === ch ) break; + else { fl[ ch ] = 1; l--; } + } + + if ( 0 < l ) + { + // remove re delimiters + if ( (delim === re[CHAR](0)) && (delim === re[CHAR](l-1)) ) re = re.slice(1, l-1); + else re = re.slice(0, l); + } + else + { + re = ''; + } + } + + // re is different, reset the ast, etc + if ( self.re !== re ) self.reset(); + self.re = re; self.fl = fl; + } + return self; + }, + + analyze: function( ) { + var self = this; + if ( (null != self.re) && (null === self.ast) ) + { + var re = new RE_OBJ(self.re); + self.ast = analyze_re( re ); + re.dispose(); + } + return self; + }, + + synthesize: function( escaped ) { + var self = this, state, re; + if ( null == self.re ) return self; + if ( null === self.ast ) + { + self.analyze( ); + self.src = null; + self.grp = null; + } + if ( null === self.src ) + { + state = { + MAP : T_SEQUENCE|T_ALTERNATION|T_GROUP|T_CHARGROUP|T_QUANTIFIER, + REDUCE : T_UNICODECHAR|T_HEXCHAR|T_SPECIAL|T_CHARS|T_CHARRANGE|T_STRING, + IGNORE : T_COMMENT, + map : map_src, + reduce : reduce_src, + escaped : false !== escaped, + group : {} + }; + re = walk({src:'',group:{}}, self.ast, state); + self.src = re.src; self.grp = re.group; + } + return self; + }, + + source: function( ) { + var self = this; + if ( null == self.re ) return null; + if ( null === self.src ) self.synthesize(); + return self.src; + }, + + groups: function( raw ) { + var self = this; + if ( null == self.re ) return null; + if ( null === self.grp ) self.synthesize(); + return true===raw ? sel.grp : clone(self.grp); + }, + + compile: function( flags ) { + var self = this; + if ( null == self.re ) return null; + flags = flags || self.fl || {}; + return new RegExp(self.source(), (flags.g||flags.G?'g':'')+(flags.i||flags.I?'i':'')+(flags.m||flags.M?'m':'')+(flags.y||flags.Y?'y':'')); + }, + + tree: function( flat ) { + var self = this; + if ( null == self.re ) return null; + if ( null === self.ast ) self.analyze( ); + return true===flat ? self.ast.toObject() : self.ast; + }, + + // experimental feature + sample: function( maxlen, numsamples ) { + var self = this, state; + if ( null == self.re ) return null; + if ( null === self.ast ) self.analyze( ); + state = { + MAP : T_SEQUENCE|T_ALTERNATION|T_GROUP|T_CHARGROUP|T_QUANTIFIER, + REDUCE : T_UNICODECHAR|T_HEXCHAR|T_SPECIAL|T_CHARS|T_CHARRANGE|T_STRING, + IGNORE : T_COMMENT, + map : map_any, + reduce : reduce_str, + maxLength : (maxlen|0) || 1, + isCaseInsensitive : null != self.fl.i, + group : {} + }; + numsamples = (numsamples|0) || 1; + if ( 1 < numsamples ) + { + var samples = new Array(numsamples); + for(var i=0; i