diff --git a/src/1p-filters.html b/src/1p-filters.html
index e17733d53..ecd186277 100644
--- a/src/1p-filters.html
+++ b/src/1p-filters.html
@@ -49,6 +49,7 @@
+
diff --git a/src/about.html b/src/about.html
index 71a321ec7..02d03547e 100644
--- a/src/about.html
+++ b/src/about.html
@@ -39,6 +39,7 @@
+
diff --git a/src/asset-viewer.html b/src/asset-viewer.html
index dfcf92c32..d94fc0bdc 100644
--- a/src/asset-viewer.html
+++ b/src/asset-viewer.html
@@ -33,6 +33,7 @@
+
diff --git a/src/background.html b/src/background.html
index 108ebd89c..7d183dfa4 100644
--- a/src/background.html
+++ b/src/background.html
@@ -9,6 +9,7 @@
+
diff --git a/src/js/codemirror/ubo-static-filtering.js b/src/js/codemirror/ubo-static-filtering.js
index f4f07f5ce..0d38c14ed 100644
--- a/src/js/codemirror/ubo-static-filtering.js
+++ b/src/js/codemirror/ubo-static-filtering.js
@@ -274,7 +274,9 @@ CodeMirror.defineMode('ubo-static-filtering', function() {
if ( parser.patternIsRegex() ) {
stream.pos = parser.slices[parser.optionsAnchorSpan.i+1];
parserSlot = parser.optionsAnchorSpan.i;
- return 'variable notice';
+ return parser.patternIsTokenizable()
+ ? 'variable notice'
+ : 'variable warning';
}
if ( (parser.slices[parserSlot] & (parser.BITAsterisk | parser.BITCaret)) !== 0 ) {
stream.pos += parser.slices[parserSlot+2];
diff --git a/src/js/static-filtering-parser.js b/src/js/static-filtering-parser.js
index 3d4a4cb45..d3c81d46e 100644
--- a/src/js/static-filtering-parser.js
+++ b/src/js/static-filtering-parser.js
@@ -1003,6 +1003,18 @@ const Parser = class {
return (this.flavorBits & BITFlavorNetRegex) !== 0;
}
+ patternIsTokenizable() {
+     // Patterns which are not regex-based are reported as tokenizable.
+     // TODO: not necessarily true, this needs more work.
+     // `patternIsRegex` is a method and must be called: comparing the
+     // method itself against `false` is never true, which made this early
+     // return unreachable.
+     if ( this.patternIsRegex() === false ) { return true; }
+     const s = Parser.tokenizableStrFromRegex(this.getNetPattern());
+ try {
+ return /(? {
+
+ // Class of the first character of `s`: 1 when it is a token character
+ // (`%`, digit, ASCII letter) or the \x01 placeholder, 0 otherwise,
+ // including when `s` is empty.
+ const firstCharCodeClass = s => {
+     if ( /^[\x01%0-9A-Za-z]/.test(s) ) { return 1; }
+     return 0;
+ };
+
+ // Class of the last character of `s`: 1 when it is a token character
+ // (`%`, digit, ASCII letter) or the \x01 placeholder, 0 otherwise,
+ // including when `s` is empty.
+ const lastCharCodeClass = s => {
+     if ( /[\x01%0-9A-Za-z]$/.test(s) ) { return 1; }
+     return 0;
+ };
+
+ // Convert a node of the regex syntax tree produced by Regex.Analyzer into
+ // a "tokenizable" string. Literal strings are kept verbatim, anything else
+ // is replaced with placeholder characters:
+ //   \x00  something which cannot be part of a token
+ //   \x01  something which may be part of a token
+ // A literal run bordered by \x01 is therefore not a reliable token.
+ const toTokenizableString = node => {
+     switch ( node.type ) {
+     case 1: /* T_SEQUENCE, 'Sequence' */ {
+         let s = '';
+         for ( let i = 0; i < node.val.length; i++ ) {
+             s += toTokenizableString(node.val[i]);
+         }
+         return s;
+     }
+     case 2: /* T_ALTERNATION,'Alternation' */
+     case 8: /* T_CHARGROUP, 'CharacterGroup' */ {
+         // A negated character group, e.g. `[^/]`, matches the characters
+         // which are NOT listed: the listed characters say nothing about
+         // what can be matched, so assume token characters can be matched.
+         if ( node.flags.NegativeMatch === 1 ) { return '\x01'; }
+         let firstChar = 0;
+         let lastChar = 0;
+         for ( let i = 0; i < node.val.length; i++ ) {
+             const s = toTokenizableString(node.val[i]);
+             if ( firstChar === 0 && firstCharCodeClass(s) === 1 ) {
+                 firstChar = 1;
+             }
+             if ( lastChar === 0 && lastCharCodeClass(s) === 1 ) {
+                 lastChar = 1;
+             }
+             if ( firstChar === 1 && lastChar === 1 ) { break; }
+         }
+         return String.fromCharCode(firstChar, lastChar);
+     }
+     case 4: /* T_GROUP, 'Group' */ {
+         if ( node.flags.NegativeLookAhead === 1 ) { return '\x01'; }
+         if ( node.flags.NegativeLookBehind === 1 ) { return '\x01'; }
+         return toTokenizableString(node.val);
+     }
+     case 16: /* T_QUANTIFIER, 'Quantifier' */ {
+         // An optional item may be absent from the matched text, in which
+         // case what precedes and what follows it become adjacent: e.g.
+         // `foo/?bar` matches both `foo/bar` and `foobar`. None of `foo`,
+         // `bar` or `foobar` is a reliable token, so always emit a
+         // placeholder which marks both neighbors as unreliable.
+         if ( node.flags.min === 0 ) { return '\x01'; }
+         const s = toTokenizableString(node.val);
+         return String.fromCharCode(
+             firstCharCodeClass(s),
+             lastCharCodeClass(s)
+         );
+     }
+     case 64: /* T_HEXCHAR, 'HexChar' */ {
+         // node.val is `x` followed by the hexadecimal digits
+         return String.fromCharCode(parseInt(node.val.slice(1), 16));
+     }
+     case 128: /* T_SPECIAL, 'Special' */ {
+         const flags = node.flags;
+         // Anchors and word boundaries do not consume token characters.
+         if ( flags.MatchEnd === 1 ) { return '\x00'; }
+         if ( flags.MatchStart === 1 ) { return '\x00'; }
+         if ( flags.MatchWordBoundary === 1 ) { return '\x00'; }
+         return '\x01';
+     }
+     case 256: /* T_CHARS, 'Characters' */ {
+         for ( let i = 0; i < node.val.length; i++ ) {
+             if ( firstCharCodeClass(node.val[i]) === 1 ) {
+                 return '\x01';
+             }
+         }
+         return '\x00';
+     }
+     // Ranges are assumed to always involve token-related characters.
+     case 512: /* T_CHARRANGE, 'CharacterRange' */ {
+         return '\x01';
+     }
+     case 1024: /* T_STRING, 'String' */ {
+         return node.val;
+     }
+     case 2048: /* T_COMMENT, 'Comment' */ {
+         return '';
+     }
+     default:
+         break;
+     }
+     // Unknown node types are conservatively assumed to match token chars.
+     return '\x01';
+ };
+
+ // Public entry point: return the tokenizable string for the regex source
+ // `reStr`, or an empty string when the Regex analyzer library is not
+ // available or when analyzing the regex throws.
+ return function(reStr) {
+     const haveAnalyzer =
+         self.Regex instanceof Object &&
+         self.Regex.Analyzer instanceof Object;
+     if ( haveAnalyzer === false ) { return ''; }
+     let tokenizable = '';
+     try {
+         tokenizable = toTokenizableString(
+             self.Regex.Analyzer(reStr, false).tree()
+         );
+     } catch(ex) {
+     }
+     return tokenizable;
+ };
+})();
+
+/******************************************************************************/
+
if ( typeof vAPI === 'object' && vAPI !== null ) {
vAPI.StaticFilteringParser = Parser;
} else {
diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js
index 335f52f15..1edc3619c 100644
--- a/src/js/static-net-filtering.js
+++ b/src/js/static-net-filtering.js
@@ -2622,15 +2622,9 @@ const FilterParser = class {
if ( other !== undefined ) {
return Object.assign(this, other);
}
- this.cantWebsocket = vAPI.cantWebsocket;
this.noTokenHash = urlTokenizer.noTokenHash;
- this.reIsolateHostname = /^(\*?\.)?([^\x00-\x24\x26-\x2C\x2F\x3A-\x5E\x60\x7B-\x7F]+)(.*)/;
this.reBadCSP = /(?:=|;)\s*report-(?:to|uri)\b/;
this.reToken = /[%0-9A-Za-z]+/g;
- this.reRegexTokenAbort = /[\(\)\[\]]/;
- this.reRegexBadPrefix = /(^|[^\\]\.|\\[%SDWsdw]|[^\\][()*+?[\\\]{}])$/;
- this.reRegexBadSuffix = /^([^\\]\.|\\[%SDWsdw]|[()*+?[\]{}]|$)/;
- this.reGoodToken = /[%0-9a-z]{1,}/g;
this.domainOptList = [];
this.tokenIdToNormalizedType = new Map([
[ parser.OPTTokenCname, bitFromType('cname') ],
@@ -3175,32 +3169,22 @@ const FilterParser = class {
// not `bads`.
extractTokenFromRegex() {
this.reToken.lastIndex = 0;
- const pattern = this.pattern;
+ const pattern =
+ vAPI.StaticFilteringParser.tokenizableStrFromRegex(this.pattern);
let bestToken;
let bestBadness = 0x7FFFFFFF;
for (;;) {
const matches = this.reToken.exec(pattern);
if ( matches === null ) { break; }
- let token = matches[0];
- let prefix = pattern.slice(0, matches.index);
- let suffix = pattern.slice(this.reToken.lastIndex);
- if (
- this.reRegexTokenAbort.test(prefix) &&
- this.reRegexTokenAbort.test(suffix)
- ) {
+ const { 0: token, index } = matches;
+ if ( index === 0 || pattern.charAt(index - 1) === '\x01' ) {
continue;
}
- if ( token.charCodeAt(0) === 0x62 /* 'b' */ ) {
- const match = /\\+$/.exec(prefix);
- if ( match !== null && (match[0].length & 1) !== 0 ) {
- prefix += 'b';
- token = token.slice(1);
- }
- }
+ const { lastIndex } = this.reToken;
if (
- this.reRegexBadPrefix.test(prefix) || (
- token.length < this.maxTokenLen &&
- this.reRegexBadSuffix.test(suffix)
+ token.length < this.maxTokenLen && (
+ lastIndex === pattern.length ||
+ pattern.charAt(lastIndex) === '\x01'
)
) {
continue;
diff --git a/src/lib/regexanalyzer/README.md b/src/lib/regexanalyzer/README.md
new file mode 100644
index 000000000..8b9c2706a
--- /dev/null
+++ b/src/lib/regexanalyzer/README.md
@@ -0,0 +1,14 @@
+https://github.com/foo123/RegexAnalyzer/issues/1#issuecomment-750039255
+
+> The (implied) license is as free as it can get. You can modify it and use
+> it anywhere you want if it suits you.
+>
+> An attribution to original author would be appreciated but even this is not
+> mandatory.
+>
+> Copy Left
+
+References:
+
+- https://en.wikipedia.org/wiki/Copyleft
+- http://gplv3.fsf.org/wiki/index.php/Compatible_licenses
diff --git a/src/lib/regexanalyzer/regex.js b/src/lib/regexanalyzer/regex.js
new file mode 100644
index 000000000..788f03ee8
--- /dev/null
+++ b/src/lib/regexanalyzer/regex.js
@@ -0,0 +1,2156 @@
+/**
+*
+* Regex
+* @version: 1.1.0
+*
+* A simple & generic Regular Expression Analyzer & Composer for PHP, Python, Node.js / Browser / XPCOM Javascript
+* https://github.com/foo123/RegexAnalyzer
+*
+**/
+!function( root, name, factory ){
+"use strict";
+if ( ('undefined'!==typeof Components)&&('object'===typeof Components.classes)&&('object'===typeof Components.classesByID)&&Components.utils&&('function'===typeof Components.utils['import']) ) /* XPCOM */
+ (root.$deps = root.$deps||{}) && (root.EXPORTED_SYMBOLS = [name]) && (root[name] = root.$deps[name] = factory.call(root));
+else if ( ('object'===typeof module)&&module.exports ) /* CommonJS */
+ (module.$deps = module.$deps||{}) && (module.exports = module.$deps[name] = factory.call(root));
+else if ( ('undefined'!==typeof System)&&('function'===typeof System.register)&&('function'===typeof System['import']) ) /* ES6 module */
+ System.register(name,[],function($__export){$__export(name, factory.call(root));});
+else if ( ('function'===typeof define)&&define.amd&&('function'===typeof require)&&('function'===typeof require.specified)&&require.specified(name) /*&& !require.defined(name)*/ ) /* AMD */
+ define(name,['module'],function(module){factory.moduleUri = module.uri; return factory.call(root);});
+else if ( !(name in root) ) /* Browser/WebWorker/.. */
+ (root[name] = factory.call(root)||1)&&('function'===typeof(define))&&define.amd&&define(function(){return root[name];} );
+}( /* current root */ 'undefined' !== typeof self ? self : this,
+ /* module name */ "Regex",
+ /* module factory */ function ModuleFactory__Regex( undef ){
+"use strict";
+var __version__ = "1.1.0",
+
+ PROTO = 'prototype', OP = Object[PROTO], AP = Array[PROTO],
+ Keys = Object.keys, to_string = OP.toString, HAS = OP.hasOwnProperty,
+ fromCharCode = String.fromCharCode, CHAR = 'charAt', CHARCODE = 'charCodeAt', toJSON = JSON.stringify,
+ INF = Infinity, ESC = '\\',
+ specialChars = {
+ "." : "MatchAnyChar",
+ "|" : "MatchEither",
+ "?" : "MatchZeroOrOne",
+ "*" : "MatchZeroOrMore",
+ "+" : "MatchOneOrMore",
+ "^" : "MatchStart",
+ "$" : "MatchEnd",
+ "{" : "StartRepeats",
+ "}" : "EndRepeats",
+ "(" : "StartGroup",
+ ")" : "EndGroup",
+ "[" : "StartCharGroup",
+ "]" : "EndCharGroup"
+ },
+ /*
+ http://www.javascriptkit.com/javatutors/redev2.shtml
+
+ \f matches form-feed.
+ \r matches carriage return.
+ \n matches linefeed.
+ \t matches horizontal tab.
+ \v matches vertical tab.
+ \0 matches NUL character.
+ [\b] matches backspace.
+ \s matches whitespace (short for [\f\n\r\t\v\u00A0\u2028\u2029]).
+ \S matches anything but a whitespace (short for [^\f\n\r\t\v\u00A0\u2028\u2029]).
+ \w matches any alphanumerical character (word characters) including underscore (short for [a-zA-Z0-9_]).
+ \W matches any non-word characters (short for [^a-zA-Z0-9_]).
+ \d matches any digit (short for [0-9]).
+ \D matches any non-digit (short for [^0-9]).
+ \b matches a word boundary (the position between a word and a space).
+ \B matches a non-word boundary (any position where \b does not match).
+ \cX matches a control character. E.g: \cm matches control-M.
+ \xhh matches the character with two characters of hexadecimal code hh.
+ \uhhhh matches the Unicode character with four characters of hexadecimal code hhhh.
+ */
+ specialCharsEscaped = {
+ "\\" : "ESC",
+ "/" : "/",
+ "0" : "NULChar",
+ "f" : "FormFeed",
+ "n" : "LineFeed",
+ "r" : "CarriageReturn",
+ "t" : "HorizontalTab",
+ "v" : "VerticalTab",
+ "b" : "MatchWordBoundary",
+ "B" : "MatchNonWordBoundary",
+ "s" : "MatchSpaceChar",
+ "S" : "MatchNonSpaceChar",
+ "w" : "MatchWordChar",
+ "W" : "MatchNonWordChar",
+ "d" : "MatchDigitChar",
+ "D" : "MatchNonDigitChar"
+ },
+ T_SEQUENCE = 1,
+ T_ALTERNATION = 2,
+ T_GROUP = 4,
+ T_CHARGROUP = 8,
+ T_QUANTIFIER = 16,
+ T_UNICODECHAR = 32,
+ T_HEXCHAR = 64,
+ T_SPECIAL = 128,
+ T_CHARS = 256,
+ T_CHARRANGE = 512,
+ T_STRING = 1024,
+ T_COMMENT = 2048
+;
+
+// true when x is an Array instance, or is reported as an array by
+// Object.prototype.toString
+function is_array( x )
+{
+    if ( x instanceof Array ) return true;
+    return to_string.call(x) === '[object Array]';
+}
+// true when x is a String instance, or is reported as a string by
+// Object.prototype.toString (which covers primitive strings)
+function is_string( x )
+{
+    if ( x instanceof String ) return true;
+    return to_string.call(x) === '[object String]';
+}
+// true when x is a RegExp instance, or is reported as a regular
+// expression by Object.prototype.toString
+function is_regexp( x )
+{
+    if ( x instanceof RegExp ) return true;
+    return to_string.call(x) === '[object RegExp]';
+}
+// return x unchanged when it is an array, else wrap it into a new
+// one-element array
+function array( x )
+{
+    if ( is_array(x) ) return x;
+    return [x];
+}
+// shallow-copy the own enumerable properties of obj into `cloned`
+// (a new plain object when none is given) and return the target
+function clone( obj, cloned )
+{
+    var target = cloned || {}, key;
+    for (key in obj)
+    {
+        if ( !HAS.call(obj,key) ) continue;
+        target[key] = obj[key];
+    }
+    return target;
+}
+// Parsing state for one regex source string `re`: the source and its
+// length, the read position, and the group bookkeeping, all starting
+// at zero / empty.
+function RE_OBJ( re )
+{
+    var state = this;
+    state.re = re;
+    state.len = re.length;
+    state.pos = 0;
+    state.index = 0;
+    state.groupIndex = 0;
+    state.group = {};
+    state.inGroup = 0;
+}
+// Prototype defaults for RE_OBJ instances, plus dispose().
+// NOTE(review): the constructor and dispose() use `group` whereas the
+// default declared here is `groups` -- presumably a leftover, confirm.
+RE_OBJ[PROTO] = {
+    constructor: RE_OBJ,
+    re: null,
+    len: null,
+    pos: null,
+    index: null,
+    groupIndex: null,
+    inGroup: null,
+    groups: null,
+    // reset every field set by the constructor to null
+    dispose: function( ) {
+        var state = this;
+        state.re = null;
+        state.len = null;
+        state.pos = null;
+        state.index = null;
+        state.groupIndex = null;
+        state.group = null;
+        state.inGroup = null;
+    }
+};
+// Syntax tree node. `type` is expected to be one of the T_* constants,
+// `value` is the node payload and `flags` an optional object (an empty
+// object is used when omitted). Can be called with or without `new`.
+function Node( type, value, flags )
+{
+    if ( !(this instanceof Node) ) return new Node(type, value, flags);
+    var node = this, names = {};
+    names[T_SEQUENCE] = "Sequence";
+    names[T_ALTERNATION] = "Alternation";
+    names[T_GROUP] = "Group";
+    names[T_CHARGROUP] = "CharacterGroup";
+    names[T_CHARS] = "Characters";
+    names[T_CHARRANGE] = "CharacterRange";
+    names[T_STRING] = "String";
+    names[T_QUANTIFIER] = "Quantifier";
+    names[T_UNICODECHAR] = "UnicodeChar";
+    names[T_HEXCHAR] = "HexChar";
+    names[T_SPECIAL] = "Special";
+    names[T_COMMENT] = "Comment";
+    node.type = type;
+    node.val = value;
+    node.flags = flags || {};
+    // only genuine numbers may match a known type, anything else is
+    // "unspecified"
+    node.typeName = (('number' === typeof type) && names[type]) || "unspecified";
+};
+// Recursively convert a Node (or an array of values) into plain objects of
+// the form { type, value[, flags] }; `flags` is only present when the node
+// has at least one flag. Any other value is returned unchanged.
+Node.toObjectStatic = function toObject( v ) {
+    var plain;
+    if ( !(v instanceof Node) )
+    {
+        return is_array(v) ? v.map(toObject) : v;
+    }
+    plain = {
+        type: v.typeName,
+        value: toObject(v.val)
+    };
+    if ( v.flags && Object.keys(v.flags).length ) plain.flags = v.flags;
+    return plain;
+};
+// Prototype defaults for Node instances, plus dispose() and toObject().
+Node[PROTO] = {
+    constructor: Node,
+    type: null,
+    typeName: null,
+    val: null,
+    flags: null,
+    // reset all fields of the node to null; returns the node
+    dispose: function( ) {
+        var node = this;
+        node.val = null;
+        node.flags = null;
+        node.type = null;
+        node.typeName = null;
+        return node;
+    },
+    // plain-object representation, see Node.toObjectStatic
+    toObject: function( ) {
+        return Node.toObjectStatic(this);
+    }
+};
+
+// rounded random number picked in the range spanned by a and b
+var rnd = function( a, b ){
+        var spread = b-a;
+        return Math.round(spread*Math.random()+a);
+    },
+ // build a RegExp from source `re` and flags `fl` (no flags when omitted)
+ RE = function( re, fl ){
+     var reFlags = fl||'';
+     return new RegExp(re, reFlags);
+ },
+ // Array.prototype.slice applied to the array-like `a`; the remaining
+ // arguments are forwarded as the slice bounds
+ slice = function( a ) {
+     var bounds = AP.slice.call(arguments, 1);
+     return AP.slice.apply(a, bounds);
+ },
+ // concatenate all the items of `a` into a new array (array items are
+ // spread one level deep by concat)
+ flatten = function( a ) {
+     var flat = [], idx;
+     for (idx = 0; idx < a.length; idx++)
+     {
+         flat = flat.concat(a[idx]);
+     }
+     return flat;
+ },
+ // turn the array-like `args` into a real array, flattened one level deep
+ // NOTE(review): `asArray` is not used by the current implementation
+ getArgs = function( args, asArray ) {
+     var list = slice( args );
+     return flatten( list );
+ },
+ // Prefix with `esc` each character of `s` which has a special meaning in
+ // a regex. Inside a character group (`chargroup` truthy) `-` is escaped
+ // while `?`, `*`, `+` and `.` are not; outside, it is the other way around.
+ // The `esc` character itself is always escaped.
+ esc_re = function( s, esc, chargroup ) {
+     var always = '^$|{}()[]/',
+         contextual = chargroup ? '-' : '?*+.',
+         out = '', len = s.length, idx, c, special;
+     for (idx = 0; idx < len; idx++)
+     {
+         c = s[CHAR](idx);
+         special = (0 <= always.indexOf(c)) || (0 <= contextual.indexOf(c)) || (esc === c);
+         out += (special ? esc : '') + c;
+     }
+     return out;
+ },
+ // left-pad the string form of `s` with `z` (default '0') until it is at
+ // least `n` characters long
+ pad = function( s, n, z ) {
+     var fill = z || '0', padded = String(s);
+     for (; padded.length < n; )
+     {
+         padded = fill + padded;
+     }
+     return padded;
+ },
+ // char code of the first character of `c`
+ char_code = function( c ) {
+     return c[CHARCODE](0);
+ },
+ // [char code of the first character, char code of the last character]
+ char_code_range = function( s ) {
+     var lo = s[CHARCODE](0), hi = s[CHARCODE](s.length-1);
+     return [lo, hi];
+ },
+ //char_codes = function( s_or_a ) { return (s_or_a.substr ? s_or_a.split("") : s_or_a).map( char_code ); },
+ // http://stackoverflow.com/questions/12376870/create-an-array-of-characters-from-specified-range
+ // List every character from `first` to `last` inclusive (compared by the
+ // char code of their first character). The bounds may also be given as a
+ // single [first, last] array. An empty list is returned when `last` comes
+ // before `first`.
+ character_range = function(first, last) {
+     var lo = first, hi = last, code, stop, list = [];
+     if ( lo && is_array(lo) ) { hi = lo[1]; lo = lo[0]; }
+     code = lo[CHARCODE](0);
+     stop = hi[CHARCODE](0);
+     while ( code <= stop )
+     {
+         list.push( fromCharCode( code ) );
+         ++code;
+     }
+     return list;
+ },
+ concat = function(p1, p2) {
+ if ( p2 )
+ {
+ var p, l;
+ if ( is_array(p2) )
+ {
+ for (p=0,l=p2.length; p= minlen ? l : false;
+ },
+ // Count how many consecutive characters of `s`, starting at `pos`
+ // (default 0), have a char code within RANGE ([low, high], inclusive).
+ // At most `maxlen` (default unlimited) characters are consumed. Returns
+ // the count, or false when fewer than `minlen` (default 1) matched.
+ // NOTE(review): match_char_ranges uses the `l <= maxlen` comparison which
+ // was fixed here.
+ match_char_range = function( RANGE, s, pos, minlen, maxlen ) {
+     pos = pos || 0;
+     minlen = minlen || 1;
+     maxlen = maxlen || INF;
+     var lp = pos, l = 0, sl = s.length, ch;
+     // `l < maxlen`, not `l <= maxlen`: the latter lets the loop consume
+     // one character more than the requested maximum
+     while ( (lp < sl) && (l < maxlen) && ((ch=s[CHARCODE](lp)) >= RANGE[0] && ch <= RANGE[1]) )
+     {
+         lp++; l++;
+     }
+     return l >= minlen ? l : false;
+ },
+ match_char_ranges = function( RANGES, s, pos, minlen, maxlen ) {
+ pos = pos || 0;
+ minlen = minlen || 1;
+ maxlen = maxlen || INF;
+ var lp = pos, l = 0, sl = s.length, ch,
+ i, Rl = RANGES.length, RANGE, found = true;
+ while ( (lp < sl) && (l <= maxlen) && found )
+ {
+ ch = s[CHARCODE](lp); found = false;
+ for (i=0; i= RANGE[0] && ch <= RANGE[1] )
+ {
+ lp++; l++; found = true;
+ break;
+ }
+ }
+ }
+ return l >= minlen ? l : false;
+ },
+
+ // random character picked from PUNCTS
+ punct = function( ){
+     var pick = rnd(0, PUNCTS.length-1);
+     return PUNCTS[CHAR](pick);
+ },
+ // random character picked from SPACES; with `positive === false`, one of
+ // a random punct, digit or alpha character instead
+ space = function( positive ){
+     if ( false === positive )
+     {
+         return (punct()+digit()+alpha())[CHAR](rnd(0,2));
+     }
+     return SPACES[CHAR](rnd(0, SPACES.length-1));
+ },
+ // random character picked from DIGITS; with `positive === false`, one of
+ // a random punct, space or alpha character instead
+ digit = function( positive ){
+     if ( false === positive )
+     {
+         return (punct()+space()+alpha())[CHAR](rnd(0,2));
+     }
+     return DIGITS[CHAR](rnd(0, DIGITS.length-1));
+ },
+ // random character picked from ALPHAS; with `positive === false`, one of
+ // a random punct, space or digit character instead
+ alpha = function( positive ){
+     if ( false === positive )
+     {
+         return (punct()+space()+digit())[CHAR](rnd(0,2));
+     }
+     return ALPHAS[CHAR](rnd(0, ALPHAS.length-1));
+ },
+ // random character picked from ALPHAS+DIGITS; with `positive === false`,
+ // either a random punct or a random space character instead
+ word = function( positive ){
+     var pool;
+     if ( false === positive )
+     {
+         return (punct()+space())[CHAR](rnd(0,1));
+     }
+     pool = ALPHAS+DIGITS;
+     return pool[CHAR](rnd(0, ALPHAS.length+DIGITS.length-1));
+ },
+ // random character picked from ALL
+ any = function( ){
+     var pick = rnd(0, ALL.length-1);
+     return ALL[CHAR](pick);
+ },
+ // Random item picked from `chars`; with `positive === false`, random item
+ // picked among the ALL_ARY entries which are not in `chars`. Returns an
+ // empty string when there is nothing to pick from.
+ character = function( chars, positive ){
+     var pool = chars;
+     if ( false === positive )
+     {
+         pool = ALL_ARY.filter(function(c){ return 0 > chars.indexOf(c); });
+     }
+     return pool.length ? pool[rnd(0, pool.length-1)] : '';
+ },
+ // randomly return either the lower-case or the upper-case form of `c`
+ random_upper_or_lower = function( c ) {
+     if ( 0.5 < Math.random() ) return c.toLowerCase( );
+     return c.toUpperCase( );
+ },
+ // Randomize the letter case of `chars`. With `asArray` truthy, `chars` is
+ // split into characters when it is a string and each item is randomized
+ // on its own (an array is returned); otherwise `chars` is randomized as a
+ // whole.
+ case_insensitive = function( chars, asArray ) {
+     var list;
+     if ( !asArray ) return random_upper_or_lower( chars );
+     list = chars[CHAR] ? chars.split('') : chars;
+     return list.map( random_upper_or_lower );
+ },
+
+ walk = function walk( ret, node, state ) {
+ if ( (null == node) || !state ) return ret;
+
+ var i, l, r, type = node instanceof Node ? node.type : null;
+
+ // walk the tree
+ if ( null === type )
+ {
+ // custom, let reduce handle it
+ ret = state.reduce( ret, node, state );
+ }
+
+ else if ( state.IGNORE & type )
+ {
+ /* nothing */
+ }
+
+ else if ( state.MAP & type )
+ {
+ r = state.map( ret, node, state );
+ if ( null != state.ret )
+ {
+ ret = state.reduce( ret, node, state );
+ state.ret = null;
+ }
+ else if ( null != r )
+ {
+ r = array(r);
+ for(i=0,l=r?r.length:0; i= state.maxLength )
+ {
+ numrepeats = node.flags.min;
+ }
+ else
+ {
+ mmin = node.flags.min;
+ mmax = -1 === node.flags.max ? (mmin+1+2*state.maxLength) : node.flags.max;
+ numrepeats = rnd(mmin, mmax);
+ }
+ if ( numrepeats )
+ {
+ repeats = new Array(numrepeats);
+ for(var i=0; i max )
+ {
+ max = cur;
+ }
+ }
+ }
+ if ( l ) state.ret = max;
+ return null;
+ }
+ else if ( T_CHARGROUP === type )
+ {
+ return node.val.length ? node.val[0] : null;
+ }
+ else if ( T_QUANTIFIER === type )
+ {
+ max = walk(0, node.val, state);
+ if ( -1 === max )
+ {
+ state.ret = -1;
+ }
+ else if ( 0 < max )
+ {
+ if ( -1 === node.flags.max )
+ {
+ state.ret = -1;
+ }
+ else if ( 0 < node.flags.max )
+ {
+ state.ret = node.flags.max*max;
+ }
+ else
+ {
+ state.ret = max;
+ }
+ }
+ return null;
+ }
+ else if ( (T_GROUP === type) && node.flags.GroupIndex )
+ {
+ var max = walk(0, node.val, state);
+ state.group[node.flags.GroupIndex] = max;
+ state.ret = max;
+ return null;
+ }
+ else
+ {
+ return node.val;
+ }
+ },
+ map_1st = function map_1st( ret, node, state ) {
+ var type = node.type;
+ if ( T_SEQUENCE === type )
+ {
+ var seq=[], i=0, l=node.val.length, n;
+ for(i=0; i 2) && ('x' === s[CHAR](0)) )
+ {
+ if ( match_char_ranges(HEXDIGITS_RANGES, s, 1, 2, 2) ) return [m=s.slice(0,3), m.slice(1)];
+ }
+ return false;
+ },
+ // Match `u` followed by four hexadecimal digits at the start of `s`.
+ // Returns [matched text, hexadecimal digits], or false when `s` does not
+ // start with such a sequence.
+ match_unicode = function( s ) {
+     var matched;
+     if ( (s.length <= 4) || ('u' !== s[CHAR](0)) ) return false;
+     if ( !match_char_ranges(HEXDIGITS_RANGES, s, 1, 4, 4) ) return false;
+     matched = s.slice(0,5);
+     return [matched, matched.slice(1)];
+ },
+ // Match a repeat specifier (`{min}`, `{min,}` or `{min,max}`, spaces
+ // allowed around the numbers) at the start of `s`.
+ // Returns [matched text, min, max] with min/max as digit strings; max is
+ // null when a comma is present without an upper bound, and equals min
+ // when there is no comma. Returns false when `s` does not start with a
+ // repeat specifier.
+ match_repeats = function( s ) {
+     var sl = s.length, pos = 0, n, min, max = null, hasComma = false;
+     if ( (sl <= 2) || ('{' !== s[CHAR](pos)) ) return false;
+     pos++;
+     n = match_chars(SPACES, s, pos);
+     if ( n ) pos += n;
+     // the lower bound is mandatory
+     n = match_char_range(DIGITS_RANGE, s, pos);
+     if ( !n ) return false;
+     min = s.slice(pos, pos+n);
+     pos += n;
+     n = match_chars(SPACES, s, pos);
+     if ( n ) pos += n;
+     if ( (pos < sl) && (',' === s[CHAR](pos)) )
+     {
+         pos += 1;
+         hasComma = true;
+     }
+     n = match_chars(SPACES, s, pos);
+     if ( n ) pos += n;
+     // the upper bound is optional
+     n = match_char_range(DIGITS_RANGE, s, pos);
+     if ( n )
+     {
+         max = s.slice(pos, pos+n);
+         pos += n;
+     }
+     n = match_chars(SPACES, s, pos);
+     if ( n ) pos += n;
+     if ( (pos >= sl) || ('}' !== s[CHAR](pos)) ) return false;
+     pos++;
+     return [s.slice(0, pos), min, hasComma ? max : min];
+ },
+ // Parse the content of a character group (`[...]`) starting at
+ // re_obj.pos, i.e. right after the opening bracket, and return a
+ // T_CHARGROUP node. A leading `^` sets the NegativeMatch flag. Parsing
+ // stops after the closing `]`, or at the end of the input when the group
+ // is not terminated. re_obj.pos is advanced past everything consumed.
+ chargroup = function chargroup( re_obj ) {
+     var sequence = [], chars = [], allchars = [], flags = {}, flag, ch, lre,
+         prevch, range, isRange = false, m, isCharNode, escaped = false,
+         // move the pending literal characters into `allchars`
+         flush_chars = function( ) {
+             if ( chars.length )
+             {
+                 allchars = allchars.concat( chars );
+                 chars = [];
+             }
+         };
+
+     if ( '^' === re_obj.re[CHAR]( re_obj.pos ) )
+     {
+         flags[ "NegativeMatch" ] = 1;
+         re_obj.pos++;
+     }
+
+     lre = re_obj.len;
+     while ( re_obj.pos < lre )
+     {
+         isCharNode = false;
+         m = null;
+         prevch = ch;
+         ch = re_obj.re[CHAR]( re_obj.pos++ );
+
+         escaped = ESC === ch;
+         if ( escaped ) ch = re_obj.re[CHAR]( re_obj.pos++ );
+
+         if ( escaped )
+         {
+             // unicode character
+             if ( 'u' === ch )
+             {
+                 m = match_unicode( re_obj.re.substr( re_obj.pos-1 ) );
+                 // `\u` not followed by 4 hex digits yields no match: keep
+                 // `u` as a plain escaped character instead of throwing
+                 // on `m[0]`
+                 if ( m )
+                 {
+                     re_obj.pos += m[0].length-1;
+                     ch = Node(T_UNICODECHAR, m[0], {"Char": fromCharCode(parseInt(m[1], 16)), "Code": m[1]});
+                     isCharNode = true;
+                 }
+             }
+
+             // hex character
+             else if ( 'x' === ch )
+             {
+                 m = match_hex( re_obj.re.substr( re_obj.pos-1 ) );
+                 // same as above for `\x` not followed by 2 hex digits
+                 if ( m )
+                 {
+                     re_obj.pos += m[0].length-1;
+                     ch = Node(T_HEXCHAR, m[0], {"Char": fromCharCode(parseInt(m[1], 16)), "Code": m[1]});
+                     isCharNode = true;
+                 }
+             }
+         }
+
+         if ( isRange )
+         {
+             // `ch` is the upper bound of a range opened by a previous `-`
+             flush_chars();
+             range[1] = ch;
+             isRange = false;
+             sequence.push( Node(T_CHARRANGE, range) );
+         }
+         else if ( escaped )
+         {
+             if ( isCharNode )
+             {
+                 flush_chars();
+                 sequence.push( ch );
+             }
+             else if ( HAS.call(specialCharsEscaped,ch) && ('/' !== ch) )
+             {
+                 flush_chars();
+                 flag = {};
+                 flag[ specialCharsEscaped[ch] ] = 1;
+                 sequence.push( Node(T_SPECIAL, ch, flag) );
+             }
+             else
+             {
+                 chars.push( ch );
+             }
+         }
+         // end of char group
+         else if ( ']' === ch )
+         {
+             flush_chars();
+             // map all chars into one node
+             if ( allchars.length ) sequence.push( Node(T_CHARS, allchars) );
+             return Node(T_CHARGROUP, sequence, flags);
+         }
+         else if ( '-' === ch )
+         {
+             // the previous item becomes the lower bound of a range
+             range = [prevch, ''];
+             if ( prevch instanceof Node ) sequence.pop(); else chars.pop();
+             isRange = true;
+         }
+         else
+         {
+             chars.push( ch );
+         }
+     }
+     // end of input reached without a closing bracket
+     flush_chars();
+     // map all chars into one node
+     if ( allchars.length ) sequence.push( Node(T_CHARS, allchars) );
+     return Node(T_CHARGROUP, sequence, flags);
+ },
+
+ analyze_re = function analyze_re( re_obj ) {
+ var lre, ch, m, word = '', wordlen = 0,
+ alternation = [], sequence = [], flags = {},
+ flag, escaped = false, pre, pre3, captured;
+
+ if ( re_obj.inGroup > 0 )
+ {
+ pre = re_obj.re.substr(re_obj.pos, 2);
+ pre3 = re_obj.re.substr(re_obj.pos, 3);
+ captured = 1;
+
+ if ( "?P=" === pre3 )
+ {
+ flags[ "BackReference" ] = 1;
+ flags[ "GroupName" ] = '';
+ re_obj.pos += 3;
+ lre = re_obj.len;
+ while ( re_obj.pos < lre )
+ {
+ ch = re_obj.re[CHAR]( re_obj.pos++ );
+ if ( ")" === ch ) break;
+ flags[ "GroupName" ] += ch;
+ }
+ flags[ "GroupIndex" ] = HAS.call(re_obj.group,flags[ "GroupName" ]) ? re_obj.group[flags[ "GroupName" ]] : null;
+ return Node(T_SPECIAL, String(flags[ "GroupIndex" ]), flags);
+ }
+
+ else if ( "?#" === pre )
+ {
+ flags[ "Comment" ] = 1;
+ re_obj.pos += 2;
+ word = '';
+ lre = re_obj.len;
+ while ( re_obj.pos < lre )
+ {
+ ch = re_obj.re[CHAR]( re_obj.pos++ );
+ if ( ")" === ch ) break;
+ word += ch;
+ }
+ return Node(T_COMMENT, word);
+ }
+
+ else if ( "?:" === pre )
+ {
+ flags[ "NotCaptured" ] = 1;
+ re_obj.pos += 2;
+ captured = 0;
+ }
+
+ else if ( "?=" === pre )
+ {
+ flags[ "LookAhead" ] = 1;
+ re_obj.pos += 2;
+ captured = 0;
+ }
+
+ else if ( "?!" === pre )
+ {
+ flags[ "NegativeLookAhead" ] = 1;
+ re_obj.pos += 2;
+ captured = 0;
+ }
+
+ else if ( "?<=" === pre3 )
+ {
+ flags[ "LookBehind" ] = 1;
+ re_obj.pos += 3;
+ captured = 0;
+ }
+
+ else if ( "?" === ch ) break;
+ flags[ "GroupName" ] += ch;
+ }
+ }
+
+ ++re_obj.index;
+ if ( captured )
+ {
+ ++re_obj.groupIndex;
+ flags[ "GroupIndex" ] = re_obj.groupIndex;
+ re_obj.group[flags[ "GroupIndex" ]] = flags[ "GroupIndex" ];
+ if ( flags[ "GroupName" ] ) re_obj.group[flags[ "GroupName" ]] = flags[ "GroupIndex" ];
+ }
+ }
+
+ lre = re_obj.len;
+ while ( re_obj.pos < lre )
+ {
+ ch = re_obj.re[CHAR]( re_obj.pos++ );
+
+ // \\abc
+ escaped = ESC === ch;
+ if ( escaped ) ch = re_obj.re[CHAR]( re_obj.pos++ );
+
+ if ( escaped )
+ {
+ // unicode character
+ if ( 'u' === ch )
+ {
+ if ( wordlen )
+ {
+ sequence.push( Node(T_STRING, word) );
+ word = '';
+ wordlen = 0;
+ }
+ m = match_unicode( re_obj.re.substr( re_obj.pos-1 ) );
+ re_obj.pos += m[0].length-1;
+ sequence.push( Node(T_UNICODECHAR, m[0], {"Char": fromCharCode(parseInt(m[1], 16)), "Code": m[1]}) );
+ }
+
+ // hex character
+ else if ( 'x' === ch )
+ {
+ if ( wordlen )
+ {
+ sequence.push( Node(T_STRING, word) );
+ word = '';
+ wordlen = 0;
+ }
+ m = match_hex( re_obj.re.substr( re_obj.pos-1 ) );
+ re_obj.pos += m[0].length-1;
+ sequence.push( Node(T_HEXCHAR, m[0], {"Char": fromCharCode(parseInt(m[1], 16)), "Code": m[1]}) );
+ }
+
+ else if ( HAS.call(specialCharsEscaped,ch) && ('/' !== ch) )
+ {
+ if ( wordlen )
+ {
+ sequence.push( Node(T_STRING, word) );
+ word = '';
+ wordlen = 0;
+ }
+ flag = {};
+ flag[ specialCharsEscaped[ch] ] = 1;
+ sequence.push( Node(T_SPECIAL, ch, flag) );
+ }
+
+ else if ( ('1' <= ch) && ('9' >= ch) )
+ {
+ if ( wordlen )
+ {
+ sequence.push( Node(T_STRING, word) );
+ word = '';
+ wordlen = 0;
+ }
+ word = ch;
+ while (re_obj.pos < lre)
+ {
+ ch = re_obj.re[CHAR]( re_obj.pos );
+ if ( ('0' <= ch) && ('9' >= ch) ) { word += ch; re_obj.pos++; }
+ else break;
+ }
+ flag = {};
+ flag[ 'BackReference' ] = 1;
+ flag[ 'GroupIndex' ] = parseInt(word, 10);
+ sequence.push( Node(T_SPECIAL, word, flag) );
+ word = '';
+ }
+
+ else
+ {
+ word += ch;
+ wordlen += 1;
+ }
+ }
+
+ else
+ {
+ // group end
+ if ( (re_obj.inGroup > 0) && (')' === ch) )
+ {
+ if ( wordlen )
+ {
+ sequence.push( Node(T_STRING, word) );
+ word = '';
+ wordlen = 0;
+ }
+ if ( alternation.length )
+ {
+ alternation.push( Node(T_SEQUENCE, sequence) );
+ sequence = [];
+ flag = {};
+ flag[ specialChars['|'] ] = 1;
+ return Node(T_GROUP, Node(T_ALTERNATION, alternation, flag), flags);
+ }
+ else
+ {
+ return Node(T_GROUP, Node(T_SEQUENCE, sequence), flags);
+ }
+ }
+
+ // parse alternation
+ else if ( '|' === ch )
+ {
+ if ( wordlen )
+ {
+ sequence.push( Node(T_STRING, word) );
+ word = '';
+ wordlen = 0;
+ }
+ alternation.push( Node(T_SEQUENCE, sequence) );
+ sequence = [];
+ }
+
+ // parse character group
+ else if ( '[' === ch )
+ {
+ if ( wordlen )
+ {
+ sequence.push( Node(T_STRING, word) );
+ word = '';
+ wordlen = 0;
+ }
+ sequence.push( chargroup( re_obj ) );
+ }
+
+ // parse sub-group
+ else if ( '(' === ch )
+ {
+ if ( wordlen )
+ {
+ sequence.push( Node(T_STRING, word) );
+ word = '';
+ wordlen = 0;
+ }
+ re_obj.inGroup += 1;
+ sequence.push( analyze_re( re_obj ) );
+ re_obj.inGroup -= 1;
+ }
+
+ // parse num repeats
+ else if ( '{' === ch )
+ {
+ if ( wordlen )
+ {
+ sequence.push( Node(T_STRING, word) );
+ word = '';
+ wordlen = 0;
+ }
+ m = match_repeats( re_obj.re.substr( re_obj.pos-1 ) );
+ re_obj.pos += m[0].length-1;
+ flag = { val: m[0], "MatchMinimum": m[1], "MatchMaximum": m[2] || "unlimited", "min": parseInt(m[1],10), "max": m[2] ? parseInt(m[2],10) : -1 };
+ flag[ specialChars[ch] ] = 1;
+ if ( (re_obj.pos < lre) && ('?' === re_obj.re[CHAR](re_obj.pos)) )
+ {
+ flag[ "isGreedy" ] = 0;
+ re_obj.pos++;
+ }
+ else
+ {
+ flag[ "isGreedy" ] = 1;
+ }
+ var prev = sequence.pop();
+ if ( (T_STRING === prev.type) && (prev.val.length > 1) )
+ {
+ sequence.push( Node(T_STRING, prev.val.slice(0, -1)) );
+ prev.val = prev.val.slice(-1);
+ }
+ sequence.push( Node(T_QUANTIFIER, prev, flag) );
+ }
+
+ // quantifiers
+ else if ( ('*' === ch) || ('+' === ch) || ('?' === ch) )
+ {
+ if ( wordlen )
+ {
+ sequence.push( Node(T_STRING, word) );
+ word = '';
+ wordlen = 0;
+ }
+ flag = {};
+ flag[ specialChars[ch] ] = 1;
+ flag["min"] = '+' === ch ? 1 : 0;
+ flag["max"] = '?' === ch ? 1 : -1;
+ if ( (re_obj.pos < lre) && ('?' === re_obj.re[CHAR](re_obj.pos)) )
+ {
+ flag[ "isGreedy" ] = 0;
+ re_obj.pos++;
+ }
+ else
+ {
+ flag[ "isGreedy" ] = 1;
+ }
+ var prev = sequence.pop();
+ if ( (T_STRING === prev.type) && (prev.val.length > 1) )
+ {
+ sequence.push( Node(T_STRING, prev.val.slice(0, -1)) );
+ prev.val = prev.val.slice(-1);
+ }
+ sequence.push( Node(T_QUANTIFIER, prev, flag) );
+ }
+
+ // special characters like ^, $, ., etc..
+ else if ( HAS.call(specialChars,ch) )
+ {
+ if ( wordlen )
+ {
+ sequence.push( Node(T_STRING, word) );
+ word = '';
+ wordlen = 0;
+ }
+ flag = {};
+ flag[ specialChars[ch] ] = 1;
+ sequence.push( Node(T_SPECIAL, ch, flag) );
+ }
+
+ else
+ {
+ word += ch;
+ wordlen += 1;
+ }
+ }
+ }
+
+ if ( wordlen )
+ {
+ sequence.push( Node(T_STRING, word) );
+ word = '';
+ wordlen = 0;
+ }
+
+ if ( alternation.length )
+ {
+ alternation.push( Node(T_SEQUENCE, sequence) );
+ sequence = [];
+ flag = {};
+ flags[ specialChars['|'] ] = 1;
+ return Node(T_ALTERNATION, alternation, flag);
+ }
+ return Node(T_SEQUENCE, sequence);
+ }
+;
+
+// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
+// https://docs.python.org/3/library/re.html
+// http://php.net/manual/en/reference.pcre.pattern.syntax.php
+// A simple regular expression analyzer
+// Constructor, callable with or without `new`. When `re` is truthy it is
+// handed to input() along with `delim`.
+function Analyzer( re, delim )
+{
+    var self = this;
+    if ( self instanceof Analyzer )
+    {
+        if ( re ) self.input( re, delim );
+        return;
+    }
+    return new Analyzer(re, delim);
+}
+Analyzer.VERSION = __version__;
+Analyzer[PROTO] = {
+
+ constructor: Analyzer,
+
+ ast: null,
+ re: null,
+ fl: null,
+ src: null,
+ grp: null,
+ min: null,
+ max: null,
+ ch: null,
+
+ // reset every field of the analyzer, input regex included, to null;
+ // returns the analyzer
+ dispose: function( ) {
+     var self = this, i,
+         fields = ['ast', 're', 'fl', 'src', 'grp', 'min', 'max', 'ch'];
+     for (i = 0; i < fields.length; i++)
+     {
+         self[fields[i]] = null;
+     }
+     return self;
+ },
+
+ // discard everything derived from the input regex (the input regex and
+ // its flags are kept); returns the analyzer
+ reset: function( ) {
+     var self = this, i,
+         fields = ['ast', 'src', 'grp', 'min', 'max', 'ch'];
+     for (i = 0; i < fields.length; i++)
+     {
+         self[fields[i]] = null;
+     }
+     return self;
+ },
+
+ // Getter/setter for the regex to analyze.
+ // Without arguments, returns the current regex source. Otherwise, when
+ // `re` is truthy, stores its string form and returns the analyzer.
+ // Unless `delim` is false, `re` is expected to be delimited (default
+ // delimiter: `/`): every character after the last delimiter is recorded
+ // as a flag, and the delimiters are removed. Derived data is reset when
+ // the resulting source differs from the current one.
+ input: function( re, delim ) {
+     var self = this, src, end, ch, fl;
+     if ( 0 === arguments.length ) return self.re;
+     if ( !re ) return self;
+     src = re.toString( );
+     fl = {};
+     if ( false !== delim )
+     {
+         delim = delim || '/';
+         // parse re flags, if any: walk backward to the last delimiter
+         for (end = src.length; 0 < end; end--)
+         {
+             ch = src[CHAR](end-1);
+             if ( delim === ch ) break;
+             fl[ ch ] = 1;
+         }
+         if ( 0 >= end )
+         {
+             // no delimiter found at all
+             src = '';
+         }
+         else if ( (delim === src[CHAR](0)) && (delim === src[CHAR](end-1)) )
+         {
+             // remove re delimiters
+             src = src.slice(1, end-1);
+         }
+         else
+         {
+             src = src.slice(0, end);
+         }
+     }
+     // re is different, reset the ast, etc
+     if ( self.re !== src ) self.reset();
+     self.re = src;
+     self.fl = fl;
+     return self;
+ },
+
+ // parse the input regex into a syntax tree, unless there is no input or
+ // the tree already exists; returns the analyzer
+ analyze: function( ) {
+     var self = this, re_obj;
+     if ( (null == self.re) || (null !== self.ast) ) return self;
+     re_obj = new RE_OBJ(self.re);
+     self.ast = analyze_re( re_obj );
+     re_obj.dispose();
+     return self;
+ },
+
+ // Walk the syntax tree (analyzing the input first when needed) to build
+ // the regex source and the group map, stored in `src` and `grp`. Nothing
+ // is recomputed when `src` is already available. `escaped` is forwarded
+ // to the walker as a boolean, true unless explicitly false.
+ // Returns the analyzer.
+ synthesize: function( escaped ) {
+     var self = this, walked;
+     if ( null == self.re ) return self;
+     if ( null === self.ast )
+     {
+         self.analyze( );
+         self.src = null;
+         self.grp = null;
+     }
+     if ( null !== self.src ) return self;
+     walked = walk({src:'',group:{}}, self.ast, {
+         MAP : T_SEQUENCE|T_ALTERNATION|T_GROUP|T_CHARGROUP|T_QUANTIFIER,
+         REDUCE : T_UNICODECHAR|T_HEXCHAR|T_SPECIAL|T_CHARS|T_CHARRANGE|T_STRING,
+         IGNORE : T_COMMENT,
+         map : map_src,
+         reduce : reduce_src,
+         escaped : false !== escaped,
+         group : {}
+     });
+     self.src = walked.src;
+     self.grp = walked.group;
+     return self;
+ },
+
+ // regex source synthesized from the syntax tree (computed on first use),
+ // or null when no regex was supplied
+ source: function( ) {
+     var self = this;
+     if ( null != self.re )
+     {
+         if ( null === self.src ) self.synthesize();
+         return self.src;
+     }
+     return null;
+ },
+
+ // Group map collected while synthesizing the regex source (computed on
+ // first use), or null when no regex was supplied. With `raw === true` the
+ // internal object is returned as is, otherwise a shallow copy is returned.
+ groups: function( raw ) {
+     var self = this;
+     if ( null == self.re ) return null;
+     if ( null === self.grp ) self.synthesize();
+     // was `sel.grp`: an undeclared identifier, which threw a
+     // ReferenceError whenever the raw object was requested
+     return true===raw ? self.grp : clone(self.grp);
+ },
+
+ // Build a RegExp from the synthesized source. Flags are read from
+ // `flags`, else from the flags recorded by input(); only g, i, m and y
+ // are honored, in either letter case. Returns null when no regex was
+ // supplied.
+ compile: function( flags ) {
+     var self = this, fl, src, letters = 'gimy', reFlags = '', i, c;
+     if ( null == self.re ) return null;
+     fl = flags || self.fl || {};
+     src = self.source();
+     for (i = 0; i < letters.length; i++)
+     {
+         c = letters[CHAR](i);
+         if ( fl[c] || fl[c.toUpperCase()] ) reFlags += c;
+     }
+     return new RegExp(src, reFlags);
+ },
+
+ // Syntax tree of the input regex (computed on first use), or null when no
+ // regex was supplied. With `flat === true` the tree is converted to plain
+ // objects through toObject(), otherwise the root Node is returned.
+ tree: function( flat ) {
+     var self = this;
+     if ( null == self.re ) return null;
+     if ( null === self.ast ) self.analyze( );
+     if ( true === flat ) return self.ast.toObject();
+     return self.ast;
+ },
+
+ // experimental feature
+ sample: function( maxlen, numsamples ) {
+ var self = this, state;
+ if ( null == self.re ) return null;
+ if ( null === self.ast ) self.analyze( );
+ state = {
+ MAP : T_SEQUENCE|T_ALTERNATION|T_GROUP|T_CHARGROUP|T_QUANTIFIER,
+ REDUCE : T_UNICODECHAR|T_HEXCHAR|T_SPECIAL|T_CHARS|T_CHARRANGE|T_STRING,
+ IGNORE : T_COMMENT,
+ map : map_any,
+ reduce : reduce_str,
+ maxLength : (maxlen|0) || 1,
+ isCaseInsensitive : null != self.fl.i,
+ group : {}
+ };
+ numsamples = (numsamples|0) || 1;
+ if ( 1 < numsamples )
+ {
+ var samples = new Array(numsamples);
+ for(var i=0; i