- /*jshint node:true */
- /*
- The MIT License (MIT)
- Copyright (c) 2007-2018 Einar Lielmanis, Liam Newman, and contributors.
- Permission is hereby granted, free of charge, to any person
- obtaining a copy of this software and associated documentation files
- (the "Software"), to deal in the Software without restriction,
- including without limitation the rights to use, copy, modify, merge,
- publish, distribute, sublicense, and/or sell copies of the Software,
- and to permit persons to whom the Software is furnished to do so,
- subject to the following conditions:
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
- */
- 'use strict';
- var InputScanner = require('../core/inputscanner').InputScanner;
- var BaseTokenizer = require('../core/tokenizer').Tokenizer;
- var BASETOKEN = require('../core/tokenizer').TOKEN;
- var Directives = require('../core/directives').Directives;
- var acorn = require('./acorn');
- var Pattern = require('../core/pattern').Pattern;
- var TemplatablePattern = require('../core/templatablepattern').TemplatablePattern;
- function in_array(what, arr) {
- return arr.indexOf(what) !== -1;
- }
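- // Token type names produced by this tokenizer. START, RAW, and EOF are
- // shared with the core tokenizer so downstream consumers see one namespace.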
- var TOKEN = {
- START_EXPR: 'TK_START_EXPR',
- END_EXPR: 'TK_END_EXPR',
- START_BLOCK: 'TK_START_BLOCK',
- END_BLOCK: 'TK_END_BLOCK',
- WORD: 'TK_WORD',
- RESERVED: 'TK_RESERVED',
- SEMICOLON: 'TK_SEMICOLON',
- STRING: 'TK_STRING',
- EQUALS: 'TK_EQUALS',
- OPERATOR: 'TK_OPERATOR',
- COMMA: 'TK_COMMA',
- BLOCK_COMMENT: 'TK_BLOCK_COMMENT',
- COMMENT: 'TK_COMMENT',
- DOT: 'TK_DOT',
- UNKNOWN: 'TK_UNKNOWN',
- START: BASETOKEN.START,
- RAW: BASETOKEN.RAW,
- EOF: BASETOKEN.EOF
- };
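- // Formatting directives are read out of /* ... */ comments (e.g. ignore:start,
- // which _read_comment below checks for).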
- var directives_core = new Directives(/\/\*/, /\*\//);
- var number_pattern = /0[xX][0123456789abcdefABCDEF]*|0[oO][01234567]*|0[bB][01]*|\d+n|(?:\.\d+|\d+\.?\d*)(?:[eE][+-]?\d+)?/;
- var digit = /[0-9]/;
- // A lone dot "." must be distinguished from "..." (spread/rest) and from decimal number literals
- var dot_pattern = /[^\d\.]/;
- var positionable_operators = (
- ">>> === !== " +
- "<< && >= ** != == <= >> || ?? |> " +
- "< / - + > : & % ? ^ | *").split(' ');
- // IMPORTANT: this must be sorted longest to shortest or tokenizing may not work.
- // Also, you must update positionable_operators separately from punct
- var punct =
- ">>>= " +
- "... >>= <<= === >>> !== **= " +
- "=> ^= :: /= << <= == && -= >= >> != -- += ** || ?? ++ %= &= *= |= |> " +
- "= ! ? > < : / ^ - + * & % ~ |";
- punct = punct.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
- // ?. but not if followed by a number
- punct = '\\?\\.(?!\\d) ' + punct;
- punct = punct.replace(/ /g, '|');
- var punct_pattern = new RegExp(punct);
- // Words which should always start on a new line.
- var line_starters = 'continue,try,throw,return,var,let,const,if,switch,case,default,for,while,break,function,import,export'.split(',');
- var reserved_words = line_starters.concat(['do', 'in', 'of', 'else', 'get', 'set', 'new', 'catch', 'finally', 'typeof', 'yield', 'async', 'await', 'from', 'as']);
- var reserved_word_pattern = new RegExp('^(?:' + reserved_words.join('|') + ')$');
- // var template_pattern = /(?:(?:<\?php|<\?=)[\s\S]*?\?>)|(?:<%[\s\S]*?%>)/g;
- var in_html_comment;
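- // JavaScript tokenizer. Extends the core Tokenizer with the patterns needed
- // for identifiers, numbers, punctuation, comments, strings, and templates.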
- var Tokenizer = function(input_string, options) {
- BaseTokenizer.call(this, input_string, options);
- this._patterns.whitespace = this._patterns.whitespace.matching(
- /\u00A0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000\ufeff/.source,
- /\u2028\u2029/.source);
- var pattern_reader = new Pattern(this._input);
- var templatable = new TemplatablePattern(this._input)
- .read_options(this._options);
- this.__patterns = {
- template: templatable,
- identifier: templatable.starting_with(acorn.identifier).matching(acorn.identifierMatch),
- number: pattern_reader.matching(number_pattern),
- punct: pattern_reader.matching(punct_pattern),
- // comment ends just before nearest linefeed or end of file
- comment: pattern_reader.starting_with(/\/\//).until(/[\n\r\u2028\u2029]/),
- // /* ... */ comment ends with nearest */ or end of file
- block_comment: pattern_reader.starting_with(/\/\*/).until_after(/\*\//),
- html_comment_start: pattern_reader.matching(/<!--/),
- html_comment_end: pattern_reader.matching(/-->/),
- include: pattern_reader.starting_with(/#include/).until_after(acorn.lineBreak),
- shebang: pattern_reader.starting_with(/#!/).until_after(acorn.lineBreak),
- xml: pattern_reader.matching(/[\s\S]*?<(\/?)([-a-zA-Z:0-9_.]+|{[\s\S]+?}|!\[CDATA\[[\s\S]*?\]\])(\s+{[\s\S]+?}|\s+[-a-zA-Z:0-9_.]+|\s+[-a-zA-Z:0-9_.]+\s*=\s*('[^']*'|"[^"]*"|{[\s\S]+?}))*\s*(\/?)\s*>/),
- single_quote: templatable.until(/['\\\n\r\u2028\u2029]/),
- double_quote: templatable.until(/["\\\n\r\u2028\u2029]/),
- template_text: templatable.until(/[`\\$]/),
- template_expression: templatable.until(/[`}\\]/)
- };
- };
- Tokenizer.prototype = new BaseTokenizer();
- Tokenizer.prototype._is_comment = function(current_token) {
- return current_token.type === TOKEN.COMMENT || current_token.type === TOKEN.BLOCK_COMMENT || current_token.type === TOKEN.UNKNOWN;
- };
- Tokenizer.prototype._is_opening = function(current_token) {
- return current_token.type === TOKEN.START_BLOCK || current_token.type === TOKEN.START_EXPR;
- };
- Tokenizer.prototype._is_closing = function(current_token, open_token) {
- return (current_token.type === TOKEN.END_BLOCK || current_token.type === TOKEN.END_EXPR) &&
- (open_token && (
- (current_token.text === ']' && open_token.text === '[') ||
- (current_token.text === ')' && open_token.text === '(') ||
- (current_token.text === '}' && open_token.text === '{')));
- };
- Tokenizer.prototype._reset = function() {
- in_html_comment = false;
- };
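- // Each _read_* helper returns a token if it matches at the current input
- // position and a falsy value otherwise, so the order of this chain decides
- // precedence; anything unrecognized falls through to TK_UNKNOWN.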
- Tokenizer.prototype._get_next_token = function(previous_token, open_token) { // jshint unused:false
- var token = null;
- this._readWhitespace();
- var c = this._input.peek();
- if (c === null) {
- return this._create_token(TOKEN.EOF, '');
- }
- token = token || this._read_non_javascript(c);
- token = token || this._read_string(c);
- token = token || this._read_word(previous_token);
- token = token || this._read_singles(c);
- token = token || this._read_comment(c);
- token = token || this._read_regexp(c, previous_token);
- token = token || this._read_xml(c, previous_token);
- token = token || this._read_punctuation();
- token = token || this._create_token(TOKEN.UNKNOWN, this._input.next());
- return token;
- };
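- // Identifiers, reserved words, and numeric literals.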
- Tokenizer.prototype._read_word = function(previous_token) {
- var resulting_string;
- resulting_string = this.__patterns.identifier.read();
- if (resulting_string !== '') {
- resulting_string = resulting_string.replace(acorn.allLineBreaks, '\n');
- if (!(previous_token.type === TOKEN.DOT ||
- (previous_token.type === TOKEN.RESERVED && (previous_token.text === 'set' || previous_token.text === 'get'))) &&
- reserved_word_pattern.test(resulting_string)) {
- if (resulting_string === 'in' || resulting_string === 'of') { // hack for 'in' and 'of' operators
- return this._create_token(TOKEN.OPERATOR, resulting_string);
- }
- return this._create_token(TOKEN.RESERVED, resulting_string);
- }
- return this._create_token(TOKEN.WORD, resulting_string);
- }
- resulting_string = this.__patterns.number.read();
- if (resulting_string !== '') {
- return this._create_token(TOKEN.WORD, resulting_string);
- }
- };
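- // Single-character tokens: brackets, braces, semicolon, comma, and a lone
- // dot that is not part of a number or the spread operator.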
- Tokenizer.prototype._read_singles = function(c) {
- var token = null;
- if (c === '(' || c === '[') {
- token = this._create_token(TOKEN.START_EXPR, c);
- } else if (c === ')' || c === ']') {
- token = this._create_token(TOKEN.END_EXPR, c);
- } else if (c === '{') {
- token = this._create_token(TOKEN.START_BLOCK, c);
- } else if (c === '}') {
- token = this._create_token(TOKEN.END_BLOCK, c);
- } else if (c === ';') {
- token = this._create_token(TOKEN.SEMICOLON, c);
- } else if (c === '.' && dot_pattern.test(this._input.peek(1))) {
- token = this._create_token(TOKEN.DOT, c);
- } else if (c === ',') {
- token = this._create_token(TOKEN.COMMA, c);
- }
- if (token) {
- this._input.next();
- }
- return token;
- };
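- // Multi-character operators and punctuation; '=' and '?.' get their own token types.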
- Tokenizer.prototype._read_punctuation = function() {
- var resulting_string = this.__patterns.punct.read();
- if (resulting_string !== '') {
- if (resulting_string === '=') {
- return this._create_token(TOKEN.EQUALS, resulting_string);
- } else if (resulting_string === '?.') {
- return this._create_token(TOKEN.DOT, resulting_string);
- } else {
- return this._create_token(TOKEN.OPERATOR, resulting_string);
- }
- }
- };
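- // Constructs that are not standard JavaScript: shebang lines, ExtendScript
- // #include, SpiderMonkey sharp variables, and <!-- / --> HTML comment wrappers.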
- Tokenizer.prototype._read_non_javascript = function(c) {
- var resulting_string = '';
- if (c === '#') {
- if (this._is_first_token()) {
- resulting_string = this.__patterns.shebang.read();
- if (resulting_string) {
- return this._create_token(TOKEN.UNKNOWN, resulting_string.trim() + '\n');
- }
- }
- // handles ExtendScript #include directives
- resulting_string = this.__patterns.include.read();
- if (resulting_string) {
- return this._create_token(TOKEN.UNKNOWN, resulting_string.trim() + '\n');
- }
- c = this._input.next();
- // Spidermonkey-specific sharp variables for circular references. Considered obsolete.
- var sharp = '#';
- if (this._input.hasNext() && this._input.testChar(digit)) {
- do {
- c = this._input.next();
- sharp += c;
- } while (this._input.hasNext() && c !== '#' && c !== '=');
- if (c === '#') {
- // sharp variable reference (e.g. #1#) - nothing more to read
- } else if (this._input.peek() === '[' && this._input.peek(1) === ']') {
- sharp += '[]';
- this._input.next();
- this._input.next();
- } else if (this._input.peek() === '{' && this._input.peek(1) === '}') {
- sharp += '{}';
- this._input.next();
- this._input.next();
- }
- return this._create_token(TOKEN.WORD, sharp);
- }
- this._input.back();
- } else if (c === '<' && this._is_first_token()) {
- resulting_string = this.__patterns.html_comment_start.read();
- if (resulting_string) {
- while (this._input.hasNext() && !this._input.testChar(acorn.newline)) {
- resulting_string += this._input.next();
- }
- in_html_comment = true;
- return this._create_token(TOKEN.COMMENT, resulting_string);
- }
- } else if (in_html_comment && c === '-') {
- resulting_string = this.__patterns.html_comment_end.read();
- if (resulting_string) {
- in_html_comment = false;
- return this._create_token(TOKEN.COMMENT, resulting_string);
- }
- }
- return null;
- };
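- // Line (// ...) and block (/* ... */) comments; block comments may carry
- // formatting directives.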
- Tokenizer.prototype._read_comment = function(c) {
- var token = null;
- if (c === '/') {
- var comment = '';
- if (this._input.peek(1) === '*') {
- // peek for comment /* ... */
- comment = this.__patterns.block_comment.read();
- var directives = directives_core.get_directives(comment);
- if (directives && directives.ignore === 'start') {
- comment += directives_core.readIgnored(this._input);
- }
- comment = comment.replace(acorn.allLineBreaks, '\n');
- token = this._create_token(TOKEN.BLOCK_COMMENT, comment);
- token.directives = directives;
- } else if (this._input.peek(1) === '/') {
- // peek for comment // ...
- comment = this.__patterns.comment.read();
- token = this._create_token(TOKEN.COMMENT, comment);
- }
- }
- return token;
- };
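- // String and template literals; escape decoding is deferred until the whole
- // literal has been read.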
- Tokenizer.prototype._read_string = function(c) {
- if (c === '`' || c === "'" || c === '"') {
- var resulting_string = this._input.next();
- this.has_char_escapes = false;
- if (c === '`') {
- resulting_string += this._read_string_recursive('`', true, '${');
- } else {
- resulting_string += this._read_string_recursive(c);
- }
- if (this.has_char_escapes && this._options.unescape_strings) {
- resulting_string = unescape_string(resulting_string);
- }
- if (this._input.peek() === c) {
- resulting_string += this._input.next();
- }
- resulting_string = resulting_string.replace(acorn.allLineBreaks, '\n');
- return this._create_token(TOKEN.STRING, resulting_string);
- }
- return null;
- };
- Tokenizer.prototype._allow_regexp_or_xml = function(previous_token) {
- // regex and xml can only appear in specific locations during parsing
- return (previous_token.type === TOKEN.RESERVED && in_array(previous_token.text, ['return', 'case', 'throw', 'else', 'do', 'typeof', 'yield'])) ||
- (previous_token.type === TOKEN.END_EXPR && previous_token.text === ')' &&
- previous_token.opened.previous.type === TOKEN.RESERVED && in_array(previous_token.opened.previous.text, ['if', 'while', 'for'])) ||
- (in_array(previous_token.type, [TOKEN.COMMENT, TOKEN.START_EXPR, TOKEN.START_BLOCK, TOKEN.START,
- TOKEN.END_BLOCK, TOKEN.OPERATOR, TOKEN.EQUALS, TOKEN.EOF, TOKEN.SEMICOLON, TOKEN.COMMA
- ]));
- };
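- // Regular expression literals, including trailing flags.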
- Tokenizer.prototype._read_regexp = function(c, previous_token) {
- if (c === '/' && this._allow_regexp_or_xml(previous_token)) {
- // handle regexp
- //
- var resulting_string = this._input.next();
- var esc = false;
- var in_char_class = false;
- while (this._input.hasNext() &&
- ((esc || in_char_class || this._input.peek() !== c) &&
- !this._input.testChar(acorn.newline))) {
- resulting_string += this._input.peek();
- if (!esc) {
- esc = this._input.peek() === '\\';
- if (this._input.peek() === '[') {
- in_char_class = true;
- } else if (this._input.peek() === ']') {
- in_char_class = false;
- }
- } else {
- esc = false;
- }
- this._input.next();
- }
- if (this._input.peek() === c) {
- resulting_string += this._input.next();
- // regexps may have flags /regexp/FLAGS, so fetch those, too
- // Only letters such as [gimsuy] are valid flags, but if the user puts in garbage, take what we can.
- resulting_string += this._input.read(acorn.identifier);
- }
- return this._create_token(TOKEN.STRING, resulting_string);
- }
- return null;
- };
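- // E4X XML literals (only when the e4x option is enabled).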
- Tokenizer.prototype._read_xml = function(c, previous_token) {
- if (this._options.e4x && c === "<" && this._allow_regexp_or_xml(previous_token)) {
- var xmlStr = '';
- var match = this.__patterns.xml.read_match();
- // handle e4x xml literals
- //
- if (match) {
- // Normalize whitespace inside a curly-brace root tag ({ expr }) so later tags can be compared against it
- var rootTag = match[2].replace(/^{\s+/, '{').replace(/\s+}$/, '}');
- var isCurlyRoot = rootTag.indexOf('{') === 0;
- var depth = 0;
- while (match) {
- var isEndTag = !!match[1];
- var tagName = match[2];
- var isSingletonTag = (!!match[match.length - 1]) || (tagName.slice(0, 8) === "![CDATA[");
- if (!isSingletonTag &&
- (tagName === rootTag || (isCurlyRoot && tagName.replace(/^{\s+/, '{').replace(/\s+}$/, '}') === rootTag))) {
- if (isEndTag) {
- --depth;
- } else {
- ++depth;
- }
- }
- xmlStr += match[0];
- if (depth <= 0) {
- break;
- }
- match = this.__patterns.xml.read_match();
- }
- // if we didn't close correctly, keep unformatted.
- if (!match) {
- xmlStr += this._input.match(/[\s\S]*/g)[0];
- }
- xmlStr = xmlStr.replace(acorn.allLineBreaks, '\n');
- return this._create_token(TOKEN.STRING, xmlStr);
- }
- }
- return null;
- };
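- // Decode \xNN and \uNNNN escapes in a completed string literal, bailing out
- // and returning the original string whenever decoding looks unsafe.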
- function unescape_string(s) {
- // You might think that a regex would work for this
- // return s.replace(/\\x([0-9a-f]{2})/gi, function(match, val) {
- // return String.fromCharCode(parseInt(val, 16));
- // })
- // However, dealing with '\xff', '\\xff', '\\\xff' makes this more fun.
- var out = '',
- escaped = 0;
- var input_scan = new InputScanner(s);
- var matched = null;
- while (input_scan.hasNext()) {
- // Keep any whitespace, non-slash characters
- // also keep slash pairs.
- matched = input_scan.match(/([\s]|[^\\]|\\\\)+/g);
- if (matched) {
- out += matched[0];
- }
- if (input_scan.peek() === '\\') {
- input_scan.next();
- if (input_scan.peek() === 'x') {
- matched = input_scan.match(/x([0-9A-Fa-f]{2})/g);
- } else if (input_scan.peek() === 'u') {
- matched = input_scan.match(/u([0-9A-Fa-f]{4})/g);
- } else {
- out += '\\';
- if (input_scan.hasNext()) {
- out += input_scan.next();
- }
- continue;
- }
- // If there's some error decoding, return the original string
- if (!matched) {
- return s;
- }
- escaped = parseInt(matched[1], 16);
- if (escaped > 0x7e && escaped <= 0xff && matched[0].indexOf('x') === 0) {
- // we bail out on \x7f..\xff,
- // leaving whole string escaped,
- // as it's probably completely binary
- return s;
- } else if (escaped >= 0x00 && escaped < 0x20) {
- // leave 0x00...0x1f escaped
- out += '\\' + matched[0];
- continue;
- } else if (escaped === 0x22 || escaped === 0x27 || escaped === 0x5c) {
- // double-quote, single-quote, and backslash - escape these
- out += '\\' + String.fromCharCode(escaped);
- } else {
- out += String.fromCharCode(escaped);
- }
- }
- }
- return out;
- }
- // Read the body of a string or template literal. `delimiter` ends the read,
- // `allow_unescaped_newlines` permits raw line breaks (template literals), and
- // `start_sub` is the sequence that starts a nested read ('${' inside a
- // template, '`' inside a substitution).
- Tokenizer.prototype._read_string_recursive = function(delimiter, allow_unescaped_newlines, start_sub) {
- var current_char;
- var pattern;
- if (delimiter === '\'') {
- pattern = this.__patterns.single_quote;
- } else if (delimiter === '"') {
- pattern = this.__patterns.double_quote;
- } else if (delimiter === '`') {
- pattern = this.__patterns.template_text;
- } else if (delimiter === '}') {
- pattern = this.__patterns.template_expression;
- }
- var resulting_string = pattern.read();
- var next = '';
- while (this._input.hasNext()) {
- next = this._input.next();
- if (next === delimiter ||
- (!allow_unescaped_newlines && acorn.newline.test(next))) {
- this._input.back();
- break;
- } else if (next === '\\' && this._input.hasNext()) {
- current_char = this._input.peek();
- if (current_char === 'x' || current_char === 'u') {
- this.has_char_escapes = true;
- } else if (current_char === '\r' && this._input.peek(1) === '\n') {
- this._input.next();
- }
- next += this._input.next();
- } else if (start_sub) {
- if (start_sub === '${' && next === '$' && this._input.peek() === '{') {
- next += this._input.next();
- }
- if (start_sub === next) {
- if (delimiter === '`') {
- next += this._read_string_recursive('}', allow_unescaped_newlines, '`');
- } else {
- next += this._read_string_recursive('`', allow_unescaped_newlines, '${');
- }
- if (this._input.hasNext()) {
- next += this._input.next();
- }
- }
- }
- next += pattern.read();
- resulting_string += next;
- }
- return resulting_string;
- };
- module.exports.Tokenizer = Tokenizer;
- module.exports.TOKEN = TOKEN;
- module.exports.positionable_operators = positionable_operators.slice();
- module.exports.line_starters = line_starters.slice();
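- // Usage sketch (illustration only, not part of this module). It assumes the
- // core Tokenizer exposes a tokenize() method returning a token stream with
- // next(), and that a js-beautify Options instance from './options' is passed
- // in, as the beautifier itself does:
- //
- //   var Tokenizer = require('./tokenizer').Tokenizer;
- //   var Options = require('./options').Options;
- //   var stream = new Tokenizer('var answer = 6 * 7;', new Options()).tokenize();
- //   for (var token = stream.next(); token && token.type !== 'TK_EOF'; token = stream.next()) {
- //     console.log(token.type, JSON.stringify(token.text));
- //   }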