l.js 4.54 KB
'use strict';

var Lexer = exports.Lexer = function() {
  this.pos = 0;
  this.buf = null;
  this.buflen = 0;

  // Operator table, mapping operator -> token name
  this.optable = {
    '+':  'PLUS',
    '-':  'MINUS',
    '*':  'MULTIPLY',
    '.':  'PERIOD',
    '\\': 'BACKSLASH',
    ':':  'COLON',
    '%':  'PERCENT',
    '|':  'PIPE',
    '!':  'EXCLAMATION',
    '?':  'QUESTION',
    '#':  'POUND',
    '&':  'AMPERSAND',
    ';':  'SEMI',
    ',':  'COMMA',
    '(':  'L_PAREN',
    ')':  'R_PAREN',
    '<':  'L_ANG',
    '>':  'R_ANG',
    '{':  'L_BRACE',
    '}':  'R_BRACE',
    '[':  'L_BRACKET',
    ']':  'R_BRACKET',
    '=':  'EQUALS'
  };
}

// Initialize the Lexer's buffer. This resets the lexer's internal
// state and subsequent tokens will be returned starting with the
// beginning of the new buffer.
Lexer.prototype.input = function(buf) {
  this.pos = 0;
  this.buf = buf;
  this.buflen = buf.length;
}

// Get the next token from the current buffer. A token is an object with
// the following properties:
// - name: name of the pattern that this token matched (taken from rules).
// - value: actual string value of the token.
// - pos: offset in the current buffer where the token starts.
//
// If there are no more tokens in the buffer, returns null. In case of
// an error throws Error.
Lexer.prototype.token = function() {
  this._skipnontokens();
  if (this.pos >= this.buflen) {
    return null;
  }

  // The char at this.pos is part of a real token. Figure out which.
  var c = this.buf.charAt(this.pos);

  // '/' is treated specially, because it starts a comment if followed by
  // another '/'. If not followed by another '/', it's the DIVIDE
  // operator.
  if (c === '/') {
    var next_c = this.buf.charAt(this.pos + 1);
    if (next_c === '/') {
      return this._process_comment();
    } else {
      return {name: 'DIVIDE', value: '/', pos: this.pos++};
    }
  } else {
    // Look it up in the table of operators
    var op = this.optable[c];
    if (op !== undefined) {
      return {name: op, value: c, pos: this.pos++};
    } else {
      // Not an operator - so it's the beginning of another token.
      if (Lexer._isalpha(c)) {
        return this._process_identifier();
      } else if (Lexer._isdigit(c)) {
        return this._process_number();
      } else if (c === '"') {
        return this._process_quote();
      } else {
        throw Error('Token error at ' + this.pos);
      }
    }
  }
}

Lexer._isnewline = function(c) {
  return c === '\r' || c === '\n';
}

Lexer._isdigit = function(c) {
  return c >= '0' && c <= '9';
}

Lexer._isalpha = function(c) {
  return (c >= 'a' && c <= 'z') ||
         (c >= 'A' && c <= 'Z') ||
         c === '_' || c === '$';
}

Lexer._isalphanum = function(c) {
  return (c >= 'a' && c <= 'z') ||
         (c >= 'A' && c <= 'Z') ||
         (c >= '0' && c <= '9') ||
         c === '_' || c === '$';
}

Lexer.prototype._process_number = function() {
  var endpos = this.pos + 1;
  while (endpos < this.buflen &&
         Lexer._isdigit(this.buf.charAt(endpos))) {
    endpos++;
  }

  var tok = {
    name: 'NUMBER',
    value: this.buf.substring(this.pos, endpos),
    pos: this.pos
  };
  this.pos = endpos;
  return tok;
}

Lexer.prototype._process_comment = function() {
  var endpos = this.pos + 2;
  // Skip until the end of the line
  var c = this.buf.charAt(this.pos + 2);
  while (endpos < this.buflen &&
         !Lexer._isnewline(this.buf.charAt(endpos))) {
    endpos++;
  }

  var tok = {
    name: 'COMMENT',
    value: this.buf.substring(this.pos, endpos),
    pos: this.pos
  };
  this.pos = endpos + 1;
  return tok;
}

Lexer.prototype._process_identifier = function() {
  var endpos = this.pos + 1;
  while (endpos < this.buflen &&
         Lexer._isalphanum(this.buf.charAt(endpos))) {
    endpos++;
  }

  var tok = {
    name: 'IDENTIFIER',
    value: this.buf.substring(this.pos, endpos),
    pos: this.pos
  };
  this.pos = endpos;
  return tok;
}

Lexer.prototype._process_quote = function() {
  // this.pos points at the opening quote. Find the ending quote.
  var end_index = this.buf.indexOf('"', this.pos + 1);

  if (end_index === -1) {
    throw Error('Unterminated quote at ' + this.pos);
  } else {
    var tok = {
      name: 'QUOTE',
      value: this.buf.substring(this.pos, end_index + 1),
      pos: this.pos
    };
    this.pos = end_index + 1;
    return tok;
  }
}

Lexer.prototype._skipnontokens = function() {
  while (this.pos < this.buflen) {
    var c = this.buf.charAt(this.pos);
    if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
      this.pos++;
    } else {
      break;
    }
  }
}