From 0844df8cc2db9bd66c750871bcf0f651e4a96b69 Mon Sep 17 00:00:00 2001
From: Tim Schaub <tim.schaub@gmail.com>
Date: Fri, 7 Jun 2013 11:52:34 -0600
Subject: [PATCH] Scanning identifiers

This includes code that is likely not necessary.  The escape sequence scanning will likely not be used in our case, but I'm committing it here so it can be brought back if needed later.
---
 src/ol/expression/lexer.js            | 191 +++++++++++++++++++++++++-
 test/spec/ol/expression/lexer.test.js |  91 +++++++++++-
 2 files changed, 279 insertions(+), 3 deletions(-)

diff --git a/src/ol/expression/lexer.js b/src/ol/expression/lexer.js
index db47971464..e19111dfcc 100644
--- a/src/ol/expression/lexer.js
+++ b/src/ol/expression/lexer.js
@@ -9,6 +9,7 @@ goog.require('goog.asserts');
  */
 ol.expression.Char = {
   AMPERSAND: 38,
+  BACKSLASH: 92,
   BANG: 33, // !
   CARRIAGE_RETURN: 13,
   COMMA: 44,
@@ -28,6 +29,7 @@ ol.expression.Char = {
   LOWER_A: 97,
   LOWER_E: 101,
   LOWER_F: 102,
+  LOWER_U: 117,
   LOWER_X: 120,
   LOWER_Z: 122,
   MINUS: 45,
@@ -249,6 +251,47 @@ ol.expression.Lexer.prototype.isIdentifierStart_ = function(code) {
 };
 
 
+/**
+ * Determine if the given identifier is an ECMAScript 5.1 keyword (exact,
+ * case-sensitive match).  Keywords cannot be used as identifiers in programs;
+ * nothing prevents ol expressions from allowing them, so this may be relaxed.
+ * Future reserved words and the null/boolean literals are not in this list.
+ * http://www.ecma-international.org/ecma-262/5.1/#sec-7.6.1.1
+ * @param {string} id Identifier.
+ * @return {boolean} The identifier is a keyword.
+ * @private
+ */
+ol.expression.Lexer.prototype.isKeyword_ = function(id) {
+  return (
+      id === 'break' ||
+      id === 'case' ||
+      id === 'catch' ||
+      id === 'continue' ||
+      id === 'debugger' ||
+      id === 'default' ||
+      id === 'delete' ||
+      id === 'do' ||
+      id === 'else' ||
+      id === 'finally' ||
+      id === 'for' ||
+      id === 'function' ||
+      id === 'if' ||
+      id === 'in' ||
+      id === 'instanceof' ||
+      id === 'new' ||
+      id === 'return' ||
+      id === 'switch' ||
+      id === 'this' ||
+      id === 'throw' ||
+      id === 'try' ||
+      id === 'typeof' ||
+      id === 'var' ||
+      id === 'void' ||
+      id === 'while' ||
+      id === 'with');
+};
+
+
 /**
  * http://www.ecma-international.org/ecma-262/5.1/#sec-7.3
  *
@@ -331,6 +374,129 @@ ol.expression.Lexer.prototype.getCurrentCharCode_ = function() {
 };
 
 
+/**
+ * Get an identifier that includes \uXXXX escape sequences.  An escaped first
+ * character must be a valid identifier start; later ones a valid part.
+ * @return {string} The identifier with escape sequences decoded.
+ * @throws {Error} If an escape is malformed or decodes to an illegal character.
+ * @private
+ */
+ol.expression.Lexer.prototype.getEscapedIdentifier_ = function() {
+  var code = this.getCurrentCharCode_();
+  var id = String.fromCharCode(code);
+  this.increment_(1);
+
+  // the \u sequence denotes an escaped character
+  if (code === ol.expression.Char.BACKSLASH) {
+    if (this.getCurrentCharCode_() !== ol.expression.Char.LOWER_U) {
+      throw new Error('Unexpected token at index ' + this.index_ +
+          ': ' + this.getCurrentChar_());
+    }
+    this.increment_(1);
+    code = this.scanEscapeSequence_(ol.expression.Char.LOWER_U);
+
+    if (!code || code === ol.expression.Char.BACKSLASH ||
+        !this.isIdentifierStart_(code)) {
+      throw new Error('Unexpected token at index ' + this.index_ +
+          ': ' + this.getCurrentChar_());
+    }
+    id = String.fromCharCode(code);
+  }
+
+  while (this.index_ < this.length_) {
+    code = this.getCurrentCharCode_();
+    if (!this.isIdentifierPart_(code)) {
+      break;
+    }
+    this.increment_(1);
+    id += String.fromCharCode(code);
+
+    // the \u sequence denotes an escaped character
+    if (code === ol.expression.Char.BACKSLASH) {
+      if (this.getCurrentCharCode_() !== ol.expression.Char.LOWER_U) {
+        throw new Error('Unexpected token at index ' + this.index_ +
+            ': ' + this.getCurrentChar_());
+      }
+      id = id.slice(0, -1);
+      this.increment_(1);
+      code = this.scanEscapeSequence_(ol.expression.Char.LOWER_U);
+      // past the first character, any identifier part (e.g. a digit) is ok
+      if (!code || code === ol.expression.Char.BACKSLASH ||
+          !this.isIdentifierPart_(code)) {
+        throw new Error('Unexpected token at index ' + this.index_ +
+            ': ' + this.getCurrentChar_());
+      }
+      id += String.fromCharCode(code);
+    }
+  }
+
+  return id;
+};
+
+
+/**
+ * Get an identifier.  This assumes we've encountered an identifier that doesn't
+ * start with an escape sequence.  If a backslash is encountered during the
+ * scan, the cursor is reset to the start of the identifier and scanning is
+ * restarted with the `getEscapedIdentifier_` method.
+ * @return {string} The identifier.
+ * @private
+ */
+ol.expression.Lexer.prototype.getIdentifier_ = function() {
+  goog.asserts.assert(
+      this.getCurrentCharCode_() !== ol.expression.Char.BACKSLASH,
+      'Must not be called with first char a backslash');
+
+  var start = this.index_;
+  this.increment_(1);
+
+  var code;
+  while (this.index_ < this.length_) {
+    code = this.getCurrentCharCode_();
+    if (code === ol.expression.Char.BACKSLASH) {
+      // reset cursor and start over scanning escaped identifier
+      this.index_ = start;
+      return this.getEscapedIdentifier_();
+    }
+    if (this.isIdentifierPart_(code)) {
+      this.increment_(1);
+    } else {
+      break;
+    }
+  }
+  return this.source_.slice(start, this.index_);
+};
+
+
+/**
+ * Scan the hex digits of an escape sequence whose prefix has already been
+ * consumed: four digits when the prefix is `u` (e.g. \u0123), two digits for
+ * any other prefix (e.g. \x12).  The cursor is not rewound on failure.
+ * http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.4
+ *
+ * @param {number} prefix The character code of the escape prefix.
+ * @return {number} The character code denoted by the escape sequence, or 0
+ *     if it is invalid (so an escaped NUL cannot be told apart from failure).
+ * @private
+ */
+ol.expression.Lexer.prototype.scanEscapeSequence_ = function(prefix) {
+  var code = 0;
+  var len = (prefix === ol.expression.Char.LOWER_U) ? 4 : 2;
+  var ch;
+  for (var i = 0; i < len; ++i) {
+    if (this.index_ < this.length_ &&
+        this.isHexDigit_(this.getCurrentCharCode_())) {
+      ch = this.getCurrentChar_();
+      code = (code * 16) + parseInt(ch, 16);
+      this.increment_(1);
+    } else {
+      return 0;
+    }
+  }
+  return code;
+};
+
+
 /**
  * Scan hex literal as numeric token.
  *
@@ -376,7 +542,30 @@ ol.expression.Lexer.prototype.scanHexLiteral_ = function() {
  * @private
  */
 ol.expression.Lexer.prototype.scanIdentifier_ = function() {
-  throw new Error('Not yet implemented');
+  var code = this.getCurrentCharCode_();
+  goog.asserts.assert(this.isIdentifierStart_(code),
+      'Must be called with a valid identifier');
+
+  var id = (code === ol.expression.Char.BACKSLASH) ?
+      this.getEscapedIdentifier_() : this.getIdentifier_();
+
+  var type;
+  if (id.length === 1) { // a single character cannot be a keyword or literal
+    type = ol.expression.TokenType.IDENTIFIER;
+  } else if (this.isKeyword_(id)) {
+    type = ol.expression.TokenType.KEYWORD;
+  } else if (id === 'null') {
+    type = ol.expression.TokenType.NULL_LITERAL;
+  } else if (id === 'true' || id === 'false') {
+    type = ol.expression.TokenType.BOOLEAN_LITERAL;
+  } else {
+    type = ol.expression.TokenType.IDENTIFIER;
+  }
+
+  return {
+    type: type,
+    value: id
+  };
 };
 
 
diff --git a/test/spec/ol/expression/lexer.test.js b/test/spec/ol/expression/lexer.test.js
index 89f37614d5..0bb094a281 100644
--- a/test/spec/ol/expression/lexer.test.js
+++ b/test/spec/ol/expression/lexer.test.js
@@ -9,10 +9,97 @@ describe('ol.expression.Lexer', function() {
     });
   });
 
+  describe('#scanIdentifier_()', function() {
+
+    function scan(source) {
+      var lexer = new ol.expression.Lexer(source);
+      return lexer.scanIdentifier_();
+    }
+
+    it('works for short identifiers', function() {
+      var token = scan('a');
+      expect(token.value).to.be('a');
+      expect(token.type).to.be(ol.expression.TokenType.IDENTIFIER);
+    });
+
+    it('works for longer identifiers', function() {
+      var token = scan('foo');
+      expect(token.value).to.be('foo');
+      expect(token.type).to.be(ol.expression.TokenType.IDENTIFIER);
+    });
+
+    it('works for $ anywhere', function() {
+      var token = scan('$foo$bar$');
+      expect(token.value).to.be('$foo$bar$');
+      expect(token.type).to.be(ol.expression.TokenType.IDENTIFIER);
+    });
+
+    it('works for _ anywhere', function() {
+      var token = scan('_foo_bar_');
+      expect(token.value).to.be('_foo_bar_');
+      expect(token.type).to.be(ol.expression.TokenType.IDENTIFIER);
+    });
+
+    it('works for keywords', function() {
+      var token = scan('delete');
+      expect(token.value).to.be('delete');
+      expect(token.type).to.be(ol.expression.TokenType.KEYWORD);
+    });
+
+    it('works for null', function() {
+      var token = scan('null');
+      expect(token.value).to.be('null');
+      expect(token.type).to.be(ol.expression.TokenType.NULL_LITERAL);
+    });
+
+    it('works for boolean true', function() {
+      var token = scan('true');
+      expect(token.value).to.be('true');
+      expect(token.type).to.be(ol.expression.TokenType.BOOLEAN_LITERAL);
+    });
+
+    it('works for boolean false', function() {
+      var token = scan('false');
+      expect(token.value).to.be('false');
+      expect(token.type).to.be(ol.expression.TokenType.BOOLEAN_LITERAL);
+    });
+
+    it('works with unicode escape sequences', function() {
+      var token = scan('\u006f\u006c\u0033'); // lexer only sees 'ol3'
+      expect(token.value).to.be('ol3');
+      expect(token.type).to.be(ol.expression.TokenType.IDENTIFIER);
+    });
+
+    it('works with hex escape sequences', function() {
+      var token = scan('\x6f\x6c\x33'); // lexer only sees 'ol3'
+      expect(token.value).to.be('ol3');
+      expect(token.type).to.be(ol.expression.TokenType.IDENTIFIER);
+    });
+
+    it('throws for identifiers starting with a number', function() {
+      expect(function() {
+        scan('4foo');
+      }).throwException();
+    });
+
+    it('throws for identifiers starting with a punctuation char', function() {
+      expect(function() {
+        scan('!foo');
+      }).throwException();
+    });
+
+    it('only scans valid identifier part', function() {
+      var token = scan('foo>bar');
+      expect(token.value).to.be('foo');
+      expect(token.type).to.be(ol.expression.TokenType.IDENTIFIER);
+    });
+
+  });
+
   describe('#scanNumericLiteral_()', function() {
 
-    function scan(code) {
-      var lexer = new ol.expression.Lexer(code);
+    function scan(source) {
+      var lexer = new ol.expression.Lexer(source);
       return lexer.scanNumericLiteral_();
     }