From 4c03b3b35cb571851d13894aa12e0bb00ef0f6b9 Mon Sep 17 00:00:00 2001 From: Erik Timmers Date: Sun, 13 Jul 2014 22:28:45 +0200 Subject: [PATCH] Parse WKT strings using a lexer/parser --- src/ol/format/wktformat.js | 674 +++++++++++++++++++++++++++---------- 1 file changed, 500 insertions(+), 174 deletions(-) diff --git a/src/ol/format/wktformat.js b/src/ol/format/wktformat.js index 7d99daac03..ceea6de969 100644 --- a/src/ol/format/wktformat.js +++ b/src/ol/format/wktformat.js @@ -2,7 +2,6 @@ goog.provide('ol.format.WKT'); goog.require('goog.array'); goog.require('goog.asserts'); -goog.require('goog.string'); goog.require('ol.Feature'); goog.require('ol.format.TextFeature'); goog.require('ol.geom.Geometry'); @@ -41,135 +40,6 @@ ol.format.WKT = function(opt_options) { goog.inherits(ol.format.WKT, ol.format.TextFeature); -/** - * Constants for regExes. - * @enum {RegExp} - */ -ol.format.WKT.regExes = { - typeStr: /^\s*(\w+)\s*\(\s*(.*)\s*\)\s*$/, - spaces: /\s+/, - parenComma: /\)\s*,\s*\(/, - doubleParenComma: /\)\s*\)\s*,\s*\(\s*\(/, - trimParens: /^\s*\(?(.*?)\)?\s*$/, - geomCollection: /,\s*([A-Za-z])/g, - removeNewLine: /[\n\r]/g -}; - - -/** - * @param {string} str WKT point. - * @return {ol.geom.Point} Parsed point. - * @private - */ -ol.format.WKT.prototype.parsePoint_ = function(str) { - var coords = goog.string.trim(str).split(ol.format.WKT.regExes.spaces); - return new ol.geom.Point(goog.array.map(coords, parseFloat)); -}; - - -/** - * @param {string} str WKT linestring. - * @return {ol.geom.LineString} Parsed linestring. - * @private - */ -ol.format.WKT.prototype.parseLineString_ = function(str) { - var points = goog.string.trim(str).split(','); - var coordinates = []; - for (var i = 0, ii = points.length; i < ii; ++i) { - coordinates.push(this.parsePoint_.apply(this, - [points[i]]).getCoordinates()); - } - return new ol.geom.LineString(coordinates); -}; - - -/** - * @param {string} str WKT multipoint. - * @return {ol.geom.MultiPoint} Parsed multipoint. 
- * @private - */ -ol.format.WKT.prototype.parseMultiPoint_ = function(str) { - var point; - var points = goog.string.trim(str).split(','); - var geom = new ol.geom.MultiPoint(null); - for (var i = 0, ii = points.length; i < ii; ++i) { - point = points[i].replace(ol.format.WKT.regExes.trimParens, '$1'); - geom.appendPoint(this.parsePoint_.apply(this, [point])); - } - return geom; -}; - - -/** - * @param {string} str WKT multilinestring. - * @return {ol.geom.MultiLineString} Parsed multilinestring. - * @private - */ -ol.format.WKT.prototype.parseMultiLineString_ = function(str) { - var line; - var lines = goog.string.trim(str).split(ol.format.WKT.regExes.parenComma); - var geom = new ol.geom.MultiLineString(null); - for (var i = 0, ii = lines.length; i < ii; ++i) { - line = lines[i].replace(ol.format.WKT.regExes.trimParens, '$1'); - geom.appendLineString(this.parseLineString_.apply(this, [line])); - } - return geom; -}; - - -/** - * @param {string} str WKT polygon. - * @return {ol.geom.Polygon} Parsed polygon. - * @private - */ -ol.format.WKT.prototype.parsePolygon_ = function(str) { - var ring, linestring, linearring; - var rings = goog.string.trim(str).split(ol.format.WKT.regExes.parenComma); - var coordinates = []; - for (var i = 0, ii = rings.length; i < ii; ++i) { - ring = rings[i].replace(ol.format.WKT.regExes.trimParens, '$1'); - linestring = this.parseLineString_.apply(this, [ring]).getCoordinates(); - coordinates.push(linestring); - } - return new ol.geom.Polygon(coordinates); -}; - - -/** - * @param {string} str WKT multipolygon. - * @return {ol.geom.MultiPolygon} Parsed multipolygon. 
- * @private - */ -ol.format.WKT.prototype.parseMultiPolygon_ = function(str) { - var polygon; - var polygons = goog.string.trim(str).split( - ol.format.WKT.regExes.doubleParenComma); - var geom = new ol.geom.MultiPolygon(null); - for (var i = 0, ii = polygons.length; i < ii; ++i) { - polygon = polygons[i].replace(ol.format.WKT.regExes.trimParens, '$1'); - geom.appendPolygon(this.parsePolygon_.apply(this, [polygon])); - } - return geom; -}; - - -/** - * @param {string} str WKT geometrycollection. - * @return {ol.geom.GeometryCollection} Parsed geometrycollection. - * @private - */ -ol.format.WKT.prototype.parseGeometryCollection_ = function(str) { - // separate components of the collection with | - str = str.replace(ol.format.WKT.regExes.geomCollection, '|$1'); - var wktArray = goog.string.trim(str).split('|'); - var geoms = []; - for (var i = 0, ii = wktArray.length; i < ii; ++i) { - geoms.push(this.parse_.apply(this, [wktArray[i]])); - } - return new ol.geom.GeometryCollection(geoms); -}; - - /** * @param {ol.geom.Point} geom Point geometry. * @return {string} Coordinates part of Point as WKT. @@ -274,50 +144,6 @@ ol.format.WKT.encodeMultiPolygonGeometry_ = function(geom) { }; -/** - * Parse a WKT string. - * @param {string} wkt WKT string. - * @return {ol.geom.Geometry|ol.geom.GeometryCollection|undefined} - * The geometry created. 
- * @private - */ -ol.format.WKT.prototype.parse_ = function(wkt) { - wkt = wkt.replace(ol.format.WKT.regExes.removeNewLine, ' '); - var matches = ol.format.WKT.regExes.typeStr.exec(wkt); - var geometry; - if (matches) { - var type = matches[1].toLowerCase(); - var str = matches[2]; - switch (type) { - case 'point': - geometry = this.parsePoint_(str); - break; - case 'multipoint': - geometry = this.parseMultiPoint_(str); - break; - case 'linestring': - geometry = this.parseLineString_(str); - break; - case 'multilinestring': - geometry = this.parseMultiLineString_(str); - break; - case 'polygon': - geometry = this.parsePolygon_(str); - break; - case 'multipolygon': - geometry = this.parseMultiPolygon_(str); - break; - case 'geometrycollection': - geometry = this.parseGeometryCollection_(str); - break; - default: - throw new Error('Bad geometry type: ' + type); - } - } - return geometry; -}; - - /** * Encode a geometry as WKT. * @param {ol.geom.Geometry} geom The geometry to encode. @@ -348,6 +174,20 @@ ol.format.WKT.GeometryEncoder_ = { }; +/** + * Parse a WKT string. + * @param {string} wkt WKT string. + * @return {ol.geom.Geometry|ol.geom.GeometryCollection|undefined} + * The geometry created. + * @private + */ +ol.format.WKT.prototype.parse_ = function(wkt) { + var lexer = new ol.format.WKT.Lexer(wkt); + var parser = new ol.format.WKT.Parser(lexer); + return parser.parse(); +}; + + /** * Read a feature from a WKT source. * @@ -495,3 +335,489 @@ ol.format.WKT.prototype.writeGeometry; ol.format.WKT.prototype.writeGeometryText = function(geometry) { return ol.format.WKT.encode_(geometry); }; + + +/** + * @typedef {{type: number, value: (number|string|undefined), position: number}} + */ +ol.format.WKT.Token; + + +/** + * @const + * @enum {number} + */ +ol.format.WKT.TokenType = { + TEXT: 1, + LEFT_PAREN: 2, + RIGHT_PAREN: 3, + NUMBER: 4, + COMMA: 5, + EOF: 6 +}; + + + +/** + * Class to tokenize a WKT string. + * @param {string} wkt WKT string. 
+ * @constructor + * @protected + */ +ol.format.WKT.Lexer = function(wkt) { + + /** + * @type {string} + */ + this.wkt = wkt; + + /** + * @type {number} + * @private + */ + this.index_ = -1; +}; + + +/** + * @param {string} c Character. + * @return {boolean} Whether the character is alphabetic. + * @private + */ +ol.format.WKT.Lexer.prototype.isAlpha_ = function(c) { + return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z'; +}; + + +/** + * @param {string} c Character. + * @param {boolean=} opt_decimal Whether the string number + * contains a dot, i.e. is a decimal number. + * @return {boolean} Whether the character is numeric. + * @private + */ +ol.format.WKT.Lexer.prototype.isNumeric_ = function(c, opt_decimal) { + var decimal = goog.isDef(opt_decimal) ? opt_decimal : false; + return c >= '0' && c <= '9' || c == '.' && !decimal; +}; + + +/** + * @param {string} c Character. + * @return {boolean} Whether the character is whitespace. + * @private + */ +ol.format.WKT.Lexer.prototype.isWhiteSpace_ = function(c) { + return c == ' ' || c == '\t' || c == '\r' || c == '\n'; +}; + + +/** + * @return {string} Next string character. + * @private + */ +ol.format.WKT.Lexer.prototype.nextChar_ = function() { + return this.wkt.charAt(++this.index_); +}; + + +/** + * Fetch and return the next token. + * @return {!ol.format.WKT.Token} Next string token. 
+ */ +ol.format.WKT.Lexer.prototype.nextToken = function() { + var c = this.nextChar_(); + var token = {position: this.index_, value: c}; + + if (c == '(') { + token.type = ol.format.WKT.TokenType.LEFT_PAREN; + } else if (c == ',') { + token.type = ol.format.WKT.TokenType.COMMA; + } else if (c == ')') { + token.type = ol.format.WKT.TokenType.RIGHT_PAREN; + } else if (this.isNumeric_(c) || c == '-') { + token.type = ol.format.WKT.TokenType.NUMBER; + token.value = this.readNumber_(); + } else if (this.isAlpha_(c)) { + token.type = ol.format.WKT.TokenType.TEXT; + token.value = this.readText_(); + } else if (this.isWhiteSpace_(c)) { + return this.nextToken(); + } else if (c === '') { + token.type = ol.format.WKT.TokenType.EOF; + } else { + throw new Error('Unexpected character: ' + c); + } + + return token; +}; + + +/** + * @return {number} Numeric token value. + * @private + */ +ol.format.WKT.Lexer.prototype.readNumber_ = function() { + var c, index = this.index_; + var decimal = false; + do { + if (c == '.') { + decimal = true; + } + c = this.nextChar_(); + } while (this.isNumeric_(c, decimal)); + return parseFloat(this.wkt.substring(index, this.index_--)); +}; + + +/** + * @return {string} String token value. + * @private + */ +ol.format.WKT.Lexer.prototype.readText_ = function() { + var c, index = this.index_; + do { + c = this.nextChar_(); + } while (this.isAlpha_(c)); + return this.wkt.substring(index, this.index_--).toLowerCase(); +}; + + + +/** + * Class to parse the tokens from the WKT string. + * @param {ol.format.WKT.Lexer} lexer + * @constructor + * @protected + */ +ol.format.WKT.Parser = function(lexer) { + + /** + * @type {ol.format.WKT.Lexer} + * @private + */ + this.lexer_ = lexer; + + /** + * @type {ol.format.WKT.Token} + * @private + */ + this.token_; + + /** + * @type {number} + * @private + */ + this.dimension_ = 2; +}; + + +/** + * Fetch the next token from the lexer and replace the active token. 
+ * @private + */ +ol.format.WKT.Parser.prototype.consume_ = function() { + this.token_ = this.lexer_.nextToken(); +}; + + +/** + * If the given type matches the current token, consume it. + * @param {ol.format.WKT.TokenType.<number>} type Token type. + * @return {boolean} Whether the token matches the given type. + */ +ol.format.WKT.Parser.prototype.match = function(type) { + var isMatch = this.token_.type == type; + if (isMatch) { + this.consume_(); + } + return isMatch; +}; + + +/** + * Try to parse the tokens provided by the lexer. + * @return {ol.geom.Geometry|ol.geom.GeometryCollection} The geometry. + */ +ol.format.WKT.Parser.prototype.parse = function() { + this.consume_(); + var geometry = this.parseGeometry_(); + goog.asserts.assert(this.token_.type == ol.format.WKT.TokenType.EOF); + return geometry; +}; + + +/** + * @return {!ol.geom.Geometry|!ol.geom.GeometryCollection} The geometry. + * @private + */ +ol.format.WKT.Parser.prototype.parseGeometry_ = function() { + var token = this.token_; + if (this.match(ol.format.WKT.TokenType.TEXT)) { + var geomType = token.value; + if (geomType == 'geometrycollection') { + var geometries = this.parseGeometryCollectionText_(); + return new ol.geom.GeometryCollection(geometries); + } else { + var parser = ol.format.WKT.Parser.GeometryParser_[geomType]; + var ctor = ol.format.WKT.Parser.GeometryConstructor_[geomType]; + if (!goog.isDef(parser) || !goog.isDef(ctor)) { + throw new Error('Invalid geometry type: ' + geomType); + } + var coordinates = parser.call(this); + return new ctor(coordinates); + } + } + this.raiseError_(); +}; + + +/** + * @return {!Array.<ol.geom.Geometry>} A collection of geometries. 
+ * @private + */ +ol.format.WKT.Parser.prototype.parseGeometryCollectionText_ = function() { + if (this.match(ol.format.WKT.TokenType.LEFT_PAREN)) { + var geometries = []; + do { + geometries.push(this.parseGeometry_()); + } while (this.match(ol.format.WKT.TokenType.COMMA)); + if (this.match(ol.format.WKT.TokenType.RIGHT_PAREN)) { + return geometries; + } + } + this.raiseError_(); +}; + + +/** + * @return {!Array.<number>} All values in a point. + * @private + */ +ol.format.WKT.Parser.prototype.parsePointText_ = function() { + if (this.match(ol.format.WKT.TokenType.LEFT_PAREN)) { + var coordinates = this.parsePoint_(); + if (this.match(ol.format.WKT.TokenType.RIGHT_PAREN)) { + return coordinates; + } + } else if (this.isEmptyGeometry_()) { + return []; + } + this.raiseError_(); +}; + + +/** + * @return {!Array.<!Array.<number>>} All points in a linestring. + * @private + */ +ol.format.WKT.Parser.prototype.parseLineStringText_ = function() { + if (this.match(ol.format.WKT.TokenType.LEFT_PAREN)) { + var coordinates = this.parsePointList_(); + if (this.match(ol.format.WKT.TokenType.RIGHT_PAREN)) { + return coordinates; + } + } else if (this.isEmptyGeometry_()) { + return []; + } + this.raiseError_(); +}; + + +/** + * @return {!Array.<!Array.<number>>} All points in a polygon. + * @private + */ +ol.format.WKT.Parser.prototype.parsePolygonText_ = function() { + if (this.match(ol.format.WKT.TokenType.LEFT_PAREN)) { + var coordinates = this.parseLineStringTextList_(); + if (this.match(ol.format.WKT.TokenType.RIGHT_PAREN)) { + return coordinates; + } + } else if (this.isEmptyGeometry_()) { + return []; + } + this.raiseError_(); +}; + + +/** + * @return {!Array.<!Array.<number>>} All points in a multipoint. 
+ * @private + */ +ol.format.WKT.Parser.prototype.parseMultiPointText_ = function() { + if (this.match(ol.format.WKT.TokenType.LEFT_PAREN)) { + var coordinates; + if (this.token_.type == ol.format.WKT.TokenType.LEFT_PAREN) { + coordinates = this.parsePointTextList_(); + } else { + coordinates = this.parsePointList_(); + } + if (this.match(ol.format.WKT.TokenType.RIGHT_PAREN)) { + return coordinates; + } + } else if (this.isEmptyGeometry_()) { + return []; + } + this.raiseError_(); +}; + + +/** + * @return {!Array.<!Array.<number>>} All linestring points + * in a multilinestring. + * @private + */ +ol.format.WKT.Parser.prototype.parseMultiLineStringText_ = function() { + if (this.match(ol.format.WKT.TokenType.LEFT_PAREN)) { + var coordinates = this.parseLineStringTextList_(); + if (this.match(ol.format.WKT.TokenType.RIGHT_PAREN)) { + return coordinates; + } + } else if (this.isEmptyGeometry_()) { + return []; + } + this.raiseError_(); +}; + + +/** + * @return {!Array.<!Array.<number>>} All polygon points in a multipolygon. + * @private + */ +ol.format.WKT.Parser.prototype.parseMultiPolygonText_ = function() { + if (this.match(ol.format.WKT.TokenType.LEFT_PAREN)) { + var coordinates = this.parsePolygonTextList_(); + if (this.match(ol.format.WKT.TokenType.RIGHT_PAREN)) { + return coordinates; + } + } else if (this.isEmptyGeometry_()) { + return []; + } + this.raiseError_(); +}; + + +/** + * @return {!Array.<number>} A point. + * @private + */ +ol.format.WKT.Parser.prototype.parsePoint_ = function() { + var coordinates = []; + for (var i = 0; i < this.dimension_; ++i) { + var token = this.token_; + if (this.match(ol.format.WKT.TokenType.NUMBER)) { + coordinates.push(token.value); + } else { + break; + } + } + if (coordinates.length == this.dimension_) { + return coordinates; + } + this.raiseError_(); +}; + + +/** + * @return {!Array.<!Array.<number>>} An array of points. 
+ * @private + */ +ol.format.WKT.Parser.prototype.parsePointList_ = function() { + var coordinates = [this.parsePoint_()]; + if (this.match(ol.format.WKT.TokenType.COMMA)) { + goog.array.extend(coordinates, this.parsePointList_()); + } + return coordinates; +}; + + +/** + * @return {!Array.<!Array.<number>>} An array of points. + * @private + */ +ol.format.WKT.Parser.prototype.parsePointTextList_ = function() { + var coordinates = [this.parsePointText_()]; + if (this.match(ol.format.WKT.TokenType.COMMA)) { + goog.array.extend(coordinates, this.parsePointTextList_()); + } + return coordinates; +}; + + +/** + * @return {!Array.<!Array.<number>>} An array of points. + * @private + */ +ol.format.WKT.Parser.prototype.parseLineStringTextList_ = function() { + var coordinates = [this.parseLineStringText_()]; + if (this.match(ol.format.WKT.TokenType.COMMA)) { + goog.array.extend(coordinates, this.parseLineStringTextList_()); + } + return coordinates; +}; + + +/** + * @return {!Array.<!Array.<number>>} An array of points. + * @private + */ +ol.format.WKT.Parser.prototype.parsePolygonTextList_ = function() { + var coordinates = [this.parsePolygonText_()]; + if (this.match(ol.format.WKT.TokenType.COMMA)) { + goog.array.extend(coordinates, this.parsePolygonTextList_()); + } + return coordinates; +}; + + +/** + * @return {boolean} Whether the token implies an empty geometry. 
+ * @private + */ +ol.format.WKT.Parser.prototype.isEmptyGeometry_ = function() { + var isEmpty = this.token_.type == ol.format.WKT.TokenType.TEXT && + this.token_.value == 'empty'; + if (isEmpty) { + this.consume_(); + } + return isEmpty; +}; + + +/** + * @private + */ +ol.format.WKT.Parser.prototype.raiseError_ = function() { + throw new Error('Unexpected `' + this.token_.value + + '` at position ' + this.token_.position + + ' in `' + this.lexer_.wkt + '`'); +}; + + +/** + * @enum {function (new:ol.geom.Geometry, Array, ol.geom.GeometryLayout.<string>=)} + * @private + */ +ol.format.WKT.Parser.GeometryConstructor_ = { + 'point': ol.geom.Point, + 'linestring': ol.geom.LineString, + 'polygon': ol.geom.Polygon, + 'multipoint': ol.geom.MultiPoint, + 'multilinestring': ol.geom.MultiLineString, + 'multipolygon': ol.geom.MultiPolygon +}; + + +/** + * @enum {(function(): !Array)} + * @private + */ +ol.format.WKT.Parser.GeometryParser_ = { + 'point': ol.format.WKT.Parser.prototype.parsePointText_, + 'linestring': ol.format.WKT.Parser.prototype.parseLineStringText_, + 'polygon': ol.format.WKT.Parser.prototype.parsePolygonText_, + 'multipoint': ol.format.WKT.Parser.prototype.parseMultiPointText_, + 'multilinestring': ol.format.WKT.Parser.prototype.parseMultiLineStringText_, + 'multipolygon': ol.format.WKT.Parser.prototype.parseMultiPolygonText_ +};