392 lines
10 KiB
JavaScript
392 lines
10 KiB
JavaScript
// Copyright 2012 The Closure Library Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS-IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
/**
|
|
* @fileoverview Provides a parser that turns a string of well-formed CSV data
|
|
* into an array of objects or an array of arrays. All values are returned as
|
|
* strings; the user has to convert data into numbers or Dates as required.
|
|
* Empty fields (adjacent commas) are returned as empty strings.
|
|
*
|
|
* This parser uses http://tools.ietf.org/html/rfc4180 as the definition of CSV.
|
|
*
|
|
*/
|
|
goog.provide('goog.labs.format.csv');
|
|
goog.provide('goog.labs.format.csv.ParseError');
|
|
goog.provide('goog.labs.format.csv.Token');
|
|
|
|
goog.require('goog.array');
|
|
goog.require('goog.asserts');
|
|
goog.require('goog.debug.Error');
|
|
goog.require('goog.object');
|
|
goog.require('goog.string');
|
|
goog.require('goog.string.newlines');
|
|
|
|
|
|
/**
 * @define {boolean} Whether parse errors carry verbose debugging details.
 *     When true, goog.labs.format.csv.ParseError builds a message containing
 *     the line and column of the failure and records them in its
 *     {@code position} property.  The value defaults to goog.DEBUG; because it
 *     is a compiler define it can still be switched on for a production build,
 *     and when it is off the message-building code can be stripped to keep the
 *     compiled output small.
 */
goog.labs.format.csv.ENABLE_VERBOSE_DEBUGGING = goog.DEBUG;
|
|
|
|
|
|
|
|
/**
 * Error raised when CSV text cannot be parsed.
 *
 * When goog.labs.format.csv.ENABLE_VERBOSE_DEBUGGING is on, the error message
 * is the supplied description followed by the 1-based line and column of the
 * failure and a copy of the offending line with a caret under the failing
 * column.  When the flag is off no message is built and {@code position}
 * stays null.
 *
 * @param {string} text The CSV source text being parsed.
 * @param {number} index The index, in the string, of the position of the
 *     error.
 * @param {string=} opt_message A description of the violated parse expectation.
 * @constructor
 * @extends {goog.debug.Error}
 */
goog.labs.format.csv.ParseError = function(text, index, opt_message) {

  /**
   * @type {?{line: number, column: number}} The 1-based line and column of
   *     the parse error, or null when they were not computed.
   */
  this.position = null;

  // Deliberately left undefined unless verbose debugging is enabled.
  var description;

  if (goog.labs.format.csv.ENABLE_VERBOSE_DEBUGGING) {
    var lineInfo = goog.labs.format.csv.ParseError.findLineInfo_(text, index);
    description = opt_message || '';

    if (lineInfo) {
      // Convert the 0-based line index and string offset to 1-based numbers.
      var line = lineInfo.lineIndex + 1;
      var column = index - lineInfo.line.startLineIndex + 1;
      var pointer = goog.labs.format.csv.ParseError.getLineDebugString_(
          lineInfo.line.getContent(), column);

      this.position = {line: line, column: column};

      description = description +
          goog.string.subs(' at line %s column %s', line, column) +
          '\n' + pointer;
    }
  }

  goog.base(this, description);
};
goog.inherits(goog.labs.format.csv.ParseError, goog.debug.Error);


/** @inheritDoc */
goog.labs.format.csv.ParseError.prototype.name = 'ParseError';
|
|
|
|
|
|
/**
 * Finds the line of a string that contains a given index.
 * TODO(nnaze): Consider moving to goog.string.newlines.
 * @param {string} str A string.
 * @param {number} index An index into the string.
 * @return {?{line: !goog.string.newlines.Line, lineIndex: number}} The line
 *     containing the index together with that line's 0-based position in the
 *     string, or null if no line of the string contains the index.
 * @private
 */
goog.labs.format.csv.ParseError.findLineInfo_ = function(str, index) {
  var lines = goog.string.newlines.getLines(str);
  var lineIndex = goog.array.findIndex(lines, function(line) {
    return line.startLineIndex <= index && line.endLineIndex > index;
  });

  // goog.array.findIndex signals "no match" with -1, which is still a number,
  // so the result has to be range-checked rather than type-checked.  Without
  // this, an out-of-range index would yield an info object whose line is
  // undefined instead of the documented null.
  if (lineIndex < 0) {
    return null;
  }

  return {
    line: lines[lineIndex],
    lineIndex: lineIndex
  };
};
|
|
|
|
|
|
/**
 * Builds a two-line debug string: the given line of text, then a second line
 * consisting of padding spaces and a caret placed under the requested column.
 * @param {string} str The string.
 * @param {number} column The column to point at (1-indexed).
 * @return {string} The debug line.
 * @private
 */
goog.labs.format.csv.ParseError.getLineDebugString_ = function(str, column) {
  // A 1-indexed column needs (column - 1) spaces in front of the caret.
  var caretLine = goog.string.repeat(' ', column - 1) + '^';
  return str + '\n' + caretLine;
};
|
|
|
|
|
|
/**
 * A unit of parser input: either a string holding exactly one character of
 * the source text, or one of the sentinel objects that stand for conditions
 * which are not ordinary characters.
 * @typedef {string|!goog.labs.format.csv.Sentinels_}
 */
goog.labs.format.csv.Token;
|
|
|
|
|
|
/**
 * Parses a CSV string to create a two-dimensional array.
 *
 * This function does not process header lines, etc -- such transformations can
 * be made on the resulting array.
 *
 * A blank line between records yields an empty array.  A comma directly before
 * a line break or the end of the text yields a trailing empty-string field,
 * whether the preceding field was quoted or not.
 *
 * @param {string} text The entire CSV text to be parsed.
 * @return {!Array.<!Array.<string>>} The parsed CSV.
 * @throws {goog.labs.format.csv.ParseError} If a quote mark appears inside an
 *     unquoted field, if a closing quote is followed by anything other than a
 *     comma, a line break or the end of the text, or if a quoted field is
 *     never closed.
 */
goog.labs.format.csv.parse = function(text) {

  var index = 0;  // current char offset being considered

  // Sentinels are unique objects and are always compared by identity (===).
  // A loose comparison would coerce them to the string '[object Object]' and
  // could therefore mistake real field text for a sentinel.
  var EOF = goog.labs.format.csv.Sentinels_.EOF;
  var EOR = goog.labs.format.csv.Sentinels_.EOR;
  var NEWLINE = goog.labs.format.csv.Sentinels_.NEWLINE;  // \n, \r\n or lone \r
  var EMPTY = goog.labs.format.csv.Sentinels_.EMPTY;

  var pushBackToken = null;  // A single-token pushback.
  var sawComma = false;      // Special case for terminal comma.

  /**
   * Push a single token into the push-back variable.  Only one token can be
   * pending at a time.
   * @param {goog.labs.format.csv.Token} t Single token.
   */
  function pushBack(t) {
    goog.labs.format.csv.assertToken_(t);
    goog.asserts.assert(goog.isNull(pushBackToken));
    pushBackToken = t;
  }

  /**
   * Returns the pushed-back token if there is one; otherwise consumes input
   * and returns EOF at the end of the text, NEWLINE for a line break, or the
   * next character.
   * @return {goog.labs.format.csv.Token} The next token in the stream.
   */
  function nextToken() {

    // Give the push back token if present.
    if (pushBackToken != null) {
      var c = pushBackToken;
      pushBackToken = null;
      return c;
    }

    // We're done.  EOF.
    if (index >= text.length) {
      return EOF;
    }

    // Give the next character.
    var chr = text.charAt(index++);
    goog.labs.format.csv.assertToken_(chr);

    // Check if this is a newline.  If so, give the new line sentinel.
    if (chr === '\n') {
      return NEWLINE;
    }

    if (chr === '\r') {
      // This may be a '\r\n' newline.  Treat it as a single token by also
      // stepping over the '\n', so the offset advances by two characters.
      if (index < text.length && text.charAt(index) === '\n') {
        index++;
      }
      return NEWLINE;
    }

    return chr;
  }

  /**
   * Read a quoted field from input.  The opening quote has already been
   * consumed; on return the closing quote and a following comma (if any) have
   * been consumed as well, while a following line break is pushed back.
   * @return {string} The field, as a string.
   */
  function readQuotedField() {
    // We've already consumed the first quote by the time we get here.
    var start = index;
    var end = null;

    for (var token = nextToken(); token !== EOF; token = nextToken()) {
      if (token === '"') {
        // Tentatively treat this quote as the closing one.
        end = index - 1;
        token = nextToken();

        // Two double quotes in a row.  Keep scanning.
        if (token === '"') {
          end = null;
          continue;
        }

        // End of field.  Break out.
        if (token === ',') {
          // Remember the delimiter so that a comma which ends the line or the
          // text still produces a trailing empty field, exactly as it does
          // after an unquoted field.
          sawComma = true;
          break;
        }

        if (token === NEWLINE) {
          // Let the next readField() call see the end of the record.
          pushBack(token);
          break;
        }

        if (token === EOF) {
          break;
        }

        throw new goog.labs.format.csv.ParseError(
            text, index - 1,
            'Unexpected character "' + token + '" after quote mark');
      }
    }

    if (goog.isNull(end)) {
      throw new goog.labs.format.csv.ParseError(
          text, text.length - 1,
          'Unexpected end of text after open quote');
    }

    // Take substring, combine double quotes.
    return text.substring(start, end).replace(/""/g, '"');
  }

  /**
   * Read a field from input.
   * @return {string|!goog.labs.format.csv.Sentinels_} The field, as a string,
   *     or the EOR sentinel when the current record has no more fields.
   */
  function readField() {
    var start = index;
    var didSeeComma = sawComma;
    sawComma = false;
    var token = nextToken();

    // The trailing empty field of this record was already delivered.
    if (token === EMPTY) {
      return EOR;
    }

    if (token === EOF || token === NEWLINE) {
      if (didSeeComma) {
        // The record ended right after a comma: deliver one empty field now
        // and arrange for the following call to report the end of the record.
        pushBack(EMPTY);
        return '';
      }
      return EOR;
    }

    // This is the beginning of a quoted field.
    if (token === '"') {
      return readQuotedField();
    }

    while (true) {

      // This is the end of line or file.
      if (token === EOF || token === NEWLINE) {
        pushBack(token);
        break;
      }

      // This is the end of the field.
      if (token === ',') {
        sawComma = true;
        break;
      }

      if (token === '"') {
        throw new goog.labs.format.csv.ParseError(text, index - 1,
            'Unexpected quote mark');
      }

      token = nextToken();
    }

    var returnString = (token === EOF) ?
        text.substring(start) :  // Return to end of file.
        text.substring(start, index - 1);

    // Squash any CRLFs.  A '\r\n' break advances the offset by two, so the
    // substring above can still end with the '\r'.
    return returnString.replace(/[\r\n]+/g, '');
  }

  /**
   * Read the next record.
   * @return {!Array.<string>|!goog.labs.format.csv.Sentinels_} A single record
   *     with multiple fields, or the EOF sentinel once the text is exhausted.
   */
  function readRecord() {
    if (index >= text.length) {
      return EOF;
    }
    var record = [];
    // Identity comparison: a field whose text is '[object Object]' must not
    // be mistaken for the end-of-record sentinel.
    for (var field = readField(); field !== EOR; field = readField()) {
      record.push(field);
    }
    return record;
  }

  // Read all records and return.
  var records = [];
  for (var record = readRecord(); record !== EOF; record = readRecord()) {
    records.push(record);
  }
  return records;
};
|
|
|
|
|
|
/**
 * Marker objects used by the parser for conditions that are not ordinary
 * characters or field values.  Each member is a distinct empty object.
 * @enum {Object}
 * @private
 */
goog.labs.format.csv.Sentinels_ = {
  /** Pushed back after a trailing empty field has been returned. */
  EMPTY: {},

  /** The end of the input text has been reached. */
  EOF: {},

  /** The current record has no more fields. */
  EOR: {},

  /** A line break in the input. */
  NEWLINE: {}
};
|
|
|
|
|
|
/**
 * Tests whether a value is a string holding exactly one character.
 * @param {string} str A string.
 * @return {boolean} Whether the string is a single character.
 * @private
 */
goog.labs.format.csv.isCharacterString_ = function(str) {
  // Non-strings are rejected before their length is looked at.
  if (!goog.isString(str)) {
    return false;
  }
  return str.length == 1;
};
|
|
|
|
|
|
/**
 * Asserts that a value is a token: a string of length one, or one of the
 * members of goog.labs.format.csv.Sentinels_.
 * @param {*} o What should be a token.
 * @throws {goog.asserts.AssertionError} If {@code o} is not a token.
 * @private
 */
goog.labs.format.csv.assertToken_ = function(o) {
  // Strings must be single characters; anything else must be a sentinel.
  var isToken = goog.isString(o) ?
      goog.labs.format.csv.isCharacterString_(o) :
      goog.object.containsValue(goog.labs.format.csv.Sentinels_, o);

  goog.asserts.assert(isToken,
      'Should be a string of length 1 or a sentinel.');
};
|
|
|
|
|
|
|