356 lines
13 KiB
JavaScript
356 lines
13 KiB
JavaScript
// Copyright 2006 The Closure Library Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS-IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
/**
|
|
* @fileoverview Methods for annotating occurrences of query terms in text or
|
|
* in a DOM tree. Adapted from Gmail code.
|
|
*
|
|
*/
|
|
|
|
goog.provide('goog.dom.annotate');
|
|
|
|
goog.require('goog.array');
|
|
goog.require('goog.dom');
|
|
goog.require('goog.dom.NodeType');
|
|
goog.require('goog.string');
|
|
|
|
|
|
/**
|
|
* Calls {@code annotateFn} for each occurrence of a search term in text nodes
|
|
* under {@code node}. Returns the number of hits.
|
|
*
|
|
* @param {Node} node A DOM node.
|
|
* @param {Array} terms An array of [searchTerm, matchWholeWordOnly] tuples.
|
|
* The matchWholeWordOnly value is a per-term attribute because some terms
|
|
* may be CJK, while others are not. (For correctness, matchWholeWordOnly
|
|
* should always be false for CJK terms.).
|
|
* @param {Function} annotateFn A function that takes
|
|
* (1) the number of the term that is "hit",
|
|
* (2) the HTML string (search term) to be annotated,
|
|
* and returns the annotated term as an HTML string.
|
|
* @param {*=} opt_ignoreCase Whether to ignore the case of the query
|
|
* terms when looking for matches.
|
|
* @param {Array.<string>=} opt_classesToSkip Nodes with one of these CSS class
|
|
* names (and its descendants) will be skipped.
|
|
* @param {number=} opt_maxMs Number of milliseconds after which this function,
|
|
* if still annotating, should stop and return.
|
|
*
|
|
* @return {boolean} Whether any terms were annotated.
|
|
*/
|
|
goog.dom.annotate.annotateTerms = function(node, terms, annotateFn,
|
|
opt_ignoreCase,
|
|
opt_classesToSkip,
|
|
opt_maxMs) {
|
|
if (opt_ignoreCase) {
|
|
terms = goog.dom.annotate.lowercaseTerms_(terms);
|
|
}
|
|
var stopTime = opt_maxMs > 0 ? goog.now() + opt_maxMs : 0;
|
|
|
|
return goog.dom.annotate.annotateTermsInNode_(
|
|
node, terms, annotateFn, opt_ignoreCase, opt_classesToSkip || [],
|
|
stopTime, 0);
|
|
};
|
|
|
|
|
|
/**
|
|
* The maximum recursion depth allowed. Any DOM nodes deeper than this are
|
|
* ignored.
|
|
* @type {number}
|
|
* @private
|
|
*/
|
|
goog.dom.annotate.MAX_RECURSION_ = 200;
|
|
|
|
|
|
/**
|
|
* The node types whose descendants should not be affected by annotation.
|
|
* @type {Array}
|
|
* @private
|
|
*/
|
|
goog.dom.annotate.NODES_TO_SKIP_ = ['SCRIPT', 'STYLE', 'TEXTAREA'];
|
|
|
|
|
|
/**
|
|
* Recursive helper function.
|
|
*
|
|
* @param {Node} node A DOM node.
|
|
* @param {Array} terms An array of [searchTerm, matchWholeWordOnly] tuples.
|
|
* The matchWholeWordOnly value is a per-term attribute because some terms
|
|
* may be CJK, while others are not. (For correctness, matchWholeWordOnly
|
|
* should always be false for CJK terms.).
|
|
* @param {Function} annotateFn function(number, string) : string A function
|
|
* that takes :
|
|
* (1) the number of the term that is "hit",
|
|
* (2) the HTML string (search term) to be annotated,
|
|
* and returns the annotated term as an HTML string.
|
|
* @param {*} ignoreCase Whether to ignore the case of the query terms
|
|
* when looking for matches.
|
|
* @param {Array.<string>} classesToSkip Nodes with one of these CSS class
|
|
* names will be skipped (as will their descendants).
|
|
* @param {number} stopTime Deadline for annotation operation (ignored if 0).
|
|
* @param {number} recursionLevel How deep this recursive call is; pass the
|
|
* value 0 in the initial call.
|
|
* @return {boolean} Whether any terms were annotated.
|
|
* @private
|
|
*/
|
|
goog.dom.annotate.annotateTermsInNode_ =
|
|
function(node, terms, annotateFn, ignoreCase, classesToSkip,
|
|
stopTime, recursionLevel) {
|
|
if ((stopTime > 0 && goog.now() >= stopTime) ||
|
|
recursionLevel > goog.dom.annotate.MAX_RECURSION_) {
|
|
return false;
|
|
}
|
|
|
|
var annotated = false;
|
|
|
|
if (node.nodeType == goog.dom.NodeType.TEXT) {
|
|
var html = goog.dom.annotate.helpAnnotateText_(node.nodeValue, terms,
|
|
annotateFn, ignoreCase);
|
|
if (html != null) {
|
|
// Replace the text with the annotated html. First we put the html into
|
|
// a temporary node, to get its DOM structure. To avoid adding a wrapper
|
|
// element as a side effect, we'll only actually use the temporary node's
|
|
// children.
|
|
var tempNode = goog.dom.getOwnerDocument(node).createElement('SPAN');
|
|
tempNode.innerHTML = html;
|
|
|
|
var parentNode = node.parentNode;
|
|
var nodeToInsert;
|
|
while ((nodeToInsert = tempNode.firstChild) != null) {
|
|
// Each parentNode.insertBefore call removes the inserted node from
|
|
// tempNode's list of children.
|
|
parentNode.insertBefore(nodeToInsert, node);
|
|
}
|
|
|
|
parentNode.removeChild(node);
|
|
annotated = true;
|
|
}
|
|
} else if (node.hasChildNodes() &&
|
|
!goog.array.contains(goog.dom.annotate.NODES_TO_SKIP_,
|
|
node.tagName)) {
|
|
var classes = node.className.split(/\s+/);
|
|
var skip = goog.array.some(classes, function(className) {
|
|
return goog.array.contains(classesToSkip, className);
|
|
});
|
|
|
|
if (!skip) {
|
|
++recursionLevel;
|
|
var curNode = node.firstChild;
|
|
var numTermsAnnotated = 0;
|
|
while (curNode) {
|
|
var nextNode = curNode.nextSibling;
|
|
var curNodeAnnotated = goog.dom.annotate.annotateTermsInNode_(
|
|
curNode, terms, annotateFn, ignoreCase, classesToSkip,
|
|
stopTime, recursionLevel);
|
|
annotated = annotated || curNodeAnnotated;
|
|
curNode = nextNode;
|
|
}
|
|
}
|
|
}
|
|
|
|
return annotated;
|
|
};
|
|
|
|
|
|
/**
|
|
* Regular expression that matches non-word characters.
|
|
*
|
|
* Performance note: Testing a one-character string using this regex is as fast
|
|
* as the equivalent string test ("a-zA-Z0-9_".indexOf(c) < 0), give or take a
|
|
* few percent. (The regex is about 5% faster in IE 6 and about 4% slower in
|
|
* Firefox 1.5.) If performance becomes critical, it may be better to convert
|
|
* the character to a numerical char code and check whether it falls in the
|
|
* word character ranges. A quick test suggests that could be 33% faster.
|
|
*
|
|
* @type {RegExp}
|
|
* @private
|
|
*/
|
|
goog.dom.annotate.NONWORD_RE_ = /\W/;
|
|
|
|
|
|
/**
|
|
* Annotates occurrences of query terms in plain text. This process consists of
|
|
* identifying all occurrences of all query terms, calling a provided function
|
|
* to get the appropriate replacement HTML for each occurrence, and
|
|
* HTML-escaping all the text.
|
|
*
|
|
* @param {string} text The plain text to be searched.
|
|
* @param {Array} terms An array of
|
|
* [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
|
|
* The matchWholeWordOnly value is a per-term attribute because some terms
|
|
* may be CJK, while others are not. (For correctness, matchWholeWordOnly
|
|
* should always be false for CJK terms.).
|
|
* @param {Function} annotateFn {function(number, string) : string} A function
|
|
* that takes
|
|
* (1) the number of the term that is "hit",
|
|
* (2) the HTML string (search term) to be annotated,
|
|
* and returns the annotated term as an HTML string.
|
|
* @param {*=} opt_ignoreCase Whether to ignore the case of the query
|
|
* terms when looking for matches.
|
|
* @return {?string} The HTML equivalent of {@code text} with terms
|
|
* annotated, or null if the text did not contain any of the terms.
|
|
*/
|
|
goog.dom.annotate.annotateText = function(text, terms, annotateFn,
|
|
opt_ignoreCase) {
|
|
if (opt_ignoreCase) {
|
|
terms = goog.dom.annotate.lowercaseTerms_(terms);
|
|
}
|
|
return goog.dom.annotate.helpAnnotateText_(text, terms, annotateFn,
|
|
opt_ignoreCase);
|
|
};
|
|
|
|
|
|
/**
|
|
* Annotates occurrences of query terms in plain text. This process consists of
|
|
* identifying all occurrences of all query terms, calling a provided function
|
|
* to get the appropriate replacement HTML for each occurrence, and
|
|
* HTML-escaping all the text.
|
|
*
|
|
* @param {string} text The plain text to be searched.
|
|
* @param {Array} terms An array of
|
|
* [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
|
|
* If {@code ignoreCase} is true, each search term must already be lowercase.
|
|
* The matchWholeWordOnly value is a per-term attribute because some terms
|
|
* may be CJK, while others are not. (For correctness, matchWholeWordOnly
|
|
* should always be false for CJK terms.).
|
|
* @param {Function} annotateFn {function(number, string) : string} A function
|
|
* that takes
|
|
* (1) the number of the term that is "hit",
|
|
* (2) the HTML string (search term) to be annotated,
|
|
* and returns the annotated term as an HTML string.
|
|
* @param {*} ignoreCase Whether to ignore the case of the query terms
|
|
* when looking for matches.
|
|
* @return {?string} The HTML equivalent of {@code text} with terms
|
|
* annotated, or null if the text did not contain any of the terms.
|
|
* @private
|
|
*/
|
|
goog.dom.annotate.helpAnnotateText_ = function(text, terms, annotateFn,
|
|
ignoreCase) {
|
|
var hit = false;
|
|
var resultHtml = null;
|
|
var textToSearch = ignoreCase ? text.toLowerCase() : text;
|
|
var textLen = textToSearch.length;
|
|
var numTerms = terms.length;
|
|
|
|
// Each element will be an array of hit positions for the term.
|
|
var termHits = new Array(numTerms);
|
|
|
|
// First collect all the hits into allHits.
|
|
for (var i = 0; i < numTerms; i++) {
|
|
var term = terms[i];
|
|
var hits = [];
|
|
var termText = term[0];
|
|
if (termText != '') {
|
|
var matchWholeWordOnly = term[1];
|
|
var termLen = termText.length;
|
|
var pos = 0;
|
|
// Find each hit for term t and append to termHits.
|
|
while (pos < textLen) {
|
|
var hitPos = textToSearch.indexOf(termText, pos);
|
|
if (hitPos == -1) {
|
|
break;
|
|
} else {
|
|
var prevCharPos = hitPos - 1;
|
|
var nextCharPos = hitPos + termLen;
|
|
if (!matchWholeWordOnly ||
|
|
((prevCharPos < 0 ||
|
|
goog.dom.annotate.NONWORD_RE_.test(
|
|
textToSearch.charAt(prevCharPos))) &&
|
|
(nextCharPos >= textLen ||
|
|
goog.dom.annotate.NONWORD_RE_.test(
|
|
textToSearch.charAt(nextCharPos))))) {
|
|
hits.push(hitPos);
|
|
hit = true;
|
|
}
|
|
pos = hitPos + termLen;
|
|
}
|
|
}
|
|
}
|
|
termHits[i] = hits;
|
|
}
|
|
|
|
if (hit) {
|
|
var html = [];
|
|
var pos = 0;
|
|
|
|
while (true) {
|
|
// First determine which of the n terms is the next hit.
|
|
var termIndexOfNextHit;
|
|
var posOfNextHit = -1;
|
|
|
|
for (var i = 0; i < numTerms; i++) {
|
|
var hits = termHits[i];
|
|
// pull off the position of the next hit of term t
|
|
// (it's always the first in the array because we're shifting
|
|
// hits off the front of the array as we process them)
|
|
// this is the next candidate to consider for the next overall hit
|
|
if (!goog.array.isEmpty(hits)) {
|
|
var hitPos = hits[0];
|
|
|
|
// Discard any hits embedded in the previous hit.
|
|
while (hitPos >= 0 && hitPos < pos) {
|
|
hits.shift();
|
|
hitPos = goog.array.isEmpty(hits) ? -1 : hits[0];
|
|
}
|
|
|
|
if (hitPos >= 0 && (posOfNextHit < 0 || hitPos < posOfNextHit)) {
|
|
termIndexOfNextHit = i;
|
|
posOfNextHit = hitPos;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Quit if there are no more hits.
|
|
if (posOfNextHit < 0) break;
|
|
|
|
// Remove the next hit from our hit list.
|
|
termHits[termIndexOfNextHit].shift();
|
|
|
|
// Append everything from the end of the last hit up to this one.
|
|
html.push(goog.string.htmlEscape(text.substr(pos, posOfNextHit - pos)));
|
|
|
|
// Append the annotated term.
|
|
var termLen = terms[termIndexOfNextHit][0].length;
|
|
var termHtml = goog.string.htmlEscape(text.substr(posOfNextHit, termLen));
|
|
html.push(annotateFn(termIndexOfNextHit, termHtml));
|
|
|
|
pos = posOfNextHit + termLen;
|
|
}
|
|
|
|
// Append everything after the last hit.
|
|
html.push(goog.string.htmlEscape(text.substr(pos)));
|
|
return html.join('');
|
|
} else {
|
|
return null;
|
|
}
|
|
};
|
|
|
|
|
|
/**
|
|
* Converts terms to lowercase.
|
|
*
|
|
* @param {Array} terms An array of
|
|
* [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
|
|
* @return {Array} An array of
|
|
* [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
|
|
* @private
|
|
*/
|
|
goog.dom.annotate.lowercaseTerms_ = function(terms) {
|
|
var lowercaseTerms = [];
|
|
for (var i = 0; i < terms.length; ++i) {
|
|
var term = terms[i];
|
|
lowercaseTerms[i] = [term[0].toLowerCase(), term[1]];
|
|
}
|
|
return lowercaseTerms;
|
|
};
|