/*
* SplitsBrowser SI HTML - Reads in HTML-format 'SI' results data files.
*
* Copyright (C) 2000-2014 Dave Ryder, Reinhard Balling, Andris Strazdins,
* Ed Nash, Luke Woodward
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
(function () {
"use strict";
var isNotNull = SplitsBrowser.isNotNull;
var throwInvalidData = SplitsBrowser.throwInvalidData;
var throwWrongFileFormat = SplitsBrowser.throwWrongFileFormat;
var parseCourseLength = SplitsBrowser.parseCourseLength;
var normaliseLineEndings = SplitsBrowser.normaliseLineEndings;
var parseTime = SplitsBrowser.parseTime;
var fromOriginalCumTimes = SplitsBrowser.Model.Competitor.fromOriginalCumTimes;
var CourseClass = SplitsBrowser.Model.CourseClass;
var Course = SplitsBrowser.Model.Course;
var Event = SplitsBrowser.Model.Event;
// Regexps to help with parsing.
// Matches a single HTML tag: a '<', one or more non-'>' characters, and a '>'.
var HTML_TAG_STRIP_REGEXP = /<[^>]+>/g;
// Captures a run of digits, dots and commas that is followed (after optional
// whitespace) by 'Km' or 'km'.
var DISTANCE_FIND_REGEXP = /([0-9.,]+)\s*(?:Km|km)/;
// Captures a run of digits that is followed (after optional whitespace) by
// 'Cm', 'Hm', 'hm' or 'm'.
var CLIMB_FIND_REGEXP = /(\d+)\s*(?:Cm|Hm|hm|m)/;
/**
* Tests whether the given string has any content.
* @param {String} string - The string to check.
* @return {boolean} False if the string is null or the empty string, true
*     for anything else.
*/
function isNonEmpty(string) {
    if (string === null || string === "") {
        return false;
    }
    return true;
}
/**
* Tests whether the given string holds a number. Surrounding whitespace is
* ignored; what remains must be non-empty and must satisfy isFinite.
* @param {String} string - The string to test.
* @return {boolean} True if the string contains a number, false if not.
*/
function hasNumber(string) {
    var trimmed = $.trim(string);
    if (trimmed === "") {
        // Reject blank input explicitly, because isFinite("") is true.
        return false;
    }
    return isFinite(trimmed);
}
/**
* Breaks a line into its whitespace-separated parts, discarding the empty
* parts that leading or trailing whitespace would otherwise produce.
* @param {String} line - The line to split.
* @return {Array} Array of whitespace-separated strings.
*/
function splitByWhitespace (line) {
    var pieces = line.split(/\s+/g);
    var nonEmptyPieces = [];
    for (var pieceIdx = 0; pieceIdx < pieces.length; pieceIdx += 1) {
        if (isNonEmpty(pieces[pieceIdx])) {
            nonEmptyPieces.push(pieces[pieceIdx]);
        }
    }
    return nonEmptyPieces;
}
/**
* Removes every HTML tag from a string, leaving only the text between tags.
* @param {String} text - The HTML string to strip tags from.
* @return {String} The input string with HTML tags removed.
*/
function stripHtml(text) {
    var textWithoutTags = text.replace(HTML_TAG_STRIP_REGEXP, "");
    return textWithoutTags;
}
/**
* Returns all matches of the given regexp within the given text,
* after being stripped of HTML.
*
* The regexp must have at least one capturing group; it is the text captured
* by the first group of each match that is stripped of HTML and returned.
*
* Matching relies on the regexp's lastIndex property, so it is recommended
* to pass this function a new regular expression each time, rather than
* using a precompiled regexp.
*
* If the regexp is neither global nor sticky, at most one match is returned,
* as such a regexp would otherwise return the same match indefinitely.
*
* @param {RegExp} regexp - The regular expression to find all matches of.
* @param {String} text - The text to search for matches within.
* @return {Array} Array of strings representing the HTML-stripped regexp
*     matches.
*/
function getHtmlStrippedRegexMatches(regexp, text) {
    var matches = [];
    var match = regexp.exec(text);
    while (match !== null) {
        matches.push(stripHtml(match[1]));

        if (!regexp.global && !regexp.sticky) {
            // exec() on such a regexp never advances lastIndex, so it would
            // return this same match forever.
            break;
        }

        if (match[0] === "") {
            // A zero-length match leaves lastIndex where it was.  Step past
            // it, otherwise exec() would keep matching at the same position.
            regexp.lastIndex += 1;
        }

        match = regexp.exec(text);
    }

    return matches;
}
/**
* Returns the contents of all <font> ... </font> elements within the given
* text.  The contents of the elements are stripped of all other HTML
* tags.
* @param {String} text - The HTML string containing the <font> elements.
* @return {Array} Array of strings of text inside <font> elements.
*/
function getFontBits(text) {
    // A fresh regexp literal is used on every call because matching is
    // driven by the regexp's lastIndex state.
    return getHtmlStrippedRegexMatches(/<font[^>]*>(.*?)<\/font>/g, text);
}
/**
* Returns the contents of all <td> ... </td> elements within the given
* text.  The contents of the elements are stripped of all other HTML
* tags and then trimmed of surrounding whitespace.
* @param {String} text - The HTML string containing the <td> elements.
* @return {Array} Array of strings of text inside <td> elements.
*/
function getTableDataBits(text) {
    // A fresh regexp literal is used on every call because matching is
    // driven by the regexp's lastIndex state.
    return getHtmlStrippedRegexMatches(/<td[^>]*>(.*?)<\/td>/g, text).map($.trim);
}
/**
* Returns the contents of all <td> ... </td> elements within the given
* text, as returned by getTableDataBits, leaving out any that are empty
* strings.
* @param {String} text - The HTML string containing the <td> elements.
* @return {Array} Array of non-empty strings of text inside <td> elements.
*/
function getNonEmptyTableDataBits(text) {
    var allBits = getTableDataBits(text);
    var nonEmptyBits = [];
    for (var bitIdx = 0; bitIdx < allBits.length; bitIdx += 1) {
        if (allBits[bitIdx] !== "") {
            nonEmptyBits.push(allBits[bitIdx]);
        }
    }
    return nonEmptyBits;
}
/**
* Returns the contents of all <th> ... </th> elements within the given
* text.  The contents of the elements are stripped of all other HTML
* tags.  Empty matches are removed; the remaining matches are not trimmed.
* @param {String} text - The HTML string containing the <th> elements.
* @return {Array} Array of strings of text inside <th> elements.
*/
function getNonEmptyTableHeaderBits(text) {
    // A fresh regexp literal is used on every call because matching is
    // driven by the regexp's lastIndex state.
    var matches = getHtmlStrippedRegexMatches(/<th[^>]*>(.*?)<\/th>/g, text);
    return matches.filter(function (bit) { return bit !== ""; });
}
/**
* Looks for a course distance within the given string and parses it.
* @param {String} text - The text string to read a course distance from.
* @return {?Number} - The course distance as parsed by parseCourseLength,
*     or null if the text contains nothing that looks like a distance.
*/
function tryReadDistance(text) {
    var found = DISTANCE_FIND_REGEXP.exec(text);
    return (found === null) ? null : parseCourseLength(found[1]);
}
/**
* Looks for a course climb within the given string and parses it as a
* base-10 integer.
* @param {String} text - The text string to read a course climb from.
* @return {?Number} - The parsed course climb, or null if the text contains
*     nothing that looks like a climb.
*/
function tryReadClimb(text) {
    var found = CLIMB_FIND_REGEXP.exec(text);
    return (found === null) ? null : parseInt(found[1], 10);
}
/**
* Extracts control codes from an array of header labels.
*
* A label of the form num(code) yields the text between the first opening
* parenthesis and the closing parenthesis that ends the label.  The only
* label allowed not to have that form is the last one, which is taken to be
* the finish and yields null.  Any other label is reported as invalid data.
*
* @param {Array} labels - Array of string labels.
* @return {Array} Array of control codes, with null indicating the finish.
*/
function readControlCodes(labels) {
    var codes = [];
    var finishIdx = labels.length - 1;
    for (var idx = 0; idx <= finishIdx; idx += 1) {
        var text = labels[idx];
        var openPos = text.indexOf("(");
        var isCodeLabel = (openPos !== -1 && text[text.length - 1] === ")");
        if (isCodeLabel) {
            codes.push(text.substring(openPos + 1, text.length - 1));
            continue;
        }

        if (idx === finishIdx) {
            // The finish has no code.
            codes.push(null);
        } else {
            throwInvalidData("Unrecognised control header label: '" + text + "'");
        }
    }
    return codes;
}
/**
* Drops trailing 'extra' controls from the given arrays of cumulative and
* split times.
*
* An 'extra' control is a control that a competitor punches without it
* being a control on their course.  Extra controls are indicated by the
* split 'time' beginning with an asterisk.
*
* Working backwards from the end, for as long as the last split time begins
* with an asterisk, the last element is removed from both arrays.  Nothing
* is returned; the arrays given are modified in place.
*
* @param {Array} cumTimes - Array of cumulative times.
* @param {Array} splitTimes - Array of split times.
*/
function removeExtraControls(cumTimes, splitTimes) {
    var lastIdx = splitTimes.length - 1;
    while (lastIdx >= 0 && splitTimes[lastIdx][0] === "*") {
        splitTimes.pop();
        cumTimes.pop();
        lastIdx -= 1;
    }
}
/**
* Holds the data read from lines of competitor data.  A record may hold
* intermediate data (see isContinuation) as well as complete data.
* @constructor
* @param {String} name - The name of the competitor.
* @param {String} club - The name of the competitor's club.
* @param {String} className - The class of the competitor.
* @param {?Number} totalTime - The total time taken by the competitor, or
*     null for no total time.
* @param {Array} cumTimes - Array of cumulative split times.
* @param {boolean} competitive - Whether the competitor's run is competitive.
*/
function CompetitorParseRecord(name, club, className, totalTime, cumTimes, competitive) {
    this.name = name;
    this.club = club;
    this.className = className;
    this.totalTime = totalTime;
    this.cumTimes = cumTimes;
    this.competitive = competitive;
}

/**
* Returns whether this competitor record is a 'continuation' record.
*
* A continuation record carries only cumulative times: its name, club and
* total time are empty strings, its class name is null and it is not
* competitive.
* @return {boolean} True if the record is a continuation record, false if not.
*/
CompetitorParseRecord.prototype.isContinuation = function () {
    if (this.name !== "" || this.club !== "") {
        return false;
    }
    if (this.className !== null || this.totalTime !== "") {
        return false;
    }
    return !this.competitive;
};

/**
* Adds the cumulative split times of another CompetitorParseRecord to the
* end of this record's cumulative times.
* @param {CompetitorParseRecord} other - The record whose cumulative times
*     we wish to append.  It must be a 'continuation' record, otherwise an
*     Error is thrown.
*/
CompetitorParseRecord.prototype.append = function (other) {
    if (!other.isContinuation()) {
        throw new Error("Can only append a continuation CompetitorParseRecord");
    }
    this.cumTimes = this.cumTimes.concat(other.cumTimes);
};

/**
* Builds a Competitor object from the data held in this record.
*
* The competitor is marked as non-competitive if they completed the course
* but this record is not competitive, and as a non-starter if they have no
* times at all.
* @param {Number} order - The number of this competitor within their class
*     (1=first, 2=second, ...).
* @return {Competitor} Converted competitor object.
*/
CompetitorParseRecord.prototype.toCompetitor = function (order) {
    // The cumulative times handed over begin with a zero for the start.
    var cumTimesFromStart = [0].concat(this.cumTimes);

    // No start time is known, hence the null.
    var result = fromOriginalCumTimes(order, this.name, this.club, null, cumTimesFromStart);

    var nonCompetitiveFinisher = result.completed() && !this.competitive;
    if (nonCompetitiveFinisher) {
        result.setNonCompetitive();
    }

    var hasTimes = result.hasAnyTimes();
    if (!hasTimes) {
        result.setNonStarter();
    }

    return result;
};
/*
* There are three types of HTML format supported by this parser: one that is
* based on pre-formatted text, one that is based around a single HTML table,
* and one that uses many HTML tables. The overall strategy when parsing
* any format is largely the same, but the exact details vary.
*
* A 'Recognizer' is used to handle the finer details of the format parsing.
* A recognizer should contain methods 'isTextOfThisFormat',
* 'preprocess', 'canIgnoreThisLine', 'isCourseHeaderLine',
* 'parseCourseHeaderLine', 'parseControlsLine' and 'parseCompetitor'.
* See the documentation on the objects below for more information about
* what these methods do.
*/
/**
* A Recognizer that handles the 'older' HTML format based on preformatted
* text.
* @constructor
*/
function OldHtmlFormatRecognizer() {
    // There exist variations of the format depending on what the second
    // <font> ... </font> element on each row contains.  It can be blank,
    // contain a number (start number, perhaps?) or something else.
    // If blank or containing a number, the competitor's name is in column
    // 2 and there are four preceding columns.  Otherwise the competitor's
    // name is in column 1 and there are three preceding columns.
    // This stays null until the first competitor is parsed.
    this.precedingColumnCount = null;
}

/**
* Returns whether this recognizer is likely to recognize the given HTML
* text and possibly be able to parse it.  If this method returns true, the
* parser will use this recognizer to attempt to parse the HTML.  If it
* returns false, the parser will not use this recognizer.  Other methods on
* this object can therefore assume that this method has returned true.
*
* As this recognizer is for recognizing preformatted text which also uses a
* lot of <font> elements, it simply checks for the presence of
* HTML <pre> and <font> elements.
*
* @param {String} text - The entire input text read in.
* @return {boolean} True if the text contains any pre-formatted HTML, false
*     otherwise
*/
OldHtmlFormatRecognizer.prototype.isTextOfThisFormat = function (text) {
    // "<font" has no closing bracket so that font tags with attributes
    // are also found.
    return (text.indexOf("<pre>") >= 0 && text.indexOf("<font") >= 0);
};

/**
* Performs some pre-processing on the text before it is read in.
*
* This object strips everything up to and including the line containing
* the opening <pre> tag, and everything from the line containing the last
* closing </pre> tag to the end of the text.
*
* @param {String} text - The HTML text to preprocess.
* @return {String} The preprocessed text, trimmed of surrounding whitespace.
*/
OldHtmlFormatRecognizer.prototype.preprocess = function (text) {
    var prePos = text.indexOf("<pre>");
    if (prePos === -1) {
        throw new Error("Cannot find opening pre tag");
    }

    // Drop the rest of the line that the opening tag is on.
    var lineEndPos = text.indexOf("\n", prePos);
    text = text.substring(lineEndPos + 1);

    var closePrePos = text.lastIndexOf("</pre>");
    if (closePrePos === -1) {
        throwInvalidData("Found opening <pre> but no closing </pre>");
    }

    // Drop the line that the closing tag is on, and everything after it.
    lineEndPos = text.lastIndexOf("\n", closePrePos);
    text = text.substring(0, lineEndPos);
    return $.trim(text);
};

/**
* Returns whether the HTML parser can ignore the given line altogether.
*
* The parser will call this method with every line read in, apart from
* the second line of each pair of competitor data rows.  These are always
* assumed to be in pairs.
*
* This recognizer ignores only blank lines.
*
* @param {String} line - The line to check.
* @return {boolean} True if the line should be ignored, false if not.
*/
OldHtmlFormatRecognizer.prototype.canIgnoreThisLine = function (line) {
    return line === "";
};

/**
* Returns whether the given line is the first line of a course.
*
* If so, it means the parser has finished processing the previous course
* (if any), and can start a new course.
*
* This recognizer treats a line with exactly two
* <font>...</font> elements as a course header line, and
* anything else not.
*
* @param {String} line - The line to check.
* @return {boolean} True if this is the first line of a course, false
*     otherwise.
*/
OldHtmlFormatRecognizer.prototype.isCourseHeaderLine = function (line) {
    return (getFontBits(line).length === 2);
};

/**
* Parse a course header line and return the course name, distance and
* climb.
*
* This method can assume that the line given is a course header line.
* The first <font> element holds the course name, which ends at the first
* opening parenthesis if there is one; the second holds the distance and
* climb.
*
* @param {String} line - The line to parse course details from.
* @return {Object} Object containing the parsed course details, with
*     properties 'name', 'distance' and 'climb'.  The distance and climb
*     are null if they could not be read.
*/
OldHtmlFormatRecognizer.prototype.parseCourseHeaderLine = function (line) {
    var bits = getFontBits(line);
    if (bits.length !== 2) {
        throw new Error("Course header line should have two parts");
    }

    var nameAndControls = bits[0];
    var distanceAndClimb = bits[1];

    var openParenPos = nameAndControls.indexOf("(");
    var courseName = (openParenPos > -1) ? nameAndControls.substring(0, openParenPos) : nameAndControls;

    return {
        name: $.trim(courseName),
        distance: tryReadDistance(distanceAndClimb),
        climb: tryReadClimb(distanceAndClimb)
    };
};

/**
* Parse control codes from the given line and return a list of them.
*
* This method can assume that the previous line was the course header or a
* previous control line.  It should also return null for the finish, which
* should have no code.  The finish is assumed to be the last.
*
* Only the text after the last closing </font> tag on the line is read, or
* the whole line if it has no such tag.
*
* @param {String} line - The line to parse control codes from.
* @return {Array} Array of control codes.
*/
OldHtmlFormatRecognizer.prototype.parseControlsLine = function (line) {
    var closeFontTag = "</font>";
    var lastFontPos = line.lastIndexOf(closeFontTag);
    var controlsText = (lastFontPos === -1) ? line : line.substring(lastFontPos + closeFontTag.length);

    var controlLabels = splitByWhitespace($.trim(controlsText));
    return readControlCodes(controlLabels);
};

/**
* Read either cumulative or split times from the given line of competitor
* data.  The times are the whitespace-separated items that follow the
* preceding columns, each of which ends with a closing </font> tag.
* (This method is not used by the parser, only elsewhere in the recognizer.)
* @param {String} line - The line to read the times from.
* @return {Array} Array of times, as unparsed strings.
*/
OldHtmlFormatRecognizer.prototype.readCompetitorSplitDataLine = function (line) {
    var closeFontTag = "</font>";

    // Skip over each of the preceding columns in turn.
    for (var i = 0; i < this.precedingColumnCount; i += 1) {
        var closeFontPos = line.indexOf(closeFontTag);
        line = line.substring(closeFontPos + closeFontTag.length);
    }

    return splitByWhitespace(stripHtml(line));
};

/**
* Parse two lines of competitor data into a CompetitorParseRecord object
* containing the data.
*
* The first call to this method also determines, from the first line given,
* how many columns precede the times, and remembers that number for all
* subsequent lines.
*
* @param {String} firstLine - The first line of competitor data.
* @param {String} secondLine - The second line of competitor data.
* @return {CompetitorParseRecord} The parsed competitor.
*/
OldHtmlFormatRecognizer.prototype.parseCompetitor = function (firstLine, secondLine) {
    var closeFontTag = "</font>";
    var firstLineBits = getFontBits(firstLine);
    var secondLineBits = getFontBits(secondLine);

    if (this.precedingColumnCount === null) {
        // If column 1 is blank or a number, we have four preceding
        // columns.  Otherwise we have three.
        var column1 = $.trim(firstLineBits[1]);
        this.precedingColumnCount = (/^\d*$/.test(column1)) ? 4 : 3;
    }

    // Column 0 holds a number only for competitive runs.
    var competitive = hasNumber(firstLineBits[0]);
    var name = $.trim(firstLineBits[this.precedingColumnCount - 2]);
    var totalTime = $.trim(firstLineBits[this.precedingColumnCount - 1]);
    var club = $.trim(secondLineBits[this.precedingColumnCount - 2]);

    var cumulativeTimes = this.readCompetitorSplitDataLine(firstLine);
    var splitTimes = this.readCompetitorSplitDataLine(secondLine);
    cumulativeTimes = cumulativeTimes.map(parseTime);

    removeExtraControls(cumulativeTimes, splitTimes);

    var className = null;
    if (name !== null && name !== "") {
        // Find the end of the last of the preceding columns.
        var lastCloseFontPos = -1;
        for (var i = 0; i < this.precedingColumnCount; i += 1) {
            lastCloseFontPos = firstLine.indexOf(closeFontTag, lastCloseFontPos + 1);
        }

        // Any text among the preceding columns that is not inside a <font>
        // element begins with the class name.
        var firstLineUpToLastPreceding = firstLine.substring(0, lastCloseFontPos + closeFontTag.length);
        var firstLineMinusFonts = firstLineUpToLastPreceding.replace(/<font[^>]*>(.*?)<\/font>/g, "");
        var lineParts = splitByWhitespace(firstLineMinusFonts);
        if (lineParts.length > 0) {
            className = lineParts[0];
        }
    }

    return new CompetitorParseRecord(name, club, className, totalTime, cumulativeTimes, competitive);
};
/**
* Constructs a recognizer for the 'newer' format of SI HTML event results
* data.
*
* Data in this format is given within a number of HTML tables, three per
* course.
*
* A new recognizer starts with its currentCourseHasClass flag set to false.
* @constructor
*/
function NewHtmlFormatRecognizer() {
    this.currentCourseHasClass = false;
}
/**
* Returns whether this recognizer is likely to recognize the given HTML
* text and possibly be able to parse it. If this method returns true, the
* parser will use this recognizer to attempt to parse the HTML. If it
* returns false, the parser will not use this recognizer. Other methods on
* this object can therefore assume that this method has returned true.
*
* As this recognizer is for recognizing HTML formatted in tables, it
* returns whether the number of HTML <table> tags is at least five.
* Each course uses three tables, and there are two HTML tables before the
* courses.
*
* @param {String} text - The entire input text read in.
* @return {boolean} True if the text contains at least five HTML table
* tags.
*/
NewHtmlFormatRecognizer.prototype.isTextOfThisFormat = function (text) {
var tablePos = -1;
for (var i = 0; i < 5; i += 1) {
tablePos = text.indexOf(" it is contained in.
var tableEndPos = text.indexOf(" ");
if (tableEndPos === -1) {
throwInvalidData("Could not find any closing tags");
}
text = text.substring(tableEndPos + "".length);
var closeDivPos = text.indexOf("");
var openTablePos = text.indexOf(" -1 && closeDivPos < openTablePos) {
text = text.substring(closeDivPos + "".length);
}
// Rejig the line endings so that each row of competitor data is on its
// own line, with table and table-row tags starting on new lines,
// and closing table and table-row tags at the end of lines.
text = text.replace(/>\n<").replace(/>/g, ">\n ").replace(/<\/tr>\n<")
.replace(/>\n\n<");
// Remove all elements.
text = text.replace(/<\/col[^>]*>/g, "");
// Remove all rows that contain only a single non-breaking space.
// In the file I have, the entities are missing their
// semicolons. However, this could well be fixed in the future.
text = text.replace(/]*>]*>(?:)? ?(?:<\/nobr>)?<\/td><\/tr>/g, "");
// Finally, remove the trailing | |