4 * Copyright, Moxiecode Systems AB
5 * Released under LGPL License.
7 * License: http://www.tinymce.com/license
8 * Contributing: http://www.tinymce.com/contributing
12 * This class parses HTML code using pure JavaScript and executes various events for each item it finds. It will
13 * always execute the events in the right order for tag soup code like <b><p></b></p>. It will also remove elements
14 * and attributes that don't fit the schema if the validate setting is enabled.
17 * var parser = new tinymce.html.SaxParser({
20 * comment: function(text) {
21 * console.log('Comment:', text);
24 * cdata: function(text) {
25 * console.log('CDATA:', text);
28 * text: function(text, raw) {
29 * console.log('Text:', text, 'Raw:', raw);
32 * start: function(name, attrs, empty) {
33 * console.log('Start:', name, attrs, empty);
36 * end: function(name) {
37 * console.log('End:', name);
40 * pi: function(name, text) {
41 * console.log('PI:', name, text);
44 * doctype: function(text) {
45 * console.log('DocType:', text);
48 * @class tinymce.html.SaxParser
51 define("tinymce/html/SaxParser", [
52 "tinymce/html/Schema",
53 "tinymce/html/Entities",
55 ], function(Schema, Entities, Tools) {
56 var each = Tools.each;
59 * Constructs a new SaxParser instance.
63 * @param {Object} settings Name/value collection of settings. comment, cdata, text, start, end, pi and doctype are callbacks.
64 * @param {tinymce.html.Schema} schema HTML Schema class to use when parsing.
66 return function(settings, schema) {
67 var self = this, noop = function() {};
69 settings = settings || {};
70 self.schema = schema = schema || new Schema();
72 if (settings.fix_self_closing !== false) {
73 settings.fix_self_closing = true;
76 // Add handler functions from settings and setup default handlers
77 each('comment cdata text start end pi doctype'.split(' '), function(name) {
79 self[name] = settings[name] || noop;
84 * Parses the specified HTML string and executes the callbacks for each item it finds.
87 * new SaxParser({...}).parse('<b>text</b>');
89 * @param {String} html Html string to sax parse.
91 self.parse = function(html) {
92 var self = this, matches, index = 0, value, endRegExp, stack = [], attrList, i, text, name;
93 var isInternalElement, removeInternalElements, shortEndedElements, fillAttrsMap, isShortEnded;
94 var validate, elementRule, isValidElement, attr, attribsValue, validAttributesMap, validAttributePatterns;
95 var attributesRequired, attributesDefault, attributesForced;
96 var anyAttributesRequired, selfClosing, tokenRegExp, attrRegExp, specialElements, attrValue, idCount = 0;
97 var decode = Entities.decode, fixSelfClosing;
99 function processEndTag(name) {
102 // Find position of parent of the same type
105 if (stack[pos].name === name) {
112 // Close all the open elements
113 for (i = stack.length - 1; i >= pos; i--) {
121 // Remove the open elements from the stack
126 function parseAttribute(match, name, value, val2, val3) {
129 name = name.toLowerCase();
130 value = name in fillAttrsMap ? name : decode(value || val2 || val3 || ''); // Handle boolean attribute than value attribute
132 // Validate name and value; pass through all data- attributes
133 if (validate && !isInternalElement && name.indexOf('data-') !== 0) {
134 attrRule = validAttributesMap[name];
136 // Find rule by pattern matching
137 if (!attrRule && validAttributePatterns) {
138 i = validAttributePatterns.length;
140 attrRule = validAttributePatterns[i];
141 if (attrRule.pattern.test(name)) {
152 // No attribute rule found
158 if (attrRule.validValues && !(value in attrRule.validValues)) {
163 // Add attribute to list and map
164 attrList.map[name] = value;
171 // Precompile RegExps and map objects
172 tokenRegExp = new RegExp('<(?:' +
173 '(?:!--([\\w\\W]*?)-->)|' + // Comment
174 '(?:!\\[CDATA\\[([\\w\\W]*?)\\]\\]>)|' + // CDATA
175 '(?:!DOCTYPE([\\w\\W]*?)>)|' + // DOCTYPE
176 '(?:\\?([^\\s\\/<>]+) ?([\\w\\W]*?)[?/]>)|' + // PI
177 '(?:\\/([^>]+)>)|' + // End element
178 '(?:([A-Za-z0-9\\-\\:\\.]+)((?:\\s+[^"\'>]+(?:(?:"[^"]*")|(?:\'[^\']*\')|[^>]*))*|\\/|\\s+)>)' + // Start element
181 attrRegExp = /([\w:\-]+)(?:\s*=\s*(?:(?:\"((?:[^\"])*)\")|(?:\'((?:[^\'])*)\')|([^>\s]+)))?/g;
183 // Setup lookup tables for empty elements and boolean attributes
184 shortEndedElements = schema.getShortEndedElements();
185 selfClosing = settings.self_closing_elements || schema.getSelfClosingElements();
186 fillAttrsMap = schema.getBoolAttrs();
187 validate = settings.validate;
188 removeInternalElements = settings.remove_internals;
189 fixSelfClosing = settings.fix_self_closing;
190 specialElements = schema.getSpecialElements();
192 while ((matches = tokenRegExp.exec(html))) {
194 if (index < matches.index) {
195 self.text(decode(html.substr(index, matches.index - index)));
198 if ((value = matches[6])) { // End element
199 value = value.toLowerCase();
201 // IE will add a ":" in front of elements it doesn't understand like custom elements or HTML5 elements
202 if (value.charAt(0) === ':') {
203 value = value.substr(1);
206 processEndTag(value);
207 } else if ((value = matches[7])) { // Start element
208 value = value.toLowerCase();
210 // IE will add a ":" in front of elements it doesn't understand like custom elements or HTML5 elements
211 if (value.charAt(0) === ':') {
212 value = value.substr(1);
215 isShortEnded = value in shortEndedElements;
217 // Is self closing tag for example an <li> after an open <li>
218 if (fixSelfClosing && selfClosing[value] && stack.length > 0 && stack[stack.length - 1].name === value) {
219 processEndTag(value);
223 if (!validate || (elementRule = schema.getElementRule(value))) {
224 isValidElement = true;
226 // Grab attributes map and patterns when validation is enabled
228 validAttributesMap = elementRule.attributes;
229 validAttributePatterns = elementRule.attributePatterns;
233 if ((attribsValue = matches[8])) {
234 isInternalElement = attribsValue.indexOf('data-mce-type') !== -1; // Check if the element is an internal element
236 // If the element has internal attributes then remove it if we are told to do so
237 if (isInternalElement && removeInternalElements) {
238 isValidElement = false;
244 attribsValue.replace(attrRegExp, parseAttribute);
250 // Process attributes if validation is enabled
251 if (validate && !isInternalElement) {
252 attributesRequired = elementRule.attributesRequired;
253 attributesDefault = elementRule.attributesDefault;
254 attributesForced = elementRule.attributesForced;
255 anyAttributesRequired = elementRule.removeEmptyAttrs;
257 // Check if any attribute exists
258 if (anyAttributesRequired && !attrList.length) {
259 isValidElement = false;
262 // Handle forced attributes
263 if (attributesForced) {
264 i = attributesForced.length;
266 attr = attributesForced[i];
268 attrValue = attr.value;
270 if (attrValue === '{$uid}') {
271 attrValue = 'mce_' + idCount++;
274 attrList.map[name] = attrValue;
275 attrList.push({name: name, value: attrValue});
279 // Handle default attributes
280 if (attributesDefault) {
281 i = attributesDefault.length;
283 attr = attributesDefault[i];
286 if (!(name in attrList.map)) {
287 attrValue = attr.value;
289 if (attrValue === '{$uid}') {
290 attrValue = 'mce_' + idCount++;
293 attrList.map[name] = attrValue;
294 attrList.push({name: name, value: attrValue});
299 // Handle required attributes
300 if (attributesRequired) {
301 i = attributesRequired.length;
303 if (attributesRequired[i] in attrList.map) {
308 // None of the required attributes were found
310 isValidElement = false;
314 // Invalidate element if it's marked as bogus
315 if (attrList.map['data-mce-bogus']) {
316 isValidElement = false;
320 if (isValidElement) {
321 self.start(value, attrList, isShortEnded);
324 isValidElement = false;
327 // Treat script, noscript and style a bit differently since they may include code that looks like elements
328 if ((endRegExp = specialElements[value])) {
329 endRegExp.lastIndex = index = matches.index + matches[0].length;
331 if ((matches = endRegExp.exec(html))) {
332 if (isValidElement) {
333 text = html.substr(index, matches.index - index);
336 index = matches.index + matches[0].length;
338 text = html.substr(index);
342 if (isValidElement) {
343 if (text.length > 0) {
344 self.text(text, true);
350 tokenRegExp.lastIndex = index;
354 // Push value on to stack
356 if (!attribsValue || attribsValue.indexOf('/') != attribsValue.length - 1) {
357 stack.push({name: value, valid: isValidElement});
358 } else if (isValidElement) {
362 } else if ((value = matches[1])) { // Comment
364 } else if ((value = matches[2])) { // CDATA
366 } else if ((value = matches[3])) { // DOCTYPE
368 } else if ((value = matches[4])) { // PI
369 self.pi(value, matches[5]);
372 index = matches.index + matches[0].length;
376 if (index < html.length) {
377 self.text(decode(html.substr(index)));
380 // Close any open elements
381 for (i = stack.length - 1; i >= 0; i--) {
385 self.end(value.name);