4 * Copyright, Moxiecode Systems AB
5 * Released under LGPL License.
7 * License: http://www.tinymce.com/license
8 * Contributing: http://www.tinymce.com/contributing
12 * This class parses HTML code into a DOM-like structure of nodes. It removes redundant whitespace and makes
13 * sure that the node tree is valid according to the specified schema.
14 * So for example: <p>a<p>b</p>c</p> will become <p>a</p><p>b</p><p>c</p>
17 * var parser = new tinymce.html.DomParser({validate: true}, schema);
18 * var rootNode = parser.parse('<h1>content</h1>');
20 * @class tinymce.html.DomParser
23 define("tinymce/html/DomParser", [
25 "tinymce/html/Schema",
26 "tinymce/html/SaxParser",
28 ], function(Node, Schema, SaxParser, Tools) {
29 var makeMap = Tools.makeMap, each = Tools.each, explode = Tools.explode, extend = Tools.extend;
32 * Constructs a new DomParser instance.
36 * @param {Object} settings Name/value collection of settings. comment, cdata, text, start and end are callbacks.
37 * @param {tinymce.html.Schema} schema HTML Schema class to use when parsing.
// Constructor: sets up the per-instance filter registries and normalizes the settings and schema.
39 return function(settings, schema) {
// nodeFilters/attributeFilters hold the registered callbacks; matchedNodes/matchedAttributes
// collect the nodes that those callbacks will later receive.
40 var self = this, nodeFilters = {}, attributeFilters = [], matchedNodes = {}, matchedAttributes = {};
// Fall back to an empty settings object when none is supplied
42 settings = settings || {};
// Validation stays enabled unless the caller provides an explicit "validate" key
43 settings.validate = "validate" in settings ? settings.validate : true;
// The root node is named "body" unless another root_name is configured
44 settings.root_name = settings.root_name || 'body';
// Expose the schema on the instance, creating a default Schema when none is passed in
45 self.schema = schema = schema || new Schema();
47 function fixInvalidChildren(nodes) {
48 var ni, node, parent, parents, newParent, currentNode, tempNode, childNode, i;
49 var nonEmptyElements, nonSplitableElements, textBlockElements, sibling, nextNode;
51 nonSplitableElements = makeMap('tr,td,th,tbody,thead,tfoot,table');
52 nonEmptyElements = schema.getNonEmptyElements();
53 textBlockElements = schema.getTextBlockElements();
55 for (ni = 0; ni < nodes.length; ni++) {
58 // Already removed or fixed
59 if (!node.parent || node.fixed) {
63 // If the invalid element is a text block and the text block is within a parent LI element
64 // Then unwrap the first text block and convert other sibling text blocks to LI elements similar to Word/Open Office
65 if (textBlockElements[node.name] && node.parent.name == 'li') {
66 // Move sibling text blocks after LI element
69 if (textBlockElements[sibling.name]) {
72 node.parent.insert(sibling, node.parent);
77 sibling = sibling.next;
80 // Unwrap current text block
85 // Get list of all parent nodes until we find a valid parent to stick the child into
87 for (parent = node.parent; parent && !schema.isValidChild(parent.name, node.name) &&
88 !nonSplitableElements[parent.name]; parent = parent.parent) {
92 // Found a suitable parent
93 if (parent && parents.length > 1) {
94 // Reverse the array since it makes looping easier
97 // Clone the related parent and insert that after the moved node
98 newParent = currentNode = self.filterNode(parents[0].clone());
100 // Start cloning and moving children on the left side of the target node
101 for (i = 0; i < parents.length - 1; i++) {
102 if (schema.isValidChild(currentNode.name, parents[i].name)) {
103 tempNode = self.filterNode(parents[i].clone());
104 currentNode.append(tempNode);
106 tempNode = currentNode;
109 for (childNode = parents[i].firstChild; childNode && childNode != parents[i + 1]; ) {
110 nextNode = childNode.next;
111 tempNode.append(childNode);
112 childNode = nextNode;
115 currentNode = tempNode;
118 if (!newParent.isEmpty(nonEmptyElements)) {
119 parent.insert(newParent, parents[0], true);
120 parent.insert(node, newParent);
122 parent.insert(node, parents[0], true);
125 // Check if the element is empty by looking through it's contents and special treatment for <p><br /></p>
127 if (parent.isEmpty(nonEmptyElements) || parent.firstChild === parent.lastChild && parent.firstChild.name === 'br') {
128 parent.empty().remove();
130 } else if (node.parent) {
131 // If it's an LI try to find a UL/OL for it or wrap it
132 if (node.name === 'li') {
134 if (sibling && (sibling.name === 'ul' || sibling.name === 'ul')) {
135 sibling.append(node);
140 if (sibling && (sibling.name === 'ul' || sibling.name === 'ul')) {
141 sibling.insert(node, sibling.firstChild, true);
145 node.wrap(self.filterNode(new Node('ul', 1)));
149 // Try wrapping the element in a DIV
150 if (schema.isValidChild(node.parent.name, 'div') && schema.isValidChild('div', node.name)) {
151 node.wrap(self.filterNode(new Node('div', 1)));
153 // We failed wrapping it, then remove or unwrap it
154 if (node.name === 'style' || node.name === 'script') {
155 node.empty().remove();
165 * Runs the specified node through the element and attributes filters.
168 * @param {tinymce.html.Node} node The node to run filters on.
169 * @return {tinymce.html.Node} The passed in node.
// Registers an already created node with the filter bookkeeping. The visible code only records
// the node in matchedNodes/matchedAttributes; the callbacks themselves are invoked from parse().
171 self.filterNode = function(node) {
174 // Run element filters
// Only names that have a registered node filter are tracked
175 if (name in nodeFilters) {
176 list = matchedNodes[name];
// NOTE(review): presumably the branch taken when no list exists yet for this name - confirm
181 matchedNodes[name] = [node];
185 // Run attribute filters
// i starts at the number of registered attribute filters
186 i = attributeFilters.length;
188 name = attributeFilters[i].name;
// Track the node when it carries the attribute this filter is registered for
190 if (name in node.attributes.map) {
191 list = matchedAttributes[name];
196 matchedAttributes[name] = [node];
205 * Adds a node filter function to the parser, the parser will collect the specified nodes by name
206 * and then execute the callback once it has finished parsing the document.
209 * parser.addNodeFilter('p,h1', function(nodes, name) {
210 * for (var i = 0; i < nodes.length; i++) {
211 * console.log(nodes[i].name);
214 * @method addNodeFilter
215 * @param {String} name Comma separated list of nodes to collect.
216 * @param {function} callback Callback function to execute once it has collected nodes.
// Registers `callback` under every node name contained in the `name` list.
218 self.addNodeFilter = function(name, callback) {
// The inner `name` shadows the outer list and holds one entry produced by explode()
219 each(explode(name), function(name) {
220 var list = nodeFilters[name];
// Creates the callback list for this name
// NOTE(review): presumably guarded by a "no list yet" check that is not visible here - confirm
223 nodeFilters[name] = list = [];
231 * Adds an attribute filter function to the parser, the parser will collect nodes that have the specified attributes
232 * and then execute the callback once it has finished parsing the document.
235 * parser.addAttributeFilter('src,href', function(nodes, name) {
236 * for (var i = 0; i < nodes.length; i++) {
237 * console.log(nodes[i].name);
240 * @method addAttributeFilter
241 * @param {String} name Comma separated list of attributes to collect.
242 * @param {function} callback Callback function to execute once it has collected nodes.
// Registers `callback` under every attribute name contained in the `name` list.
244 self.addAttributeFilter = function(name, callback) {
245 each(explode(name), function(name) {
// Look for an existing entry for this attribute name and reuse its callback list
248 for (i = 0; i < attributeFilters.length; i++) {
249 if (attributeFilters[i].name === name) {
250 attributeFilters[i].callbacks.push(callback);
// Adds a fresh {name, callbacks} entry
// NOTE(review): assumes the loop above exits early on a match, otherwise the name is registered twice - confirm
255 attributeFilters.push({name: name, callbacks: [callback]});
260 * Parses the specified HTML string into a DOM like node tree and returns the result.
263 * var rootNode = new DomParser({...}).parse('<b>text</b>');
265 * @param {String} html Html string to sax parse.
266 * @param {Object} args Optional args object that gets passed to all filter functions.
267 * @return {tinymce.html.Node} Root node containing the tree.
// Entry point: parses `html` into a node tree; `args` supplies per-call options and is
// forwarded to the filter callbacks.
269 self.parse = function(html, args) {
270 var parser, rootNode, node, nodes, i, l, fi, fl, list, name, validate;
271 var blockElements, startWhiteSpaceRegExp, invalidChildren = [], isInWhiteSpacePreservedElement;
272 var endWhiteSpaceRegExp, allWhiteSpaceRegExp, isAllWhiteSpaceRegExp, whiteSpaceElements;
273 var children, nonEmptyElements, rootBlockName;
// Start this parse with an empty attribute match table
277 matchedAttributes = {};
// These document-level elements are treated as blocks in addition to the schema's block elements
278 blockElements = extend(makeMap('script,style,head,html,body,title,meta,param'), schema.getBlockElements());
279 nonEmptyElements = schema.getNonEmptyElements();
280 children = schema.children;
281 validate = settings.validate;
// A forced_root_block key in args takes precedence over the parser setting
// NOTE(review): the `in` operator requires args to be an object - confirm it is defaulted before this line
282 rootBlockName = "forced_root_block" in args ? args.forced_root_block : settings.forced_root_block;
284 whiteSpaceElements = schema.getWhiteSpaceElements();
// Whitespace in these patterns means space, tab, CR and LF only
285 startWhiteSpaceRegExp = /^[ \t\r\n]+/;
286 endWhiteSpaceRegExp = /[ \t\r\n]+$/;
287 allWhiteSpaceRegExp = /[ \t\r\n]+/g;
288 isAllWhiteSpaceRegExp = /^[ \t\r\n]+$/;
// Wraps text nodes and non-block elements found directly under the root node in
// elements named rootBlockName.
290 function addRootBlocks() {
291 var node = rootNode.firstChild, next, rootBlockNode;
293 // Removes whitespace at beginning and end of block so:
294 // <p> x </p> -> <p>x</p>
295 function trim(rootBlockNode) {
297 node = rootBlockNode.firstChild;
// Only a leading text node (type 3) is trimmed
298 if (node && node.type == 3) {
299 node.value = node.value.replace(startWhiteSpaceRegExp, '');
302 node = rootBlockNode.lastChild;
// Only a trailing text node (type 3) is trimmed
303 if (node && node.type == 3) {
304 node.value = node.value.replace(endWhiteSpaceRegExp, '');
309 // Check if rootBlock is valid within rootNode for example if P is valid in H1 if H1 is the contentEditable root
310 if (!schema.isValidChild(rootNode.name, rootBlockName.toLowerCase())) {
// Candidates for wrapping: text nodes, and elements other than P that are neither
// block elements nor carry a data-mce-type attribute
317 if (node.type == 3 || (node.type == 1 && node.name !== 'p' &&
318 !blockElements[node.name] && !node.attr('data-mce-type'))) {
319 if (!rootBlockNode) {
320 // Create a new root block element
321 rootBlockNode = createNode(rootBlockName, 1);
322 rootNode.insert(rootBlockNode, node);
323 rootBlockNode.append(node);
// A root block is already open: keep appending to it
325 rootBlockNode.append(node);
// Clearing the reference makes the next `!rootBlockNode` check create a new root block
329 rootBlockNode = null;
// Creates a Node of the given name and type and, when a node filter is registered for
// that name, records the new node in matchedNodes.
338 function createNode(name, type) {
339 var node = new Node(name, type), list;
// Only names with a registered node filter are tracked
341 if (name in nodeFilters) {
342 list = matchedNodes[name];
// NOTE(review): presumably the branch taken when no list exists yet for this name - confirm
347 matchedNodes[name] = [node];
// Strips trailing whitespace from the text nodes that directly precede `node`.
354 function removeWhitespaceBefore(node) {
355 var textNode, textVal, sibling;
// Walk backwards over the previous siblings for as long as they are text nodes (type 3)
357 for (textNode = node.prev; textNode && textNode.type === 3; ) {
358 textVal = textNode.value.replace(endWhiteSpaceRegExp, '');
// Something is left after trimming: store the trimmed value and move on
360 if (textVal.length > 0) {
361 textNode.value = textVal;
362 textNode = textNode.prev;
// Remember the previous sibling before dealing with the whitespace-only node
// NOTE(review): the handling of that node itself is not visible in this excerpt
364 sibling = textNode.prev;
371 function cloneAndExcludeBlocks(input) {
372 var name, output = {};
374 for (name in input) {
375 if (name !== 'li' && name != 'p') {
376 output[name] = input[name];
// SAX parser whose callbacks build the tree; each callback appends to `node`, the current container.
383 parser = new SaxParser({
386 // Exclude P and LI from DOM parsing since it's treated better by the DOM parser
387 self_closing_elements: cloneAndExcludeBlocks(schema.getSelfClosingElements()),
// CDATA sections become type 4 nodes
389 cdata: function(text) {
390 node.append(createNode('#cdata', 4)).value = text;
// Text becomes type 3 nodes, with whitespace normalized outside whitespace preserving elements
393 text: function(text, raw) {
396 // Trim all redundant whitespace on non white space elements
397 if (!isInWhiteSpacePreservedElement) {
// Collapse every whitespace run into a single space
398 text = text.replace(allWhiteSpaceRegExp, ' ');
// Drop leading whitespace when the text directly follows a block element
400 if (node.lastChild && blockElements[node.lastChild.name]) {
401 text = text.replace(startWhiteSpaceRegExp, '');
405 // Do we need to create the node
406 if (text.length !== 0) {
407 textNode = createNode('#text', 3);
// Store the raw flag as a strict boolean
408 textNode.raw = !!raw;
409 node.append(textNode).value = text;
// Comments become type 8 nodes
413 comment: function(text) {
414 node.append(createNode('#comment', 8)).value = text;
// Processing instructions become type 7 nodes named after the instruction
417 pi: function(name, text) {
418 node.append(createNode(name, 7)).value = text;
419 removeWhitespaceBefore(node);
// Doctype declarations become type 10 nodes
422 doctype: function(text) {
425 newNode = node.append(createNode('#doctype', 10));
426 newNode.value = text;
427 removeWhitespaceBefore(node);
// Start tag callback: creates the element node, appends it to the current node, and records
// schema violations and attribute filter matches.
430 start: function(name, attrs, empty) {
431 var newNode, attrFiltersLen, elementRule, attrName, parent;
// With validation disabled an empty rule object is used, so no outputName is applied
433 elementRule = validate ? schema.getElementRule(name) : {};
// The schema rule may rename the element through outputName
435 newNode = createNode(elementRule.outputName || name, 1);
436 newNode.attributes = attrs;
437 newNode.shortEnded = empty;
439 node.append(newNode);
441 // Check if the node is a valid child of the parent node; if the child is
442 // unknown we don't collect it since it's probably a custom element
443 parent = children[node.name];
444 if (parent && children[newNode.name] && !parent[newNode.name]) {
445 invalidChildren.push(newNode);
// Record the new node for every registered attribute filter whose attribute it carries
448 attrFiltersLen = attributeFilters.length;
449 while (attrFiltersLen--) {
450 attrName = attributeFilters[attrFiltersLen].name;
452 if (attrName in attrs.map) {
453 list = matchedAttributes[attrName];
458 matchedAttributes[attrName] = [newNode];
463 // Trim whitespace before block
464 if (blockElements[name]) {
465 removeWhitespaceBefore(newNode);
468 // Change current node if the element wasn't empty i.e not <br /> or <img />
473 // Check if we are inside a whitespace preserved element
474 if (!isInWhiteSpacePreservedElement && whiteSpaceElements[name]) {
475 isInWhiteSpacePreservedElement = true;
480 end: function(name) {
481 var textNode, elementRule, text, sibling, tempNode;
483 elementRule = validate ? schema.getElementRule(name) : {};
485 if (blockElements[name]) {
486 if (!isInWhiteSpacePreservedElement) {
487 // Trim whitespace of the first node in a block
488 textNode = node.firstChild;
489 if (textNode && textNode.type === 3) {
490 text = textNode.value.replace(startWhiteSpaceRegExp, '');
492 // Any characters left after trim or should we remove it
493 if (text.length > 0) {
494 textNode.value = text;
495 textNode = textNode.next;
497 sibling = textNode.next;
501 // Remove any pure whitespace siblings
502 while (textNode && textNode.type === 3) {
503 text = textNode.value;
504 sibling = textNode.next;
506 if (text.length === 0 || isAllWhiteSpaceRegExp.test(text)) {
516 // Trim whitespace of the last node in a block
517 textNode = node.lastChild;
518 if (textNode && textNode.type === 3) {
519 text = textNode.value.replace(endWhiteSpaceRegExp, '');
521 // Any characters left after trim or should we remove it
522 if (text.length > 0) {
523 textNode.value = text;
524 textNode = textNode.prev;
526 sibling = textNode.prev;
530 // Remove any pure whitespace siblings
531 while (textNode && textNode.type === 3) {
532 text = textNode.value;
533 sibling = textNode.prev;
535 if (text.length === 0 || isAllWhiteSpaceRegExp.test(text)) {
546 // Trim start white space
547 // Removed due to: #5424
548 /*textNode = node.prev;
549 if (textNode && textNode.type === 3) {
550 text = textNode.value.replace(startWhiteSpaceRegExp, '');
553 textNode.value = text;
559 // Check if we exited a whitespace preserved element
560 if (isInWhiteSpacePreservedElement && whiteSpaceElements[name]) {
561 isInWhiteSpacePreservedElement = false;
564 // Handle empty nodes
565 if (elementRule.removeEmpty || elementRule.paddEmpty) {
566 if (node.isEmpty(nonEmptyElements)) {
567 if (elementRule.paddEmpty) {
568 node.empty().append(new Node('#text', '3')).value = '\u00a0';
570 // Leave nodes that have a name like <a name="name">
571 if (!node.attributes.map.name && !node.attributes.map.id) {
572 tempNode = node.parent;
573 node.empty().remove();
// The root is a type 11 node named after args.context or, failing that, the configured root_name
586 rootNode = node = new Node(args.context || settings.root_name, 11);
590 // Fix invalid children or report invalid children in a contextual parsing
591 if (validate && invalidChildren.length) {
593 fixInvalidChildren(invalidChildren);
599 // Wrap nodes in the root into block elements if the root is body
600 if (rootBlockName && (rootNode.name == 'body' || args.isRootContent)) {
604 // Run filters only when the contents is valid
// Node filters: every name that collected matches gets its registered callbacks invoked
607 for (name in matchedNodes) {
608 list = nodeFilters[name];
609 nodes = matchedNodes[name];
611 // Remove already removed children
// A node without a parent is no longer attached to the tree
614 if (!nodes[fi].parent) {
// Each callback receives the collected nodes, the node name and the parse args
619 for (i = 0, l = list.length; i < l; i++) {
620 list[i](nodes, name, args);
624 // Run attribute filters
625 for (i = 0, l = attributeFilters.length; i < l; i++) {
626 list = attributeFilters[i];
// Skip attribute filters that collected no nodes during this parse
628 if (list.name in matchedAttributes) {
629 nodes = matchedAttributes[list.name];
631 // Remove already removed children
634 if (!nodes[fi].parent) {
// Each callback receives the collected nodes, the attribute name and the parse args
639 for (fi = 0, fl = list.callbacks.length; fi < fl; fi++) {
640 list.callbacks[fi](nodes, list.name, args);
649 // Remove <br> at end of block elements Gecko and WebKit injects BR elements to
650 // make it possible to place the caret inside empty blocks. This logic tries to remove
651 // these elements and keep br elements that were intended to be there intact
652 if (settings.remove_trailing_brs) {
653 self.addNodeFilter('br', function(nodes) {
// blockElements is built with extend({}, ...) before `body` is added to it below
654 var i, l = nodes.length, node, blockElements = extend({}, schema.getBlockElements());
655 var nonEmptyElements = schema.getNonEmptyElements(), parent, lastParent, prev, prevName;
656 var elementRule, textNode;
658 // Remove brs from body element as well
659 blockElements.body = 1;
661 // Must loop forwards since it will otherwise remove all brs in <p>a<br><br><br></p>
662 for (i = 0; i < l; i++) {
664 parent = node.parent;
// Only a BR that is the last child of a block element is a removal candidate
666 if (blockElements[node.parent.name] && node === parent.lastChild) {
667 // Loop all nodes to the left of the current node and check for other BR elements
668 // excluding bookmarks since they are invisible
671 prevName = prev.name;
// Anything except a bookmark span takes part in the check
674 if (prevName !== "span" || prev.attr('data-mce-type') !== 'bookmark') {
675 // Found a non BR element
676 if (prevName !== "br") {
680 // Found another br it's a <br><br> structure then don't remove anything
681 if (prevName === 'br') {
693 // Is the parent to be considered empty after we removed the BR
694 if (parent.isEmpty(nonEmptyElements)) {
695 elementRule = schema.getElementRule(parent.name);
697 // Remove or pad the element depending on schema rule
699 if (elementRule.removeEmpty) {
701 } else if (elementRule.paddEmpty) {
// Padding replaces the content with a single non-breaking space text node
702 parent.empty().append(new Node('#text', 3)).value = '\u00a0';
708 // Replaces BR elements inside inline elements like <p><b><i><br></i></b></p>
709 // so they become <p><b><i> </i></b></p>
// Climb while each ancestor contains nothing but the node climbed from
711 while (parent && parent.firstChild === lastParent && parent.lastChild === lastParent) {
714 if (blockElements[parent.name]) {
718 parent = parent.parent;
721 if (lastParent === parent) {
// Swap the BR for a non-breaking space text node
722 textNode = new Node('#text', 3);
723 textNode.value = '\u00a0';
724 node.replace(textNode);
731 // Force anchor names closed, unless the setting "allow_html_in_named_anchor" is explicitly included.
732 if (!settings.allow_html_in_named_anchor) {
733 self.addAttributeFilter('id,name', function(nodes) {
734 var i = nodes.length, sibling, prevSibling, parent, node;
738 if (node.name === 'a' && node.firstChild && !node.attr('href')) {
739 parent = node.parent;
741 // Move children after current node
742 sibling = node.lastChild;
744 prevSibling = sibling.prev;
745 parent.insert(sibling, node);
746 sibling = prevSibling;