javascript HTML from document.body.innerHTML

There is the W3C DOM 3 Core textContent property supported by some browsers, or the MS/HTML5 innerText property supported by other browsers (some support both). Likely the content of the script element is unwanted, so a recursive traverse of the related part of the DOM tree seems best:

// Get the text within an element
// Doesn't do any normalising, returns a string
// of text as found.
function getTextRecursive(element) {
  var text = [];
  var self = arguments.callee;
  var el, els = element.childNodes;

  for (var i=0, iLen=els.length; i<iLen; i++) {
    el = els[i];

    // May need to add other node types here
    // Exclude script element content
    if (el.nodeType == 1 && el.tagName && el.tagName.toLowerCase() != 'script') {
      text.push(self(el));

    // If working with XML, add nodeType 4 to get text from CDATA nodes
    } else if (el.nodeType == 3) {

      // Deal with extra whitespace and returns in text here.
      text.push(el.data);
    }
  }
  return text.join('');
}

Leave a Comment