The html tags can be removed in two different ways:
- Reg Exp - Regular Expression
- Converting the HTML to XML and using XmlService to get every element, and then getting the value of each element
The Reg Exp is better because you don't need to find every HTML element, which requires a lot more code.
The HTML must first be converted to XML so that XmlService.getPrettyFormat() can be used. If the html tags were removed first with a Regular Expression, then the code wouldn't know where the line breaks were supposed to be.
Using XmlService.getPrettyFormat() will format the html with line breaks. But to use XmlService, the html string must first be converted to XML. And there are a couple of things that you need to do when converting the html string to XML in order to avoid errors.
/**
 * Strips the tags from a sample HTML snippet and logs the remaining text.
 *
 * The snippet is parsed as XML and run through XmlService's pretty formatter
 * before the tags are removed, so the line breaks inserted by the formatter
 * are still present in the logged text.
 */
function parseHtml() {
  var BR_TAG = /<br>/g;
  var ANY_TAG = /<[^>]*>/g;

  var fragments = [
    'This is just a Test<br><br>Here is my List<br>',
    '<ol><li>one</li><li>Two</li><li>Three</li></ol><br>And a bulleted one<br><ul>',
    '<li>Bullet One</li><li>Bullet Two</li><li>Bullet Three</li></ul>'
  ];

  // A single wrapping element avoids the "Content is not allowed in prolog." error.
  var wrapped = '<div>' + fragments.join('') + '</div>';

  // <br> is dropped because it causes an error when the string is parsed as XML.
  var parseable = wrapped.replace(BR_TAG, "");

  var xmlDoc = XmlService.parse(parseable);
  var pretty = XmlService.getPrettyFormat().format(xmlDoc);

  // Only the text (and the formatter's line breaks) is left once every tag is removed.
  Logger.log(pretty.replace(ANY_TAG, ""));
}
Another way to do it, which is provided just as a learning example, is to parse the HTML as XML with XmlService and then loop through all the elements.
The following code only goes down through a couple layers of children.
/**
 * Learning example: extracts the text of a sample HTML snippet by parsing it
 * as XML with XmlService and walking the parsed nodes by hand.
 *
 * Only two levels are visited: the direct content of the wrapping <div> and
 * the direct children of each element found there. Each collected piece of
 * text is followed by a line break. The result is logged and returned.
 *
 * @return {string} The collected text, one entry per line.
 */
function parseHtml() {
  var html = 'This is just a Test<br><br>Here is my List<br>' +
      '<ol><li>one</li><li>Two</li><li>Three</li></ol><br>And a bulleted one<br><ul>' +
      '<li>Bullet One</li><li>Bullet Two</li><li>Bullet Three</li></ul>';
  // A single wrapping element gives the parser one root to work with.
  html = '<div>' + html + '</div>';
  // The unclosed <br> tags are removed before the string is handed to the XML parser.
  html = html.replace(/<br>/g, "");

  var content = XmlService.parse(html).getRootElement().getAllContent();
  var allText = "";

  for (var i = 0; i < content.length; i++) {
    var node = content[i];
    if (!node) {
      continue;
    }

    // Anything that is not an element (e.g. a text node) contributes its value directly.
    if (node.getType() !== XmlService.ContentTypes.ELEMENT) {
      allText = allText + node.getValue() + "\n";
      continue;
    }

    var element = node.asElement();
    // Taken fresh for every element so no count is carried over between iterations.
    var children = element.getChildren() || [];

    // An element without child elements contributes its own value.
    if (children.length === 0) {
      allText = allText + element.getValue() + "\n";
      continue;
    }

    // Otherwise each child contributes its value; empty values are skipped.
    for (var j = 0; j < children.length; j++) {
      var childText = children[j].getValue();
      if (!childText) {
        continue;
      }
      allText = allText + childText + "\n";
    }
  }

  Logger.log(allText);
  return allText;
}