0

I want to remove all HTML tags except <a>, <img> and <iframe> from a document using this code:

// Pattern meant to match any tag that does not start with "a ", "img " or "iframe ".
// NOTE(review): this is a string literal, not a regex literal, so the "\s" and "\S"
// escapes collapse to plain "s" and "S" before match() turns the string into a RegExp.
var regex = "<(?!a )(?!img )(?!iframe )([\s\S]*?)>";
// Text of the tag matched on the current pass.
var temp;
// Remove one matched tag per iteration until the pattern no longer matches.
while (source.match(regex)) {
    temp = source.match(regex)[0];
    // replace() with a string pattern removes only the first occurrence of temp.
    source = source.replace(temp, "");
}
return source;

It works in online regex testers, but for some reason it doesn't work on my page. For example, it returns the original string when the input is:

    "<p class="MsoNormal" style="margin-left:202.5pt;line-height:200%;background:white"><b><span style="font-size: 16pt; line-height: 200%; color: rgb(131, 60, 11); background-image: initial; background-attachment: initial; background-size: initial; background-origin: initial; background-clip: initial; background-position: initial; background-repeat: initial;">test</span></b><span style="font-size:16.0pt;
line-height:200%;color:#833C0B;letter-spacing:-.15pt;mso-ansi-language:EN-US"><o:p></o:p></span></p>"

Please help!

6
  • Can you tell what exactly you are trying to do? Commented Jan 4, 2015 at 0:01
  • Isn't [\s\S] the equivalent of .? Have you tried adding console.log(temp) inside your while loop (or setting a break point) to see what is actually happening? Commented Jan 4, 2015 at 0:02
  • [\s\S] allows regex to match across multiple lines Commented Jan 4, 2015 at 0:05
  • Use regex literal syntax: var regex = /<(?!a )(?!img )(?!iframe )([\s\S]*?)>/; (note the forward slashes). You are declaring a string, which means [\s\S] ends up as simply [sS]. (To use a string you need to escape the backslashes: "[\\s\\S]".) Commented Jan 4, 2015 at 0:11
  • 2
    Obligatory: stackoverflow.com/questions/1732348/… Commented Jan 4, 2015 at 0:19

2 Answers 2

2

You can do it without a regex. It's usually not a good idea to try parsing HTML with regexes, unless the use case is very simple...

The way I implemented stripHtmlElementsMatching, you can pass it any CSS selector and it will strip all matching elements (their contents are kept).

Therefore, to remove anything but a, img, iframe you can pass :not(a):not(img):not(iframe).

PS: The htmlstripping-root custom tag is only there to avoid creating a parser element that interferes with the passed selector. For instance, if I used div as the parser element and you passed the selector div > div, all top-level divs would be removed even though they were not nested in your HTML string.

/**
 * Removes every element matching a CSS selector from an HTML string while
 * keeping the removed element's children (text and nested markup) in place.
 *
 * @param {string} text - HTML markup to process.
 * @param {string} [selector] - CSS selector of the elements to unwrap. Any
 *     non-string value falls back to ':not(*)', which matches nothing, so the
 *     markup is returned re-serialized but otherwise untouched.
 * @returns {string} The serialized markup after unwrapping the matched elements.
 */
var stripHtmlElementsMatching = (function(doc) {

  // Hyphenated tag used as the throwaway parsing container, so that selectors
  // such as 'div > div' can never match against the container itself.
  // No registration is needed: createElement accepts unknown hyphenated names,
  // and the old document.registerElement API has been removed from browsers,
  // so calling it would throw before this function was even created.
  var PARSER_TAG = 'htmlstripping-root';

  return function(text, selector) {

    var parser = doc.createElement(PARSER_TAG),
        matchingEls, i, len, el;

    selector = typeof selector === 'string' ? selector : ':not(*)';

    // NOTE(review): the container belongs to the live document, so parsing
    // untrusted markup here may still start resource loads (e.g. <img src>).
    // Confirm whether callers pass untrusted input.
    parser.innerHTML = text;

    // querySelectorAll returns a static list, so unwrapping elements while
    // iterating does not disturb the iteration.
    matchingEls = parser.querySelectorAll(selector);

    for (i = 0, len = matchingEls.length; i < len; i++) {
      el = matchingEls[i];
      // Swap the element for a fragment holding its children: the tag goes,
      // the content stays.
      el.parentNode.replaceChild(newFragFrom(el.childNodes), el);
    }

    return parser.innerHTML;
  };

  /**
   * Moves all nodes of a live NodeList into a new DocumentFragment.
   * Appending a node removes it from the live list, so the loop drains it.
   */
  function newFragFrom(nodes) {
    var frag = doc.createDocumentFragment();

    while (nodes.length) frag.appendChild(nodes[0]);

    return frag;
  }

})(document);


// Sample input: Word-style markup with nested <p>, <b>, <span> and <o:p> elements.
var text = '<p class="MsoNormal" style="margin-left:202.5pt;line-height:200%;background:white"><b><span style="font-size: 16pt; line-height: 200%; color: rgb(131, 60, 11); background-image: initial; background-attachment: initial; background-size: initial; background-origin: initial; background-clip: initial; background-position: initial; background-repeat: initial;">test</span></b><span style="font-size:16.0pt; line-height:200%;color:#833C0B;letter-spacing:-.15pt;mso-ansi-language:EN-US"><o:p></o:p></span></p>';

// Tag names whose elements must survive the stripping pass.
var tagsToKeep = ['a', 'img', 'iframe'];

// Chain one ':not(tag)' per kept tag, giving ':not(a):not(img):not(iframe)'.
var sanitizeSelector = tagsToKeep.reduce(function(selectorSoFar, tag) {
  return selectorSoFar + ':not(' + tag + ')';
}, '');

// Unwrap every element that is not one of the kept tags.
var sanitizedText = stripHtmlElementsMatching(text, sanitizeSelector);

// Append the result as a text node so the remaining markup is shown literally.
document.body.appendChild(
  document.createTextNode(sanitizedText)
);

Sign up to request clarification or add additional context in comments.

Comments

2

This is the best that I could come up with!

<((?!a)|a\w)(?!\/a)(?!img)(?!iframe)(?!\/iframe)+([\s\S]*?)>

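The first capturing group matches either a tag name that does not start with "a", or an "a" followed by another word character. This way tags such as audio, abbr, address, etc. are still matched (and therefore removed) instead of being mistaken for <a>.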

Just replace the matches from the above regex with nothing.

Please see: http://regexr.com/3a5hp

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.