Last active
November 11, 2021 09:22
-
-
Save baotuo/e444e236af08f873435b to your computer and use it in GitHub Desktop.
Remove HTML formatting from a String
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Removes MS Office (Word) generated guff from an HTML string.
 *
 * The clean-up runs in five passes:
 *   1. line breaks become single spaces and `class=Mso...` attributes are dropped;
 *   2. HTML comments (including Word conditional comments) are removed;
 *   3. presentational / Office-namespace tags (meta, link, span, font, ?xml:,
 *      st1:, o:) are removed while their inner content is kept;
 *   4. style, script, applet, embed, noframes and noscript elements are
 *      removed together with everything between their open and close tags;
 *   5. `style` and `start` attributes are removed.
 *
 * NOTE: this is a regex-based tidy-up, not an HTML parser. It should not be
 * relied on as a security (XSS) sanitizer for untrusted markup.
 *
 * @param {string} input - HTML markup to clean. `null`/`undefined` are
 *   treated as an empty string; other non-string values are stringified.
 * @returns {string} The cleaned markup.
 */
function cleanHTML(input) {
  // Missing input yields an empty result instead of a TypeError on `.replace`.
  if (input === null || input === undefined) {
    return '';
  }
  var output = String(input);

  // 1a. Line breaks become one space each (CRLF counts as a single break) so
  //     that words on adjacent lines are not glued together.
  output = output.replace(/\r\n|\n|\r/g, ' ');

  // 1b. Drop Word's `class=MsoXxx` attributes (double-quoted, single-quoted or
  //     unquoted). The leading whitespace is removed with the attribute so no
  //     stray space is left inside the tag (`<p>` rather than `<p >`).
  output = output.replace(
    /\s+class\s*=\s*(?:"Mso[a-zA-Z]+"|'Mso[a-zA-Z]+'|Mso[a-zA-Z]+(?=[\s\/>]))/g,
    ''
  );

  // 2. Strip HTML comments. `[\s\S]` is used so the match never depends on
  //    which characters `.` happens to exclude.
  output = output.replace(/<!--[\s\S]*?-->/g, '');

  // 3. Remove unwanted tags but keep their content. The lookahead after the
  //    plain tag names makes sure `<meta>` matches while `<metadata>` or
  //    `<font-face>` do not; the namespace prefixes (`o:`, `st1:`, `?xml:`)
  //    intentionally match any element in that namespace.
  output = output.replace(
    /<\/*(?:(?:meta|link|span|font)(?=[\s\/>])|\?xml:|st1:|o:)[^>]*>/gi,
    ''
  );

  // 4. Remove these elements including everything between the opening tag and
  //    its matching closing tag. The backreference `\1` ties the closing tag
  //    to the opening one, so a `>` or the tag's own name appearing inside the
  //    content cannot end the match early.
  output = output.replace(
    /<(style|script|applet|embed|noframes|noscript)(?=[\s\/>])[^>]*>[\s\S]*?<\/\1\s*>/gi,
    ''
  );

  // 5. Remove `style="..."` / `start="..."` attributes, accepting either
  //    quote style and optional whitespace around `=`.
  output = output.replace(
    /\s+(?:style|start)\s*=\s*(?:"[^"]*"|'[^']*')/gi,
    ''
  );

  return output;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment