Welcome to Software Development on Codidact!
Will you help us build our independent community of developers helping developers? We're small and trying to grow. We welcome questions about all aspects of software development, from design to code to QA and more. Got questions? Got answers? Got code you'd like someone to review? Please join us.
Post History
I wrote this HTML sanitizer for use in web scraping. The idea is to safely copy content from a site but apply my own style-sheet and remove any unsafe elements. This whitelist approach seems very d...
#2: Post edited
Is this HTML sanitizer safe?
I wrote this HTML sanitizer for use in web scrapping. The idea is to safely copy content from a site but apply my own style-sheet and remove any unsafe elements. This whitelist approach seems very different from other sanitizers that I have seen. Is there any disadvantage of doing it this way that I should be aware of?- ```javascript
- const allowedNodeTypes = [
- 'p',
- 'span',
- 'div',
- 'a',
- 'img',
- 'h1',
- 'h2',
- 'h3',
- 'h4',
- 'h5',
- 'strong',
- 'b',
- 'em',
- 'pre',
- 'blockquote',
- 'table',
- 'tr',
- 'td',
- 'tbody',
- 'u',
- 'br',
- 'i',
- 'ul',
- 'ol',
- 'li',
- 'figure',
- 'code',
- 'hr',
- 'italic',
- 'font',
- 'sup',
- 'sub'
- ]
- const allowedAttributeTypes = [
- 'href',
- 'src',
- 'alt',
- 'title'
- ]
- const urlTransformAttributeTypes = [
- 'src',
- 'href'
- ]
- function recursivelyCopyContent(elem1, elem2, keep_tags) {
- while (elem2.firstChild){
- elem2.removeChild(elem2.firstChild);
- }
- if (keep_tags === undefined) {
- keep_tags = true;
- }
- elem1.childNodes.forEach(child=>{
- if (child.nodeType === Node.TEXT_NODE){
- elem2.appendChild(
- document.createTextNode(child.data)
- );
- }
- else if (child.nodeType === Node.ELEMENT_NODE) {
- let tag = child.tagName.toLowerCase();
- if (allowedNodeTypes.indexOf(tag)>=0){
- let element = document.createElement(keep_tags?tag:'span');
- if (keep_tags){
- for (var name of child.attributes){
- if (urlTransformAttributeTypes.indexOf(name.nodeName)>=0){
- element.setAttribute(name.nodeName, new URL(name.value, 'https://target_site.com/folder/'));
- }
- else if (allowedAttributeTypes.indexOf(name.nodeName)>=0){
- element.setAttribute(name.nodeName, name.value);
- } else {
- console.log("not alled attribute", name.nodeName);
- }
- }
- }
- recursivelyCopyContent(child, element);
- elem2.appendChild(element);
- } else {
- console.log("Not allowed", tag);
- }
- }
- });
- }
- let htmlDoc = parser.parseFromString(await page.text(), 'text/html');
- recursivelyCopyContent(htmlDoc.getElementsByClassName('article-body')[0],
- document.getElementById('content')
- );
- I wrote this HTML sanitizer for use in web scraping. The idea is to safely copy content from a site but apply my own style-sheet and remove any unsafe elements. This whitelist approach seems very different from other sanitizers that I have seen. Is there any disadvantage of doing it this way that I should be aware of?
- ```javascript
- const allowedNodeTypes = [
- 'p',
- 'span',
- 'div',
- 'a',
- 'img',
- 'h1',
- 'h2',
- 'h3',
- 'h4',
- 'h5',
- 'strong',
- 'b',
- 'em',
- 'pre',
- 'blockquote',
- 'table',
- 'tr',
- 'td',
- 'tbody',
- 'u',
- 'br',
- 'i',
- 'ul',
- 'ol',
- 'li',
- 'figure',
- 'code',
- 'hr',
- 'italic',
- 'font',
- 'sup',
- 'sub'
- ]
- const allowedAttributeTypes = [
- 'href',
- 'src',
- 'alt',
- 'title'
- ]
- const urlTransformAttributeTypes = [
- 'src',
- 'href'
- ]
- function recursivelyCopyContent(elem1, elem2, keep_tags) {
- while (elem2.firstChild){
- elem2.removeChild(elem2.firstChild);
- }
- if (keep_tags === undefined) {
- keep_tags = true;
- }
- elem1.childNodes.forEach(child=>{
- if (child.nodeType === Node.TEXT_NODE){
- elem2.appendChild(
- document.createTextNode(child.data)
- );
- }
- else if (child.nodeType === Node.ELEMENT_NODE) {
- let tag = child.tagName.toLowerCase();
- if (allowedNodeTypes.indexOf(tag)>=0){
- let element = document.createElement(keep_tags?tag:'span');
- if (keep_tags){
- for (var name of child.attributes){
- if (urlTransformAttributeTypes.indexOf(name.nodeName)>=0){
- element.setAttribute(name.nodeName, new URL(name.value, 'https://target_site.com/folder/'));
- }
- else if (allowedAttributeTypes.indexOf(name.nodeName)>=0){
- element.setAttribute(name.nodeName, name.value);
- } else {
- console.log("not alled attribute", name.nodeName);
- }
- }
- }
- recursivelyCopyContent(child, element);
- elem2.appendChild(element);
- } else {
- console.log("Not allowed", tag);
- }
- }
- });
- }
- let htmlDoc = parser.parseFromString(await page.text(), 'text/html');
- recursivelyCopyContent(htmlDoc.getElementsByClassName('article-body')[0],
- document.getElementById('content')
- );
#1: Initial revision
Is this HTML sanitizer safe?
I wrote this HTML sanitizer for use in web scrapping. The idea is to safely copy content from a site but apply my own style-sheet and remove any unsafe elements. This whitelist approach seems very different from other sanitizers that I have seen. Is there any disadvantage of doing it this way that I should be aware of? ```javascript const allowedNodeTypes = [ 'p', 'span', 'div', 'a', 'img', 'h1', 'h2', 'h3', 'h4', 'h5', 'strong', 'b', 'em', 'pre', 'blockquote', 'table', 'tr', 'td', 'tbody', 'u', 'br', 'i', 'ul', 'ol', 'li', 'figure', 'code', 'hr', 'italic', 'font', 'sup', 'sub' ] const allowedAttributeTypes = [ 'href', 'src', 'alt', 'title' ] const urlTransformAttributeTypes = [ 'src', 'href' ] function recursivelyCopyContent(elem1, elem2, keep_tags) { while (elem2.firstChild){ elem2.removeChild(elem2.firstChild); } if (keep_tags === undefined) { keep_tags = true; } elem1.childNodes.forEach(child=>{ if (child.nodeType === Node.TEXT_NODE){ elem2.appendChild( document.createTextNode(child.data) ); } else if (child.nodeType === Node.ELEMENT_NODE) { let tag = child.tagName.toLowerCase(); if (allowedNodeTypes.indexOf(tag)>=0){ let element = document.createElement(keep_tags?tag:'span'); if (keep_tags){ for (var name of child.attributes){ if (urlTransformAttributeTypes.indexOf(name.nodeName)>=0){ element.setAttribute(name.nodeName, new URL(name.value, 'https://target_site.com/folder/')); } else if (allowedAttributeTypes.indexOf(name.nodeName)>=0){ element.setAttribute(name.nodeName, name.value); } else { console.log("not alled attribute", name.nodeName); } } } recursivelyCopyContent(child, element); elem2.appendChild(element); } else { console.log("Not allowed", tag); } } }); } let htmlDoc = parser.parseFromString(await page.text(), 'text/html'); recursivelyCopyContent(htmlDoc.getElementsByClassName('article-body')[0], document.getElementById('content') );