Welcome to Software Development on Codidact!

Will you help us build our independent community of developers helping developers? We're small and trying to grow. We welcome questions about all aspects of software development, from design to code to QA and more. Got questions? Got answers? Got code you'd like someone to review? Please join us.

Post History

81%

+7 −0

Code Reviews Is this HTML sanitizer safe?

I wrote this HTML sanitizer for use in web scraping. The idea is to safely copy content from a site but apply my own style-sheet and remove any unsafe elements. This whitelist approach seems very d...

1 answer · posted 4y ago by mousetail‭ · edited 4y ago by Peter Taylor‭

Question javascript web-scraping

#2: Post edited by

Peter Taylor‭ · 2021-07-07T14:56:50Z (almost 4 years ago)
The present participle of *scrape* has one *p*; two *p*s gives the present participle of *scrap*.

Copy Link

Raw

Markdown

Is this HTML sanitizer safe?

I wrote this HTML sanitizer for use in web scrapping. The idea is to safely copy content from a site but apply my own style-sheet and remove any unsafe elements. This whitelist approach seems very different from other sanitizers that I have seen. Is there any disadvantage of doing it this way that I should be aware of?
```javascript
const allowedNodeTypes = [
'p',
'span',
'div',
'a',
'img',
'h1',
'h2',
'h3',
'h4',
'h5',
'strong',
'b',
'em',
'pre',
'blockquote',
'table',
'tr',
'td',
'tbody',
'u',
'br',
'i',
'ul',
'ol',
'li',
'figure',
'code',
'hr',
'italic',
'font',
'sup',
'sub'
]
const allowedAttributeTypes = [
'href',
'src',
'alt',
'title'
]
const urlTransformAttributeTypes = [
'src',
'href'
]
function recursivelyCopyContent(elem1, elem2, keep_tags) {
while (elem2.firstChild){
elem2.removeChild(elem2.firstChild);
}
if (keep_tags === undefined) {
keep_tags = true;
}
elem1.childNodes.forEach(child=>{
if (child.nodeType === Node.TEXT_NODE){
elem2.appendChild(
document.createTextNode(child.data)
);
}
else if (child.nodeType === Node.ELEMENT_NODE) {
let tag = child.tagName.toLowerCase();
if (allowedNodeTypes.indexOf(tag)>=0){
let element = document.createElement(keep_tags?tag:'span');
if (keep_tags){
for (var name of child.attributes){
if (urlTransformAttributeTypes.indexOf(name.nodeName)>=0){
element.setAttribute(name.nodeName, new URL(name.value, 'https://target_site.com/folder/'));
}
else if (allowedAttributeTypes.indexOf(name.nodeName)>=0){
element.setAttribute(name.nodeName, name.value);
} else {
console.log("not alled attribute", name.nodeName);
}
}
}
recursivelyCopyContent(child, element);
elem2.appendChild(element);
} else {
console.log("Not allowed", tag);
}
}
});
}
let htmlDoc = parser.parseFromString(await page.text(), 'text/html');
recursivelyCopyContent(htmlDoc.getElementsByClassName('article-body')[0],
document.getElementById('content')
);

I wrote this HTML sanitizer for use in web scraping. The idea is to safely copy content from a site but apply my own style-sheet and remove any unsafe elements. This whitelist approach seems very different from other sanitizers that I have seen. Is there any disadvantage of doing it this way that I should be aware of?
```javascript
const allowedNodeTypes = [
'p',
'span',
'div',
'a',
'img',
'h1',
'h2',
'h3',
'h4',
'h5',
'strong',
'b',
'em',
'pre',
'blockquote',
'table',
'tr',
'td',
'tbody',
'u',
'br',
'i',
'ul',
'ol',
'li',
'figure',
'code',
'hr',
'italic',
'font',
'sup',
'sub'
]
const allowedAttributeTypes = [
'href',
'src',
'alt',
'title'
]
const urlTransformAttributeTypes = [
'src',
'href'
]
function recursivelyCopyContent(elem1, elem2, keep_tags) {
while (elem2.firstChild){
elem2.removeChild(elem2.firstChild);
}
if (keep_tags === undefined) {
keep_tags = true;
}
elem1.childNodes.forEach(child=>{
if (child.nodeType === Node.TEXT_NODE){
elem2.appendChild(
document.createTextNode(child.data)
);
}
else if (child.nodeType === Node.ELEMENT_NODE) {
let tag = child.tagName.toLowerCase();
if (allowedNodeTypes.indexOf(tag)>=0){
let element = document.createElement(keep_tags?tag:'span');
if (keep_tags){
for (var name of child.attributes){
if (urlTransformAttributeTypes.indexOf(name.nodeName)>=0){
element.setAttribute(name.nodeName, new URL(name.value, 'https://target_site.com/folder/'));
}
else if (allowedAttributeTypes.indexOf(name.nodeName)>=0){
element.setAttribute(name.nodeName, name.value);
} else {
console.log("not alled attribute", name.nodeName);
}
}
}
recursivelyCopyContent(child, element);
elem2.appendChild(element);
} else {
console.log("Not allowed", tag);
}
}
});
}
let htmlDoc = parser.parseFromString(await page.text(), 'text/html');
recursivelyCopyContent(htmlDoc.getElementsByClassName('article-body')[0],
document.getElementById('content')
);

#1: Initial revision by

mousetail‭ · 2021-06-23T13:16:21Z (almost 4 years ago)

Copy Link

Raw

Markdown

Is this HTML sanitizer safe?

I wrote this HTML sanitizer for use in web scrapping. The idea is to safely copy content from a site but apply my own style-sheet and remove any unsafe elements. This whitelist approach seems very different from other sanitizers that I have seen. Is there any disadvantage of doing it this way that I should be aware of?

```javascript
const allowedNodeTypes = [
	'p',
	'span',
	'div',
	'a',
	'img',
	'h1',
	'h2',
	'h3',
	'h4',
	'h5',
	'strong',
	'b',
	'em',
	'pre',
	'blockquote',
	'table',
	'tr',
	'td',
	'tbody',
	'u',
	'br',
	'i',
	'ul',
	'ol',
	'li',
	'figure',
	'code',
	'hr',
	'italic',
	'font',
	'sup',
	'sub'
]
	
const allowedAttributeTypes = [
	'href',
	'src',
	'alt',
	'title'
]
const urlTransformAttributeTypes = [
	'src', 
	'href'
]

function recursivelyCopyContent(elem1, elem2, keep_tags) {
	while (elem2.firstChild){
		elem2.removeChild(elem2.firstChild);
	}
	
	if (keep_tags === undefined) {
		keep_tags = true;
	}
	
	elem1.childNodes.forEach(child=>{
		if (child.nodeType === Node.TEXT_NODE){
			elem2.appendChild(
				document.createTextNode(child.data)
			);
		}
		else if (child.nodeType === Node.ELEMENT_NODE)  {
			let tag = child.tagName.toLowerCase();
			if (allowedNodeTypes.indexOf(tag)>=0){
				let element = document.createElement(keep_tags?tag:'span');
				
				if (keep_tags){
					for (var name of child.attributes){
						if (urlTransformAttributeTypes.indexOf(name.nodeName)>=0){
							element.setAttribute(name.nodeName, new URL(name.value, 'https://target_site.com/folder/'));
						}
						else if (allowedAttributeTypes.indexOf(name.nodeName)>=0){
							element.setAttribute(name.nodeName, name.value);
						} else {
							console.log("not alled attribute", name.nodeName);
						}
					}
				}
				
				recursivelyCopyContent(child, element);
				elem2.appendChild(element);
			} else {
				console.log("Not allowed", tag);
			}
		}
	});
}


let htmlDoc = parser.parseFromString(await page.text(), 'text/html');

recursivelyCopyContent(htmlDoc.getElementsByClassName('article-body')[0],
	document.getElementById('content')
);

javascript web-scraping

Communities

Post History