Welcome to Software Development on Codidact!

Will you help us build our independent community of developers helping developers? We're small and trying to grow. We welcome questions about all aspects of software development, from design to code to QA and more. Got questions? Got answers? Got code you'd like someone to review? Please join us.

Review Suggested Edit

You can't approve or reject suggested edits because you haven't yet earned the Edit Posts ability.

Is this HTML sanitizer safe?

I wrote this HTML sanitizer for use in web scrapping. The idea is to safely copy content from a site but apply my own style-sheet and remove any unsafe elements. This whitelist approach seems very different from other sanitizers that I have seen. Is there any disadvantage of doing it this way that I should be aware of?
```javascript
const allowedNodeTypes = [
'p',
'span',
'div',
'a',
'img',
'h1',
'h2',
'h3',
'h4',
'h5',
'strong',
'b',
'em',
'pre',
'blockquote',
'table',
'tr',
'td',
'tbody',
'u',
'br',
'i',
'ul',
'ol',
'li',
'figure',
'code',
'hr',
'italic',
'font',
'sup',
'sub'
]
const allowedAttributeTypes = [
'href',
'src',
'alt',
'title'
]
const urlTransformAttributeTypes = [
'src',
'href'
]
function recursivelyCopyContent(elem1, elem2, keep_tags) {
while (elem2.firstChild){
elem2.removeChild(elem2.firstChild);
}
if (keep_tags === undefined) {
keep_tags = true;
}
elem1.childNodes.forEach(child=>{
if (child.nodeType === Node.TEXT_NODE){
elem2.appendChild(
document.createTextNode(child.data)
);
}
else if (child.nodeType === Node.ELEMENT_NODE) {
let tag = child.tagName.toLowerCase();
if (allowedNodeTypes.indexOf(tag)>=0){
let element = document.createElement(keep_tags?tag:'span');
if (keep_tags){
for (var name of child.attributes){
if (urlTransformAttributeTypes.indexOf(name.nodeName)>=0){
element.setAttribute(name.nodeName, new URL(name.value, 'https://target_site.com/folder/'));
}
else if (allowedAttributeTypes.indexOf(name.nodeName)>=0){
element.setAttribute(name.nodeName, name.value);
} else {
console.log("not alled attribute", name.nodeName);
}
}
}
recursivelyCopyContent(child, element);
elem2.appendChild(element);
} else {
console.log("Not allowed", tag);
}
}
});
}
let htmlDoc = parser.parseFromString(await page.text(), 'text/html');
recursivelyCopyContent(htmlDoc.getElementsByClassName('article-body')[0],
document.getElementById('content')
);

I wrote this HTML sanitizer for use in web scraping. The idea is to safely copy content from a site but apply my own style-sheet and remove any unsafe elements. This whitelist approach seems very different from other sanitizers that I have seen. Is there any disadvantage of doing it this way that I should be aware of?
```javascript
const allowedNodeTypes = [
'p',
'span',
'div',
'a',
'img',
'h1',
'h2',
'h3',
'h4',
'h5',
'strong',
'b',
'em',
'pre',
'blockquote',
'table',
'tr',
'td',
'tbody',
'u',
'br',
'i',
'ul',
'ol',
'li',
'figure',
'code',
'hr',
'italic',
'font',
'sup',
'sub'
]
const allowedAttributeTypes = [
'href',
'src',
'alt',
'title'
]
const urlTransformAttributeTypes = [
'src',
'href'
]
function recursivelyCopyContent(elem1, elem2, keep_tags) {
while (elem2.firstChild){
elem2.removeChild(elem2.firstChild);
}
if (keep_tags === undefined) {
keep_tags = true;
}
elem1.childNodes.forEach(child=>{
if (child.nodeType === Node.TEXT_NODE){
elem2.appendChild(
document.createTextNode(child.data)
);
}
else if (child.nodeType === Node.ELEMENT_NODE) {
let tag = child.tagName.toLowerCase();
if (allowedNodeTypes.indexOf(tag)>=0){
let element = document.createElement(keep_tags?tag:'span');
if (keep_tags){
for (var name of child.attributes){
if (urlTransformAttributeTypes.indexOf(name.nodeName)>=0){
element.setAttribute(name.nodeName, new URL(name.value, 'https://target_site.com/folder/'));
}
else if (allowedAttributeTypes.indexOf(name.nodeName)>=0){
element.setAttribute(name.nodeName, name.value);
} else {
console.log("not alled attribute", name.nodeName);
}
}
}
recursivelyCopyContent(child, element);
elem2.appendChild(element);
} else {
console.log("Not allowed", tag);
}
}
});
}
let htmlDoc = parser.parseFromString(await page.text(), 'text/html');
recursivelyCopyContent(htmlDoc.getElementsByClassName('article-body')[0],
document.getElementById('content')
);

javascript

javascript web-scraping

Communities

Review Suggested Edit