3 namespace BookStack\Util;
/**
 * Static helpers which strip script-capable content out of a HTML fragment.
 */
class HtmlContentFilter
{
    /**
     * Remove all the script elements from the given HTML.
     *
     * The fragment is parsed into a DOM, then the following are removed:
     * script tags, elements with "javascript:" href/action/formaction values,
     * meta tags whose content mentions "url", elements with "data:" or
     * "javascript:" src values or any srcdoc attribute, xlink:href attributes
     * and all "on*" attributes. The remaining nodes are serialised back to HTML.
     *
     * @param string $html HTML fragment (not a full document) to clean.
     *
     * @return string The cleaned fragment, or an empty string when the
     *                fragment could not be parsed into a body element.
     */
    public static function removeScripts(string $html): string
    {
        if ($html === '') {
            return $html;
        }

        // Collect parser warnings internally rather than emitting PHP warnings
        // for the malformed markup that user content often contains.
        libxml_use_internal_errors(true);
        $doc = new DOMDocument();

        // loadHTML() does not assume UTF-8 input, so non-ASCII characters are
        // converted to numeric entities first. This replaces the 'HTML-ENTITIES'
        // mb_convert_encoding() target, which is deprecated as of PHP 8.2.
        $encoded = mb_encode_numericentity(
            '<body>' . $html . '</body>',
            [0x80, 0x10FFFF, 0, 0x1FFFFF],
            'UTF-8'
        );
        $doc->loadHTML($encoded);

        // Drop the collected parser errors so they do not pile up across calls.
        libxml_clear_errors();

        $xPath = new DOMXPath($doc);

        // Remove standard script tags
        static::removeNodes($xPath->query('//script'));

        // Remove clickable links to JavaScript URI
        $badLinks = $xPath->query('//*[' . static::xpathContains('@href', 'javascript:') . ']');
        static::removeNodes($badLinks);

        // Remove forms with calls to JavaScript URI
        $badForms = $xPath->query(
            '//*[' . static::xpathContains('@action', 'javascript:') . ']'
            . ' | //*[' . static::xpathContains('@formaction', 'javascript:') . ']'
        );
        static::removeNodes($badForms);

        // Remove meta tag to prevent external redirects
        $metaTags = $xPath->query('//meta[' . static::xpathContains('@content', 'url') . ']');
        static::removeNodes($metaTags);

        // Remove data or JavaScript iFrames
        $badIframes = $xPath->query(
            '//*[' . static::xpathContains('@src', 'data:') . ']'
            . ' | //*[' . static::xpathContains('@src', 'javascript:') . ']'
            . ' | //*[@srcdoc]'
        );
        static::removeNodes($badIframes);

        // Remove elements with a xlink:href attribute
        // Used in SVG but deprecated anyway, so we'll be a bit more heavy-handed here.
        $xlinkHrefAttributes = $xPath->query('//@*[contains(name(), \'xlink:href\')]');
        static::removeAttributes($xlinkHrefAttributes);

        // Remove 'on*' attributes
        $onAttributes = $xPath->query('//@*[starts-with(name(), \'on\')]');
        static::removeAttributes($onAttributes);

        // The first child of the root element is the <body> wrapper added above;
        // only its children belong to the caller's original fragment.
        $root = $doc->documentElement;
        $body = ($root !== null) ? $root->childNodes->item(0) : null;
        if ($body === null) {
            return '';
        }

        $cleaned = '';
        foreach ($body->childNodes as $child) {
            $cleaned .= $doc->saveHTML($child);
        }

        return $cleaned;
    }

    /**
     * Create a xpath contains statement with a translation automatically built within
     * to effectively search in a case-insensitive manner.
     *
     * ASCII tab, line-feed and carriage-return characters are also deleted from
     * the inspected value before matching, since URL parsers discard them and
     * they could otherwise be used to disguise a scheme such as "javascript:".
     *
     * @param string $property XPath expression selecting the value to inspect (e.g. "@href").
     * @param string $value    Text to look for. It is embedded in single-quoted XPath
     *                         literals so it must not contain a single quote.
     *
     * @return string XPath boolean expression.
     */
    protected static function xpathContains(string $property, string $value): string
    {
        $value = strtolower($value);
        $upperVal = strtoupper($value);

        // translate() deletes any character of its second argument that has no
        // counterpart in the third, so the trailing whitespace characters are
        // removed while the upper-case letters are mapped to lower-case.
        $from = $upperVal . "\t\n\r";

        return 'contains(translate(' . $property . ', \'' . $from . '\', \'' . $value . '\'), \'' . $value . '\')';
    }

    /**
     * Remove all the given DOMNodes.
     *
     * @param DOMNodeList $nodes Nodes to detach from their parents.
     */
    protected static function removeNodes(DOMNodeList $nodes): void
    {
        foreach ($nodes as $node) {
            $parent = $node->parentNode;
            if ($parent !== null) {
                $parent->removeChild($node);
            }
        }
    }

    /**
     * Remove all the given attribute nodes.
     *
     * @param DOMNodeList $attrs Attribute nodes to remove from their owning elements.
     */
    protected static function removeAttributes(DOMNodeList $attrs): void
    {
        /** @var DOMAttr $attr */
        foreach ($attrs as $attr) {
            // ownerElement is the documented link from an attribute to its
            // element; parentNode is not defined for attribute nodes.
            $owner = $attr->ownerElement;
            if ($owner !== null) {
                $owner->removeAttributeNode($attr);
            }
        }
    }
}