From d6874a87c6eca858d528a88ba5531b460624230e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Ausw=C3=B6ger?= Date: Tue, 2 Apr 2024 14:30:28 +0200 Subject: [PATCH] [DomCrawler] Encode html entities only if nessecary --- src/Symfony/Component/DomCrawler/Crawler.php | 22 +++++++++++++++++-- .../Tests/AbstractCrawlerTestCase.php | 4 ++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/Symfony/Component/DomCrawler/Crawler.php b/src/Symfony/Component/DomCrawler/Crawler.php index a6c214add0964..239ace66ef262 100644 --- a/src/Symfony/Component/DomCrawler/Crawler.php +++ b/src/Symfony/Component/DomCrawler/Crawler.php @@ -1151,12 +1151,30 @@ protected function sibling(\DOMNode $node, string $siblingDir = 'nextSibling') private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument { - return $this->html5Parser->parse($this->convertToHtmlEntities($htmlContent, $charset)); + if (!$this->supportsEncoding($charset)) { + $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset); + $charset = 'UTF-8'; + } + + return $this->html5Parser->parse($htmlContent, ['encoding' => $charset]); + } + + private function supportsEncoding(string $encoding): bool + { + try { + return '' === @mb_convert_encoding('', $encoding, 'UTF-8'); + } catch (\Throwable $e) { + return false; + } } private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument { - $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset); + if ('UTF-8' === $charset && preg_match('//u', $htmlContent)) { + $htmlContent = ''.$htmlContent; + } else { + $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset); + } $internalErrors = libxml_use_internal_errors(true); if (\LIBXML_VERSION < 20900) { diff --git a/src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTestCase.php b/src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTestCase.php index 831c2f060b012..e60f3e4ef8b0b 100644 --- a/src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTestCase.php +++ b/src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTestCase.php @@ -194,6 +194,10 @@ public function testAddContent() $crawler = $this->createCrawler(); $crawler->addContent($this->getDoctype().'
'); $this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() ignores bad charset'); + + $crawler = $this->createCrawler(); + $crawler->addContent($this->getDoctype().'', 'text/html; charset=UTF-8'); + $this->assertEquals('var foo = "bär";', $crawler->filterXPath('//script')->text(), '->addContent() does not interfere with script content'); } /**