Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 96b5f69

Browse files
committed
[DomCrawler] Improve Crawler HTML5 parser need detection
1 parent 45fd75e commit 96b5f69
Copy full SHA for 96b5f69

File tree

4 files changed

+59
-64
lines changed
Filter options

4 files changed

+59
-64
lines changed

‎src/Symfony/Component/DomCrawler/Crawler.php

Copy file name to clipboard | Expand all lines: src/Symfony/Component/DomCrawler/Crawler.php
+12 -14 | Lines changed: 12 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -61,23 +61,16 @@ class Crawler implements \Countable, \IteratorAggregate
6161
private $html5Parser;
6262

6363
/**
64-
* @param mixed $node A Node to use as the base for the crawling
65-
* @param string $uri The current URI
66-
* @param string $baseHref The base href value
67-
* @param bool|null $useHtml5Parser Whether the Crawler should use the HTML5 parser or the native DOM parser
64+
* @param mixed $node A Node to use as the base for the crawling
65+
* @param string $uri The current URI
66+
* @param string $baseHref The base href value
67+
* @param HTML5 $html5Parser A default HTML5 parser instance
6868
*/
69-
public function __construct($node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = null)
69+
public function __construct($node = null, string $uri = null, string $baseHref = null, HTML5 $html5Parser = null)
7070
{
7171
$this->uri = $uri;
7272
$this->baseHref = $baseHref ?: $uri;
73-
74-
if ($useHtml5Parser && !class_exists(HTML5::class)) {
75-
throw new \LogicException('Using the DomCrawler HTML5 parser requires the html5-php library. Try running "composer require masterminds/html5".');
76-
}
77-
78-
if ($useHtml5Parser ?? class_exists(HTML5::class)) {
79-
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
80-
}
73+
$this->html5Parser = $html5Parser;
8174

8275
$this->add($node);
8376
}
@@ -198,6 +191,11 @@ public function addContent($content, $type = null)
198191
*/
199192
public function addHtmlContent($content, $charset = 'UTF-8')
200193
{
194+
// Use HTML5 parser if the content is HTML5 and the library is available
195+
if (!$this->html5Parser && class_exists(HTML5::class) && 0 === stripos(ltrim($content), '<!DOCTYPE html>')) {
196+
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
197+
}
198+
201199
$dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
202200
$this->addDocument($dom);
203201

@@ -1215,7 +1213,7 @@ private function findNamespacePrefixes(string $xpath): array
12151213
*/
12161214
private function createSubCrawler($nodes)
12171215
{
1218-
$crawler = new static($nodes, $this->uri, $this->baseHref);
1216+
$crawler = new static($nodes, $this->uri, $this->baseHref, $this->html5Parser);
12191217
$crawler->isHtml = $this->isHtml;
12201218
$crawler->document = $this->document;
12211219
$crawler->namespaces = $this->namespaces;

‎src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTest.php

Copy file name to clipboard | Expand all lines: src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTest.php
+33 -40 | Lines changed: 33 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -16,14 +16,12 @@
1616

1717
abstract class AbstractCrawlerTest extends TestCase
1818
{
19-
/**
20-
* @param mixed $node
21-
* @param string|null $uri
22-
* @param string|null $baseHref
23-
*
24-
* @return Crawler
25-
*/
26-
abstract public function createCrawler($node = null, string $uri = null, string $baseHref = null);
19+
abstract public function getDoctype(): string;
20+
21+
protected function createCrawler($node = null, string $uri = null, string $baseHref = null)
22+
{
23+
return new Crawler($node, $uri, $baseHref);
24+
}
2725

2826
public function testConstructor()
2927
{
@@ -74,7 +72,7 @@ public function testAdd()
7472
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->add() adds nodes from a \DOMNode');
7573

7674
$crawler = $this->createCrawler();
77-
$crawler->add('<html><body>Foo</body></html>');
75+
$crawler->add($this->getDoctype().'<html><body>Foo</body></html>');
7876
$this->assertEquals('Foo', $crawler->filterXPath('//body')->text(), '->add() adds nodes from a string');
7977
}
8078

@@ -94,22 +92,21 @@ public function testAddInvalidType()
9492
public function testAddMultipleDocumentNode()
9593
{
9694
$crawler = $this->createTestCrawler();
97-
$crawler->addHtmlContent('<html><div class="foo"></html>', 'UTF-8');
95+
$crawler->addHtmlContent($this->getDoctype().'<html><div class="foo"></html>', 'UTF-8');
9896
}
9997

10098
public function testAddHtmlContent()
10199
{
102100
$crawler = $this->createCrawler();
103-
$crawler->addHtmlContent('<html><div class="foo"></html>', 'UTF-8');
101+
$crawler->addHtmlContent($this->getDoctype().'<html><div class="foo"></html>', 'UTF-8');
104102

105103
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addHtmlContent() adds nodes from an HTML string');
106104
}
107105

108106
public function testAddHtmlContentWithBaseTag()
109107
{
110108
$crawler = $this->createCrawler();
111-
112-
$crawler->addHtmlContent('<html><head><base href="http://symfony.com"></head><a href="/contact"></a></html>', 'UTF-8');
109+
$crawler->addHtmlContent($this->getDoctype().'<html><head><base href="http://symfony.com"></head><a href="/contact"></a></html>', 'UTF-8');
113110

114111
$this->assertEquals('http://symfony.com', $crawler->filterXPath('//base')->attr('href'), '->addHtmlContent() adds nodes from an HTML string');
115112
$this->assertEquals('http://symfony.com/contact', $crawler->filterXPath('//a')->link()->getUri(), '->addHtmlContent() adds nodes from an HTML string');
@@ -121,15 +118,15 @@ public function testAddHtmlContentWithBaseTag()
121118
public function testAddHtmlContentCharset()
122119
{
123120
$crawler = $this->createCrawler();
124-
$crawler->addHtmlContent('<html><div class="foo">Tiếng Việt</html>', 'UTF-8');
121+
$crawler->addHtmlContent($this->getDoctype().'<html><div class="foo">Tiếng Việt</html>', 'UTF-8');
125122

126123
$this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text());
127124
}
128125

129126
public function testAddHtmlContentInvalidBaseTag()
130127
{
131128
$crawler = $this->createCrawler(null, 'http://symfony.com');
132-
$crawler->addHtmlContent('<html><head><base target="_top"></head><a href="/contact"></a></html>', 'UTF-8');
129+
$crawler->addHtmlContent($this->getDoctype().'<html><head><base target="_top"></head><a href="/contact"></a></html>', 'UTF-8');
133130

134131
$this->assertEquals('http://symfony.com/contact', current($crawler->filterXPath('//a')->links())->getUri(), '->addHtmlContent() correctly handles a non-existent base tag href attribute');
135132
}
@@ -141,55 +138,55 @@ public function testAddHtmlContentCharsetGbk()
141138
{
142139
$crawler = $this->createCrawler();
143140
//gbk encode of <html><p>中文</p></html>
144-
$crawler->addHtmlContent(base64_decode('PGh0bWw+PHA+1tDOxDwvcD48L2h0bWw+'), 'gbk');
141+
$crawler->addHtmlContent($this->getDoctype().base64_decode('PGh0bWw+PHA+1tDOxDwvcD48L2h0bWw+'), 'gbk');
145142

146143
$this->assertEquals('中文', $crawler->filterXPath('//p')->text());
147144
}
148145

149146
public function testAddXmlContent()
150147
{
151148
$crawler = $this->createCrawler();
152-
$crawler->addXmlContent('<html><div class="foo"></div></html>', 'UTF-8');
149+
$crawler->addXmlContent($this->getDoctype().'<html><div class="foo"></div></html>', 'UTF-8');
153150

154151
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addXmlContent() adds nodes from an XML string');
155152
}
156153

157154
public function testAddXmlContentCharset()
158155
{
159156
$crawler = $this->createCrawler();
160-
$crawler->addXmlContent('<html><div class="foo">Tiếng Việt</div></html>', 'UTF-8');
157+
$crawler->addXmlContent($this->getDoctype().'<html><div class="foo">Tiếng Việt</div></html>', 'UTF-8');
161158

162159
$this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text());
163160
}
164161

165162
public function testAddContent()
166163
{
167164
$crawler = $this->createCrawler();
168-
$crawler->addContent('<html><div class="foo"></html>', 'text/html; charset=UTF-8');
165+
$crawler->addContent($this->getDoctype().'<html><div class="foo"></html>', 'text/html; charset=UTF-8');
169166
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an HTML string');
170167

171168
$crawler = $this->createCrawler();
172-
$crawler->addContent('<html><div class="foo"></html>', 'text/html; charset=UTF-8; dir=RTL');
169+
$crawler->addContent($this->getDoctype().'<html><div class="foo"></html>', 'text/html; charset=UTF-8; dir=RTL');
173170
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an HTML string with extended content type');
174171

175172
$crawler = $this->createCrawler();
176-
$crawler->addContent('<html><div class="foo"></html>');
173+
$crawler->addContent($this->getDoctype().'<html><div class="foo"></html>');
177174
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() uses text/html as the default type');
178175

179176
$crawler = $this->createCrawler();
180-
$crawler->addContent('<html><div class="foo"></div></html>', 'text/xml; charset=UTF-8');
177+
$crawler->addContent($this->getDoctype().'<html><div class="foo"></div></html>', 'text/xml; charset=UTF-8');
181178
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an XML string');
182179

183180
$crawler = $this->createCrawler();
184-
$crawler->addContent('<html><div class="foo"></div></html>', 'text/xml');
181+
$crawler->addContent($this->getDoctype().'<html><div class="foo"></div></html>', 'text/xml');
185182
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an XML string');
186183

187184
$crawler = $this->createCrawler();
188185
$crawler->addContent('foo bar', 'text/plain');
189186
$this->assertCount(0, $crawler, '->addContent() does nothing if the type is not (x|ht)ml');
190187

191188
$crawler = $this->createCrawler();
192-
$crawler->addContent('<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
189+
$crawler->addContent($this->getDoctype().'<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
193190
$this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');
194191
}
195192

@@ -199,7 +196,7 @@ public function testAddContent()
199196
public function testAddContentNonUtf8()
200197
{
201198
$crawler = $this->createCrawler();
202-
$crawler->addContent(iconv('UTF-8', 'SJIS', '<html><head><meta charset="Shift_JIS"></head><body>日本語</body></html>'));
199+
$crawler->addContent(iconv('UTF-8', 'SJIS', $this->getDoctype().'<html><head><meta charset="Shift_JIS"></head><body>日本語</body></html>'));
203200
$this->assertEquals('日本語', $crawler->filterXPath('//body')->text(), '->addContent() can recognize "Shift_JIS" in html5 meta charset tag');
204201
}
205202

@@ -314,7 +311,7 @@ public function testAttr()
314311
public function testMissingAttrValueIsNull()
315312
{
316313
$crawler = $this->createCrawler();
317-
$crawler->addContent('<html><div non-empty-attr="sample value" empty-attr=""></div></html>', 'text/html; charset=UTF-8');
314+
$crawler->addContent($this->getDoctype().'<html><div non-empty-attr="sample value" empty-attr=""></div></html>', 'text/html; charset=UTF-8');
318315
$div = $crawler->filterXPath('//div');
319316

320317
$this->assertEquals('sample value', $div->attr('non-empty-attr'), '->attr() reads non-empty attributes correctly');
@@ -670,7 +667,6 @@ public function testSelectButton()
670667
public function testSelectButtonWithSingleQuotesInNameAttribute()
671668
{
672669
$html = <<<'HTML'
673-
<!DOCTYPE html>
674670
<html lang="en">
675671
<body>
676672
<div id="action">
@@ -683,15 +679,14 @@ public function testSelectButtonWithSingleQuotesInNameAttribute()
683679
</html>
684680
HTML;
685681

686-
$crawler = $this->createCrawler($html);
682+
$crawler = $this->createCrawler($this->getDoctype().$html);
687683

688684
$this->assertCount(1, $crawler->selectButton('Click \'Here\''));
689685
}
690686

691687
public function testSelectButtonWithDoubleQuotesInNameAttribute()
692688
{
693689
$html = <<<'HTML'
694-
<!DOCTYPE html>
695690
<html lang="en">
696691
<body>
697692
<div id="action">
@@ -704,7 +699,7 @@ public function testSelectButtonWithDoubleQuotesInNameAttribute()
704699
</html>
705700
HTML;
706701

707-
$crawler = $this->createCrawler($html);
702+
$crawler = $this->createCrawler($this->getDoctype().$html);
708703

709704
$this->assertCount(1, $crawler->selectButton('Click "Here"'));
710705
}
@@ -763,7 +758,6 @@ public function testImage()
763758
public function testSelectLinkAndLinkFiltered()
764759
{
765760
$html = <<<'HTML'
766-
<!DOCTYPE html>
767761
<html lang="en">
768762
<body>
769763
<div id="action">
@@ -776,7 +770,7 @@ public function testSelectLinkAndLinkFiltered()
776770
</html>
777771
HTML;
778772

779-
$crawler = $this->createCrawler($html);
773+
$crawler = $this->createCrawler($this->getDoctype().$html);
780774
$filtered = $crawler->filterXPath("descendant-or-self::*[@id = 'login-form']");
781775

782776
$this->assertCount(0, $filtered->selectLink('Login'));
@@ -793,7 +787,7 @@ public function testSelectLinkAndLinkFiltered()
793787

794788
public function testChaining()
795789
{
796-
$crawler = $this->createCrawler('<div name="a"><div name="b"><div name="c"></div></div></div>');
790+
$crawler = $this->createCrawler($this->getDoctype().'<div name="a"><div name="b"><div name="c"></div></div></div>');
797791

798792
$this->assertEquals('a', $crawler->filterXPath('//div')->filterXPath('div')->filterXPath('div')->attr('name'));
799793
}
@@ -965,7 +959,6 @@ public function testChildren()
965959
public function testFilteredChildren()
966960
{
967961
$html = <<<'HTML'
968-
<!DOCTYPE html>
969962
<html lang="en">
970963
<body>
971964
<div id="foo">
@@ -981,7 +974,7 @@ public function testFilteredChildren()
981974
</html>
982975
HTML;
983976

984-
$crawler = $this->createCrawler($html);
977+
$crawler = $this->createCrawler($this->getDoctype().$html);
985978
$foo = $crawler->filter('#foo');
986979

987980
$this->assertEquals(3, $foo->children()->count());
@@ -1018,7 +1011,7 @@ public function testParents()
10181011
*/
10191012
public function testBaseTag($baseValue, $linkValue, $expectedUri, $currentUri = null, $description = '')
10201013
{
1021-
$crawler = $this->createCrawler('<html><base href="'.$baseValue.'"><a href="'.$linkValue.'"></a></html>', $currentUri);
1014+
$crawler = $this->createCrawler($this->getDoctype().'<html><base href="'.$baseValue.'"><a href="'.$linkValue.'"></a></html>', $currentUri);
10221015
$this->assertEquals($expectedUri, $crawler->filterXPath('//a')->link()->getUri(), $description);
10231016
}
10241017

@@ -1038,7 +1031,7 @@ public function getBaseTagData()
10381031
*/
10391032
public function testBaseTagWithForm($baseValue, $actionValue, $expectedUri, $currentUri = null, $description = null)
10401033
{
1041-
$crawler = $this->createCrawler('<html><base href="'.$baseValue.'"><form method="post" action="'.$actionValue.'"><button type="submit" name="submit"/></form></html>', $currentUri);
1034+
$crawler = $this->createCrawler($this->getDoctype().'<html><base href="'.$baseValue.'"><form method="post" action="'.$actionValue.'"><button type="submit" name="submit"/></form></html>', $currentUri);
10421035
$this->assertEquals($expectedUri, $crawler->filterXPath('//button')->form()->getUri(), $description);
10431036
}
10441037

@@ -1113,7 +1106,7 @@ public function testEvaluateThrowsAnExceptionIfDocumentIsEmpty()
11131106
public function testInheritedClassCallChildrenWithoutArgument()
11141107
{
11151108
$dom = new \DOMDocument();
1116-
$dom->loadHTML('
1109+
$dom->loadHTML($this->getDoctype().'
11171110
<html>
11181111
<body>
11191112
<a href="foo">Foo</a>
@@ -1165,15 +1158,15 @@ public function testInheritedClassCallChildrenWithoutArgument()
11651158
public function testAddHtmlContentUnsupportedCharset()
11661159
{
11671160
$crawler = $this->createCrawler();
1168-
$crawler->addHtmlContent(file_get_contents(__DIR__.'/Fixtures/windows-1250.html'), 'Windows-1250');
1161+
$crawler->addHtmlContent($this->getDoctype().file_get_contents(__DIR__.'/Fixtures/windows-1250.html'), 'Windows-1250');
11691162

11701163
$this->assertEquals('Žťčýů', $crawler->filterXPath('//p')->text());
11711164
}
11721165

11731166
public function createTestCrawler($uri = null)
11741167
{
11751168
$dom = new \DOMDocument();
1176-
$dom->loadHTML('
1169+
$dom->loadHTML($this->getDoctype().'
11771170
<html>
11781171
<body>
11791172
<a href="foo">Foo</a>

‎src/Symfony/Component/DomCrawler/Tests/Html5ParserCrawlerTest.php

Copy file name to clipboard | Expand all lines: src/Symfony/Component/DomCrawler/Tests/Html5ParserCrawlerTest.php
+10 -4 | Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -11,12 +11,18 @@
1111

1212
namespace Symfony\Component\DomCrawler\Tests;
1313

14-
use Symfony\Component\DomCrawler\Crawler;
15-
1614
class Html5ParserCrawlerTest extends AbstractCrawlerTest
1715
{
18-
public function createCrawler($node = null, string $uri = null, string $baseHref = null)
16+
public function getDoctype(): string
17+
{
18+
return '<!DOCTYPE html>';
19+
}
20+
21+
public function testAddHtml5()
1922
{
20-
return new Crawler($node, $uri, $baseHref, true);
23+
// Ensure a bug specific to the DOM extension is fixed (see https://github.com/symfony/symfony/issues/28596)
24+
$crawler = $this->createCrawler();
25+
$crawler->add($this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>');
26+
$this->assertEquals('Foo', $crawler->filterXPath('//h1')->text(), '->add() adds nodes from a string');
2127
}
2228
}

‎src/Symfony/Component/DomCrawler/Tests/NativeParserCrawlerTest.php

Copy file name to clipboard | Expand all lines: src/Symfony/Component/DomCrawler/Tests/NativeParserCrawlerTest.php
+4 -6 | Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -11,13 +11,11 @@
1111

1212
namespace Symfony\Component\DomCrawler\Tests;
1313

14-
use Symfony\Component\DomCrawler\Crawler;
15-
1614
class NativeParserCrawlerTest extends AbstractCrawlerTest
1715
{
18-
public function createCrawler($node = null, string $uri = null, string $baseHref = null)
16+
public function getDoctype(): string
1917
{
20-
return new Crawler($node, $uri, $baseHref, false);
18+
return '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
2119
}
2220

2321
public function testAddHtmlContentWithErrors()
@@ -26,7 +24,7 @@ public function testAddHtmlContentWithErrors()
2624

2725
$crawler = $this->createCrawler();
2826
$crawler->addHtmlContent(<<<'EOF'
29-
<!DOCTYPE html>
27+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3028
<html>
3129
<head>
3230
</head>
@@ -51,7 +49,7 @@ public function testAddXmlContentWithErrors()
5149

5250
$crawler = $this->createCrawler();
5351
$crawler->addXmlContent(<<<'EOF'
54-
<!DOCTYPE html>
52+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
5553
<html>
5654
<head>
5755
</head>

0 commit comments

Comments
0 (0)
Morty Proxy: This is a proxified and sanitized view of the page; visit the original site.