Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 1183aca

Browse filesBrowse files
committed
feature #17585 [DomCrawler] Abstract URI logic and crawl images (valeriangalliat)
This PR was squashed before being merged into the 3.1-dev branch (closes #17585). Discussion ---------- [DomCrawler] Abstract URI logic and crawl images | Q | A | ------------- | --- | Bug fix? | no | New feature? | yes | BC breaks? | no | Deprecations? | no | Tests pass? | yes | Fixed tickets | #12429 | License | MIT | Doc PR | symfony/symfony-docs#4971 This is a backward-compatible version of #13620, and a rebase of #13649 on current `master`. Commits ------- 1553b07 [DomCrawler] Abstract URI logic and crawl images
2 parents ba25521 + 1553b07 commit 1183aca
Copy full SHA for 1183aca

File tree

Expand file treeCollapse file tree

7 files changed

+395
-193
lines changed
Filter options
Expand file treeCollapse file tree

7 files changed

+395
-193
lines changed
+212Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\Component\DomCrawler;
13+
14+
/**
15+
 * Any HTML element that can link to a URI.
16+
*
17+
* @author Fabien Potencier <fabien@symfony.com>
18+
*/
19+
abstract class AbstractUriElement
{
    /**
     * @var \DOMElement The DOM node wrapped by this element
     */
    protected $node;

    /**
     * @var string|null The upper-cased method to use for the element, or null when an empty method was given
     */
    protected $method;

    /**
     * @var string The URI of the page where the element is embedded (or the base href)
     */
    protected $currentUri;

    /**
     * @param \DOMElement $node       A \DOMElement instance
     * @param string      $currentUri The URI of the page where the element is embedded (or the base href)
     * @param string      $method     The method to use for the element (GET by default)
     *
     * @throws \InvalidArgumentException if the current URI does not start with "http" or "file"
     */
    public function __construct(\DOMElement $node, $currentUri, $method = 'GET')
    {
        // Only the first four characters are inspected, case-insensitively.
        if (!in_array(strtolower(substr($currentUri, 0, 4)), array('http', 'file'), true)) {
            throw new \InvalidArgumentException(sprintf('Current URI must be an absolute URL ("%s").', $currentUri));
        }

        // Node validation is delegated to the concrete subclass.
        $this->setNode($node);
        $this->method = $method ? strtoupper($method) : null;
        $this->currentUri = $currentUri;
    }

    /**
     * Gets the node associated with this element.
     *
     * @return \DOMElement A \DOMElement instance
     */
    public function getNode()
    {
        return $this->node;
    }

    /**
     * Gets the method associated with this element.
     *
     * @return string|null The method
     */
    public function getMethod()
    {
        return $this->method;
    }

    /**
     * Gets the absolute URI associated with this element.
     *
     * The raw URI returned by getRawUri() is trimmed and then resolved
     * against the current URI: URIs with a scheme are returned untouched,
     * an empty URI yields the current URI, and fragments, query strings,
     * scheme-relative URIs, absolute paths and relative paths are each
     * combined with the relevant part of the current URI.
     *
     * @return string The URI
     */
    public function getUri()
    {
        $uri = trim($this->getRawUri());

        // absolute URL?
        if (null !== parse_url($uri, PHP_URL_SCHEME)) {
            return $uri;
        }

        // empty URI; compared strictly so that "0" is still resolved as a relative path
        if ('' === $uri) {
            return $this->currentUri;
        }

        // an anchor
        if ('#' === $uri[0]) {
            return $this->cleanupAnchor($this->currentUri).$uri;
        }

        $baseUri = $this->cleanupUri($this->currentUri);

        if ('?' === $uri[0]) {
            return $baseUri.$uri;
        }

        // absolute URL with relative schema: keep only the "scheme:" part of the base
        if (0 === strpos($uri, '//')) {
            return preg_replace('#^([^/]*)//.*$#', '$1', $baseUri).$uri;
        }

        // reduce the base to "scheme://authority"
        $baseUri = preg_replace('#^(.*?//[^/]*)(?:\/.*)?$#', '$1', $baseUri);

        // absolute path
        if ('/' === $uri[0]) {
            return $baseUri.$uri;
        }

        // relative path: parse_url() yields null when the current URI has no path
        // component (e.g. "http://host?query"), hence the string cast
        $path = (string) parse_url(substr($this->currentUri, strlen($baseUri)), PHP_URL_PATH);
        $lastSlash = strrpos($path, '/');
        $directory = false === $lastSlash ? '' : substr($path, 0, $lastSlash);
        $path = $this->canonicalizePath($directory.'/'.$uri);

        // canonicalization may have consumed the leading slash (e.g. "/../foo")
        return $baseUri.('' === $path || '/' !== $path[0] ? '/' : '').$path;
    }

    /**
     * Returns raw URI data.
     *
     * @return string
     */
    abstract protected function getRawUri();

    /**
     * Returns the canonicalized URI path (see RFC 3986, section 5.2.4).
     *
     * "." segments are dropped and ".." segments remove the preceding
     * segment. A trailing "." or ".." segment is treated as a directory,
     * so the result ends with a slash in that case.
     *
     * @param string $path URI path
     *
     * @return string
     */
    protected function canonicalizePath($path)
    {
        if ('' === $path || '/' === $path) {
            return $path;
        }

        // Only a complete trailing "." or ".." segment denotes a directory;
        // a name that merely ends with a dot (e.g. "foo.") must be left alone.
        $lastSlash = strrpos($path, '/');
        $lastSegment = false === $lastSlash ? $path : (string) substr($path, $lastSlash + 1);
        if ('.' === $lastSegment || '..' === $lastSegment) {
            $path .= '/';
        }

        $output = array();

        foreach (explode('/', $path) as $segment) {
            if ('..' === $segment) {
                array_pop($output);
            } elseif ('.' !== $segment) {
                $output[] = $segment;
            }
        }

        return implode('/', $output);
    }

    /**
     * Sets current \DOMElement instance.
     *
     * @param \DOMElement $node A \DOMElement instance
     *
     * @throws \LogicException If given node is not of a type supported by the concrete element
     */
    abstract protected function setNode(\DOMElement $node);

    /**
     * Removes the query string and the anchor from the given uri.
     *
     * @param string $uri The uri to clean
     *
     * @return string
     */
    private function cleanupUri($uri)
    {
        return $this->cleanupQuery($this->cleanupAnchor($uri));
    }

    /**
     * Removes the query string (everything from the first "?") from the uri.
     *
     * @param string $uri
     *
     * @return string
     */
    private function cleanupQuery($uri)
    {
        if (false !== $pos = strpos($uri, '?')) {
            return substr($uri, 0, $pos);
        }

        return $uri;
    }

    /**
     * Removes the anchor (everything from the first "#") from the uri.
     *
     * @param string $uri
     *
     * @return string
     */
    private function cleanupAnchor($uri)
    {
        if (false !== $pos = strpos($uri, '#')) {
            return substr($uri, 0, $pos);
        }

        return $uri;
    }
}

‎src/Symfony/Component/DomCrawler/CHANGELOG.md

Copy file name to clipboardExpand all lines: src/Symfony/Component/DomCrawler/CHANGELOG.md
+6Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
CHANGELOG
22
=========
33

4+
3.1.0
5+
-----
6+
7+
* All the URI parsing logic has been abstracted in the `AbstractUriElement` class. The `Link` class is now a child of `AbstractUriElement` which implements the new `UriElementInterface`, describing the common `getNode`, `getMethod` and `getUri` methods.
8+
* Added an `Image` class to crawl images and parse their `src` attribute, and `selectImage`, `image`, `images` methods in `Crawler`, the image version of the equivalent `link` methods.
9+
410
2.5.0
511
-----
612

‎src/Symfony/Component/DomCrawler/Crawler.php

Copy file name to clipboardExpand all lines: src/Symfony/Component/DomCrawler/Crawler.php
+55-2Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,6 @@ class Crawler implements \Countable, \IteratorAggregate
5858
private $isHtml = true;
5959

6060
/**
61-
* Constructor.
62-
*
6361
* @param mixed $node A Node to use as the base for the crawling
6462
* @param string $currentUri The current URI
6563
* @param string $baseHref The base href value
@@ -668,6 +666,20 @@ public function selectLink($value)
668666
return $this->filterRelativeXPath($xpath);
669667
}
670668

669+
/**
670+
* Selects images by alt value.
671+
*
672+
* @param string $value The image alt
673+
*
674+
* @return Crawler A new instance of Crawler with the filtered list of nodes
675+
*/
676+
public function selectImage($value)
{
    // The searched text is run through xpathLiteral() before being embedded
    // in the expression, which matches <img> elements (context node included)
    // whose whitespace-normalized alt attribute contains that text.
    $altLiteral = static::xpathLiteral($value);

    return $this->filterRelativeXPath(
        sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', $altLiteral)
    );
}
682+
671683
/**
672684
* Selects a button by name or alt value for images.
673685
*
@@ -730,6 +742,47 @@ public function links()
730742
return $links;
731743
}
732744

745+
/**
746+
* Returns an Image object for the first node in the list.
747+
*
748+
* @return Image An Image instance
749+
*
750+
* @throws \InvalidArgumentException If the current node list is empty
751+
*/
752+
public function image()
{
    // An Image can only be built from the first node, so the list must not be empty.
    if (0 === count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $first = $this->getNode(0);

    // Guard clause inverted: the happy path returns early, anything else is rejected.
    if ($first instanceof \DOMElement) {
        // The image URI is resolved against the crawler's base href.
        return new Image($first, $this->baseHref);
    }

    throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($first)));
}
766+
767+
/**
768+
* Returns an array of Image objects for the nodes in the list.
769+
*
770+
* @return Image[] An array of Image instances
771+
*/
772+
public function images()
{
    $collected = array();

    foreach ($this as $element) {
        // Every node is wrapped in an Image resolved against the base href;
        // the first non-element node aborts the whole operation.
        if ($element instanceof \DOMElement) {
            $collected[] = new Image($element, $this->baseHref);

            continue;
        }

        throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($element)));
    }

    return $collected;
}
785+
733786
/**
734787
* Returns a Form object for the first node in the list.
735788
*
+37Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\Component\DomCrawler;
13+
14+
/**
15+
* Image represents an HTML image (an HTML img tag).
16+
*/
17+
class Image extends AbstractUriElement
{
    /**
     * Images are always associated with the GET method.
     *
     * @param \DOMElement $node       The img element to wrap
     * @param string      $currentUri The URI the src attribute is resolved against
     */
    public function __construct(\DOMElement $node, $currentUri)
    {
        parent::__construct($node, $currentUri, 'GET');
    }

    /**
     * Returns the unresolved value of the src attribute.
     *
     * @return string
     */
    protected function getRawUri()
    {
        $image = $this->node;

        return $image->getAttribute('src');
    }

    /**
     * Stores the node after checking that it is an img element.
     *
     * @param \DOMElement $node A \DOMElement instance
     *
     * @throws \LogicException If the node name is not "img"
     */
    protected function setNode(\DOMElement $node)
    {
        $tagName = $node->nodeName;

        if ('img' === $tagName) {
            $this->node = $node;

            return;
        }

        throw new \LogicException(sprintf('Unable to visualize a "%s" tag.', $tagName));
    }
}

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.