Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 9b69b08

Browse files
committed
feature #35415 Extracted code to expand an URI to UriExpander (lyrixx)
This PR was merged into the 5.1-dev branch. Discussion ---------- Extracted code to expand an URI to `UriExpander` | Q | A | ------------- | --- | Branch? | master | Bug fix? | no | New feature? | yes | Deprecations? | no | Tickets | | License | MIT | Doc PR | When building a crawler, we need to extract and expand all links on a web page. ATM, we need to create a DomDocument, attach the href, and ask for the full URL. This is a bit slow and unnecessary. This is why I extracted the minimal code to expand the URL into its own trait for better reusability. I benched (a specific part of) my application: * before: 2.16ms * after: 1.42ms Commits ------- 0c499c6 Extracted code to expand an URI to `UriExpanderTrait`
2 parents ef30ef5 + 0c499c6 commit 9b69b08
Copy full SHA for 9b69b08

File tree

4 files changed

+223
-72
lines changed
Filter options

4 files changed

+223
-72
lines changed

‎src/Symfony/Component/DomCrawler/AbstractUriElement.php

Copy file name to clipboardExpand all lines: src/Symfony/Component/DomCrawler/AbstractUriElement.php
+1-72Lines changed: 1 addition & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -80,46 +80,7 @@ public function getMethod()
8080
*/
8181
public function getUri()
8282
{
83-
$uri = trim($this->getRawUri());
84-
85-
// absolute URL?
86-
if (null !== parse_url($uri, PHP_URL_SCHEME)) {
87-
return $uri;
88-
}
89-
90-
// empty URI
91-
if (!$uri) {
92-
return $this->currentUri;
93-
}
94-
95-
// an anchor
96-
if ('#' === $uri[0]) {
97-
return $this->cleanupAnchor($this->currentUri).$uri;
98-
}
99-
100-
$baseUri = $this->cleanupUri($this->currentUri);
101-
102-
if ('?' === $uri[0]) {
103-
return $baseUri.$uri;
104-
}
105-
106-
// absolute URL with relative schema
107-
if (0 === strpos($uri, '//')) {
108-
return preg_replace('#^([^/]*)//.*$#', '$1', $baseUri).$uri;
109-
}
110-
111-
$baseUri = preg_replace('#^(.*?//[^/]*)(?:\/.*)?$#', '$1', $baseUri);
112-
113-
// absolute path
114-
if ('/' === $uri[0]) {
115-
return $baseUri.$uri;
116-
}
117-
118-
// relative path
119-
$path = parse_url(substr($this->currentUri, \strlen($baseUri)), PHP_URL_PATH);
120-
$path = $this->canonicalizePath(substr($path, 0, strrpos($path, '/')).'/'.$uri);
121-
122-
return $baseUri.('' === $path || '/' !== $path[0] ? '/' : '').$path;
83+
return UriExpander::expand($this->getRawUri(), $this->currentUri);
12384
}
12485

12586
/**
@@ -167,36 +128,4 @@ protected function canonicalizePath(string $path)
167128
* @throws \LogicException If given node is not an anchor
168129
*/
169130
abstract protected function setNode(\DOMElement $node);
170-
171-
/**
172-
* Removes the query string and the anchor from the given uri.
173-
*/
174-
private function cleanupUri(string $uri): string
175-
{
176-
return $this->cleanupQuery($this->cleanupAnchor($uri));
177-
}
178-
179-
/**
180-
* Remove the query string from the uri.
181-
*/
182-
private function cleanupQuery(string $uri): string
183-
{
184-
if (false !== $pos = strpos($uri, '?')) {
185-
return substr($uri, 0, $pos);
186-
}
187-
188-
return $uri;
189-
}
190-
191-
/**
192-
* Remove the anchor from the uri.
193-
*/
194-
private function cleanupAnchor(string $uri): string
195-
{
196-
if (false !== $pos = strpos($uri, '#')) {
197-
return substr($uri, 0, $pos);
198-
}
199-
200-
return $uri;
201-
}
202131
}

‎src/Symfony/Component/DomCrawler/CHANGELOG.md

Copy file name to clipboardExpand all lines: src/Symfony/Component/DomCrawler/CHANGELOG.md
+1Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ CHANGELOG
55
-----
66

77
* Added an internal cache layer on top of the CssSelectorConverter
8+
* Added `UriExpander` to expand a URL relative to another URL
89

910
5.0.0
1011
-----
+86Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\Component\DomCrawler\Tests;
13+
14+
use PHPUnit\Framework\TestCase;
15+
use Symfony\Component\DomCrawler\UriExpander;
16+
17+
class UriExpanderTest extends TestCase
18+
{
19+
/**
20+
* @dataProvider provideExpandUriTests
21+
*/
22+
public function testExpandUri(string $uri, string $currentUri, string $expected)
23+
{
24+
$this->assertEquals($expected, UriExpander::expand($uri, $currentUri));
25+
}
26+
27+
public function provideExpandUriTests()
28+
{
29+
return [
30+
['/foo', 'http://localhost/bar/foo/', 'http://localhost/foo'],
31+
['/foo', 'http://localhost/bar/foo', 'http://localhost/foo'],
32+
['
33+
/foo', 'http://localhost/bar/foo/', 'http://localhost/foo'],
34+
['/foo
35+
', 'http://localhost/bar/foo', 'http://localhost/foo'],
36+
37+
['foo', 'http://localhost/bar/foo/', 'http://localhost/bar/foo/foo'],
38+
['foo', 'http://localhost/bar/foo', 'http://localhost/bar/foo'],
39+
40+
['', 'http://localhost/bar/', 'http://localhost/bar/'],
41+
['#', 'http://localhost/bar/', 'http://localhost/bar/#'],
42+
['#bar', 'http://localhost/bar?a=b', 'http://localhost/bar?a=b#bar'],
43+
['#bar', 'http://localhost/bar/#foo', 'http://localhost/bar/#bar'],
44+
['?a=b', 'http://localhost/bar#foo', 'http://localhost/bar?a=b'],
45+
['?a=b', 'http://localhost/bar/', 'http://localhost/bar/?a=b'],
46+
47+
['http://login.foo.com/foo', 'http://localhost/bar/', 'http://login.foo.com/foo'],
48+
['https://login.foo.com/foo', 'https://localhost/bar/', 'https://login.foo.com/foo'],
49+
['mailto:foo@bar.com', 'http://localhost/foo', 'mailto:foo@bar.com'],
50+
51+
// tests schema relative URL (issue #7169)
52+
['//login.foo.com/foo', 'http://localhost/bar/', 'http://login.foo.com/foo'],
53+
['//login.foo.com/foo', 'https://localhost/bar/', 'https://login.foo.com/foo'],
54+
55+
['?foo=2', 'http://localhost?foo=1', 'http://localhost?foo=2'],
56+
['?foo=2', 'http://localhost/?foo=1', 'http://localhost/?foo=2'],
57+
['?foo=2', 'http://localhost/bar?foo=1', 'http://localhost/bar?foo=2'],
58+
['?foo=2', 'http://localhost/bar/?foo=1', 'http://localhost/bar/?foo=2'],
59+
['?bar=2', 'http://localhost?foo=1', 'http://localhost?bar=2'],
60+
61+
['foo', 'http://login.foo.com/bar/baz?/query/string', 'http://login.foo.com/bar/foo'],
62+
63+
['.', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/'],
64+
['./', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/'],
65+
['./foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/foo'],
66+
['..', 'http://localhost/foo/bar/baz', 'http://localhost/foo/'],
67+
['../', 'http://localhost/foo/bar/baz', 'http://localhost/foo/'],
68+
['../foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo/foo'],
69+
['../..', 'http://localhost/foo/bar/baz', 'http://localhost/'],
70+
['../../', 'http://localhost/foo/bar/baz', 'http://localhost/'],
71+
['../../foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo'],
72+
['../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'],
73+
['../bar/../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'],
74+
['../bar/./../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'],
75+
['../../', 'http://localhost/', 'http://localhost/'],
76+
['../../', 'http://localhost', 'http://localhost/'],
77+
78+
['/foo', 'http://localhost?bar=1', 'http://localhost/foo'],
79+
['/foo', 'http://localhost#bar', 'http://localhost/foo'],
80+
['/foo', 'file:///', 'file:///foo'],
81+
['/foo', 'file:///bar/baz', 'file:///foo'],
82+
['foo', 'file:///', 'file:///foo'],
83+
['foo', 'file:///bar/baz', 'file:///bar/foo'],
84+
];
85+
}
86+
}
+135Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\Component\DomCrawler;
13+
14+
/**
15+
* Expands a URI according to a current URI.
16+
*
17+
* @author Fabien Potencier <fabien@symfony.com>
18+
* @author Grégoire Pineau <lyrixx@lyrixx.info>
19+
*/
20+
class UriExpander
21+
{
22+
/**
23+
* Expands a URI according to a current URI.
24+
*
25+
* For example if $uri=/foo/bar and $currentUri=https://symfony.com it will
26+
* return https://symfony.com/foo/bar
27+
*
28+
* If the $uri is not absolute you must pass an absolute $currentUri
29+
*/
30+
public static function expand(string $uri, ?string $currentUri): string
31+
{
32+
$uri = trim($uri);
33+
34+
// absolute URL?
35+
if (null !== parse_url($uri, PHP_URL_SCHEME)) {
36+
return $uri;
37+
}
38+
39+
if (null === $currentUri) {
40+
throw new \InvalidArgumentException('The URI is relative, so you must define its base URI passing an absolute URL.');
41+
}
42+
43+
// empty URI
44+
if (!$uri) {
45+
return $currentUri;
46+
}
47+
48+
// an anchor
49+
if ('#' === $uri[0]) {
50+
return self::cleanupAnchor($currentUri).$uri;
51+
}
52+
53+
$baseUri = self::cleanupUri($currentUri);
54+
55+
if ('?' === $uri[0]) {
56+
return $baseUri.$uri;
57+
}
58+
59+
// absolute URL with relative schema
60+
if (0 === strpos($uri, '//')) {
61+
return preg_replace('#^([^/]*)//.*$#', '$1', $baseUri).$uri;
62+
}
63+
64+
$baseUri = preg_replace('#^(.*?//[^/]*)(?:\/.*)?$#', '$1', $baseUri);
65+
66+
// absolute path
67+
if ('/' === $uri[0]) {
68+
return $baseUri.$uri;
69+
}
70+
71+
// relative path
72+
$path = parse_url(substr($currentUri, \strlen($baseUri)), PHP_URL_PATH);
73+
$path = self::canonicalizePath(substr($path, 0, strrpos($path, '/')).'/'.$uri);
74+
75+
return $baseUri.('' === $path || '/' !== $path[0] ? '/' : '').$path;
76+
}
77+
78+
/**
79+
* Returns the canonicalized URI path (see RFC 3986, section 5.2.4).
80+
*/
81+
private static function canonicalizePath(string $path): string
82+
{
83+
if ('' === $path || '/' === $path) {
84+
return $path;
85+
}
86+
87+
if ('.' === substr($path, -1)) {
88+
$path .= '/';
89+
}
90+
91+
$output = [];
92+
93+
foreach (explode('/', $path) as $segment) {
94+
if ('..' === $segment) {
95+
array_pop($output);
96+
} elseif ('.' !== $segment) {
97+
$output[] = $segment;
98+
}
99+
}
100+
101+
return implode('/', $output);
102+
}
103+
104+
/**
105+
* Removes the query string and the anchor from the given uri.
106+
*/
107+
private static function cleanupUri(string $uri): string
108+
{
109+
return self::cleanupQuery(self::cleanupAnchor($uri));
110+
}
111+
112+
/**
113+
* Removes the query string from the uri.
114+
*/
115+
private static function cleanupQuery(string $uri): string
116+
{
117+
if (false !== $pos = strpos($uri, '?')) {
118+
return substr($uri, 0, $pos);
119+
}
120+
121+
return $uri;
122+
}
123+
124+
/**
125+
* Removes the anchor from the uri.
126+
*/
127+
private static function cleanupAnchor(string $uri): string
128+
{
129+
if (false !== $pos = strpos($uri, '#')) {
130+
return substr($uri, 0, $pos);
131+
}
132+
133+
return $uri;
134+
}
135+
}

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.