@@ -170,7 +170,7 @@ public function addHtmlContent($content, $charset = 'UTF-8')
170
170
171
171
$ this ->addDocument ($ dom );
172
172
173
- $ base = $ this ->filterXPath ('descendant-or-self::base ' )->extract (array ('href ' ));
173
+ $ base = $ this ->filterRelativeXPath ('descendant-or-self::base ' )->extract (array ('href ' ));
174
174
175
175
$ baseHref = current ($ base );
176
176
if (count ($ base ) && !empty ($ baseHref )) {
@@ -580,6 +580,11 @@ public function extract($attributes)
580
580
/**
581
581
* Filters the list of nodes with an XPath expression.
582
582
*
583
+ * The XPath expression is evaluated in the context of the crawler, which
584
+ * is considered as a fake parent of the elements inside it.
585
+ * This means that a child selector "div" or "./div" will match only
586
+ * the div elements of the current crawler, not their children.
587
+ *
583
588
* @param string $xpath An XPath expression
584
589
*
585
590
* @return Crawler A new instance of Crawler with the filtered list of nodes
@@ -588,14 +593,14 @@ public function extract($attributes)
588
593
*/
589
594
public function filterXPath($xpath)
{
    // Rewrite the expression so that it can be evaluated with each node of the crawler as context.
    $relativeXPath = $this->relativize($xpath);

    // If we dropped all expressions in the XPath while preparing it, there would be no match:
    // answer with an empty crawler instead of querying the nodes.
    return '' === $relativeXPath
        ? new static(null, $this->uri)
        : $this->filterRelativeXPath($relativeXPath);
}
600
605
601
606
/**
@@ -619,7 +624,8 @@ public function filter($selector)
619
624
// @codeCoverageIgnoreEnd
620
625
}
621
626
622
- return $ this ->filterXPath (CssSelector::toXPath ($ selector ));
627
+ // The CssSelector already prefixes the selector with descendant-or-self::
628
+ return $ this ->filterRelativeXPath (CssSelector::toXPath ($ selector ));
623
629
}
624
630
625
631
/**
@@ -633,10 +639,10 @@ public function filter($selector)
633
639
*/
634
640
public function selectLink ($ value )
635
641
{
// Builds an XPath selecting links by their text or by the alt text of an image they contain, then filters with it.
// $value is padded with spaces and quoted through xpathLiteral(); the text under test is padded the same way
// (concat(' ', normalize-space(...), ' ')), so $value only matches as a whole whitespace-delimited token.
// Removed lines ("-"): absolute "//a" selectors, with the image case climbing back to the link via ancestor::a.
// Added lines ("+"): a single "descendant-or-self::a" step whose predicate accepts either the link text
// or a direct child img (./img) with a matching alt attribute; the result goes through filterRelativeXPath().
636
- $ xpath = sprintf ('// a[contains(concat( \' \', normalize-space(string(.)), \' \'), %s)] ' , static ::xpathLiteral (' ' .$ value .' ' )).
637
- sprintf ('| //a/ img[contains(concat( \' \', normalize-space(string(@alt)), \' \'), %s)]/ancestor::a ' , static ::xpathLiteral (' ' .$ value .' ' ));
642
+ $ xpath = sprintf ('descendant-or-self:: a[contains(concat( \' \', normalize-space(string(.)), \' \'), %s) ' , static ::xpathLiteral (' ' .$ value .' ' )).
643
+ sprintf ('or ./ img[contains(concat( \' \', normalize-space(string(@alt)), \' \'), %s)]] ' , static ::xpathLiteral (' ' .$ value .' ' ));
638
644
639
- return $ this ->filterXPath ($ xpath );
645
+ return $ this ->filterRelativeXPath ($ xpath );
640
646
}
641
647
642
648
/**
@@ -651,11 +657,11 @@ public function selectLink($value)
651
657
public function selectButton($value)
{
    // Lower-cases @type inside the XPath so the type checks below are case-insensitive.
    $translate = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';

    // Space-padded literal: compared against space-padded, normalized text so that the value
    // only matches as a whole whitespace-delimited token.
    $paddedLiteral = static::xpathLiteral(' '.$value.' ');

    // The @id/@name comparisons also go through xpathLiteral(): interpolating the raw value
    // between double quotes produced a broken (or altered) expression when it contained a quote.
    $exactLiteral = static::xpathLiteral($value);

    // Submit/button inputs matched on @value, image inputs matched on @alt, or any input matched on id/name...
    $xpath = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, $paddedLiteral).
        sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $translate, $paddedLiteral, $exactLiteral, $exactLiteral).
        // ...united with <button> elements matched on their text content, id or name.
        sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $paddedLiteral, $exactLiteral, $exactLiteral);

    // The expression is already written relative to each node, so no further rewriting is needed.
    return $this->filterRelativeXPath($xpath);
}
660
666
661
667
/**
@@ -771,6 +777,88 @@ public static function xpathLiteral($s)
771
777
return sprintf ("concat(%s) " , implode ($ parts , ', ' ));
772
778
}
773
779
780
/**
 * Evaluates an XPath expression against every node of this crawler.
 *
 * The expression is used as-is, with each node in turn as the context node, so it
 * must already have been prepared for that kind of evaluation.
 *
 * @param string $xpath
 *
 * @return Crawler
 */
private function filterRelativeXPath($xpath)
{
    $matched = new static(null, $this->uri);

    foreach ($this as $contextNode) {
        // One evaluator per node, bound to the document owning that node.
        $evaluator = new \DOMXPath($contextNode->ownerDocument);
        $nodeList = $evaluator->query($xpath, $contextNode);

        $matched->add($nodeList);
    }

    return $matched;
}
800
+
801
/**
 * Makes the XPath relative to the current context.
 *
 * The returned XPath will match elements matching the XPath inside the current crawler
 * when running in the context of a node of the crawler.
 *
 * Every member of a top-level union is rewritten on its own and the members are joined
 * back with " | ". Members which cannot match anything inside the crawler are replaced
 * by an expression that never matches.
 *
 * @param string $xpath
 *
 * @return string
 */
private function relativize($xpath)
{
    // An expression which will never match, used in place of expressions which cannot match in the crawler.
    // Such expressions cannot simply be dropped: the parenthesis prefix preserved below would be
    // left without an operand.
    $nonMatchingExpression = 'a[name() = "b"]';

    // Split any unions into individual expressions. Only a "|" found outside of predicates
    // and outside of string literals separates two members.
    $members = array();
    $depth = 0;
    $start = 0;
    $length = strlen($xpath);

    for ($i = 0; $i < $length; ++$i) {
        // Jump straight to the next character which can change the scanning state.
        $i += strcspn($xpath, '"\'[]|', $i);
        if ($i >= $length) {
            break;
        }

        $char = $xpath[$i];
        if ('"' === $char || "'" === $char) {
            $closing = strpos($xpath, $char, $i + 1);
            if (false === $closing) {
                // Unterminated literal: the expression is invalid, keep the remainder in one piece.
                break;
            }
            $i = $closing;
        } elseif ('[' === $char) {
            ++$depth;
        } elseif (']' === $char) {
            $depth = max(0, $depth - 1);
        } elseif (0 === $depth) {
            // A "|" at the top level ends the current member.
            $members[] = (string) substr($xpath, $start, $i - $start);
            $start = $i + 1;
        }
    }
    // substr() returns false instead of '' on older PHP versions, hence the casts.
    $members[] = (string) substr($xpath, $start);

    $expressions = array();

    foreach ($members as $expression) {
        $expression = trim($expression);
        $parenthesis = '';

        // If the union is inside some parentheses, we need to preserve the opening parentheses
        // and apply the change only inside them. "*" is deliberately not part of the prefix:
        // it is a valid name test and belongs to the expression itself.
        if (preg_match('/^[(\s]+/', $expression, $matches)) {
            $parenthesis = $matches[0];
            $expression = (string) substr($expression, strlen($parenthesis));
        }

        // BC for Symfony 2.4 and lower where elements were added in a fake _root parent
        if (0 === strpos($expression, '/_root/')) {
            $expression = './'.substr($expression, 7);
        }

        // add prefix before absolute element selector
        if (empty($expression)) {
            $expression = $nonMatchingExpression;
        } elseif (0 === strpos($expression, '//')) {
            $expression = 'descendant-or-self::'.substr($expression, 2);
        } elseif (0 === strpos($expression, './/')) {
            // descendants of the fake parent are the nodes of the crawler and their descendants
            $expression = 'descendant-or-self::'.substr($expression, 3);
        } elseif (0 === strpos($expression, './')) {
            // children of the fake parent are the nodes of the crawler themselves
            $expression = 'self::'.substr($expression, 2);
        } elseif ('/' === $expression[0]) {
            // the only direct child in Symfony 2.4 and lower is _root, which is already handled previously
            // so let's drop the expression entirely
            $expression = $nonMatchingExpression;
        } elseif ('.' === $expression[0]) {
            // '.' is the fake root element in Symfony 2.4 and lower, which is excluded from results
            $expression = $nonMatchingExpression;
        } elseif (0 === strpos($expression, 'descendant::')) {
            $expression = 'descendant-or-self::'.substr($expression, strlen('descendant::'));
        } elseif (0 !== strpos($expression, 'descendant-or-self::')) {
            $expression = 'self::'.$expression;
        }

        $expressions[] = $parenthesis.$expression;
    }

    return implode(' | ', $expressions);
}
861
+
774
862
/**
775
863
* @param int $position
776
864
*
0 commit comments