|
1 |
| -# No Change in README |
| 1 | +# Web Scraping With PHP |
| 2 | + |
| 3 | +[<img src="https://img.shields.io/static/v1?label=&message=PHP&color=brightgreen" />](https://github.com/topics/php) [<img src="https://img.shields.io/static/v1?label=&message=Web%20Scraping&color=important" />](https://github.com/topics/web-scraping) |
| 4 | + |
| 5 | +- [Installing Prerequisites](#installing-prerequisites) |
| 6 | +- [Making an HTTP GET request](#making-an-http-get-request) |
| 7 | +- [Web scraping in PHP with Goutte](#web-scraping-in-php-with-goutte) |
| 8 | +- [Web scraping with Symfony Panther](#web-scraping-with-symfony-panther) |
| 9 | + |
| 10 | +PHP is a general-purpose scripting language and one of the most popular options for web development. For example, WordPress, the most popular content management system for creating websites, is built using PHP.
| 11 | + |
| 12 | +PHP offers various building blocks required to build a web scraper, although it can quickly become an increasingly complicated task. Conveniently, there are many open-source libraries that can make web scraping with PHP more accessible. |
| 13 | + |
| 14 | +This article will guide you through the step-by-step process of writing various PHP web scraping routines that can extract public data from static and dynamic web pages.
| 15 | + |
| 16 | +For a detailed explanation, see our [blog post](https://oxy.yt/Jr3d). |
| 17 | + |
| 18 | +## Installing Prerequisites |
| 19 | + |
| 20 | +```sh |
| 21 | +# Windows |
| 22 | +choco install php |
| 23 | +choco install composer |
| 24 | +``` |
| 25 | + |
| 26 | +or |
| 27 | + |
| 28 | +```sh |
| 29 | +# macOS |
| 30 | +brew install php |
| 31 | +brew install composer |
| 32 | +``` |
| 33 | + |
| 34 | +## Making an HTTP GET request |
| 35 | + |
| 36 | +```php |
| 37 | +<?php |
| 38 | +$html = file_get_contents('https://books.toscrape.com/'); |
| 39 | +echo $html; |
| 40 | +``` |
| 41 | + |
| 42 | +## Web scraping in PHP with Goutte |
| 43 | + |
| 44 | +```sh |
| 45 | +composer init --no-interaction --require="php >=7.1" |
| 46 | +composer require fabpot/goutte |
| 47 | +composer update |
| 48 | +``` |
| 49 | + |
| 50 | +```php |
| 51 | +<?php |
| 52 | +require 'vendor/autoload.php'; |
| 53 | +use Goutte\Client; |
| 54 | +$client = new Client(); |
| 55 | +$crawler = $client->request('GET', 'https://books.toscrape.com'); |
| 56 | +echo $crawler->html(); |
| 57 | +``` |
| 58 | + |
| 59 | +### Locating HTML elements via CSS Selectors |
| 60 | + |
| 61 | +```php |
| 62 | +echo $crawler->filter('title')->text(); //CSS |
| 63 | +echo $crawler->filterXPath('//title')->text(); //XPath |
| 64 | +``` |
| 65 | + |
| 66 | +### Extracting the elements |
| 67 | + |
| 68 | +```php |
| 69 | +function scrapePage($url, $client){ |
| 70 | + $crawler = $client->request('GET', $url); |
| 71 | + $crawler->filter('.product_pod')->each(function ($node) { |
| 72 | + $title = $node->filter('.image_container img')->attr('alt'); |
| 73 | + $price = $node->filter('.price_color')->text(); |
| 74 | + echo $title . "-" . $price . PHP_EOL; |
| 75 | + }); |
| 76 | + } |
| 77 | +``` |
| 78 | + |
| 79 | + |
| 80 | + |
| 81 | +### Handling pagination |
| 82 | + |
| 83 | +```php |
| 84 | +function scrapePage($url, $client, $file) |
| 85 | +{ |
| 86 | + //... |
| 87 | + // Handling Pagination |
| 88 | + try { |
| 89 | + $next_page = $crawler->filter('.next > a')->attr('href'); |
| 90 | + } catch (InvalidArgumentException) { //Next page not found |
| 91 | + return null; |
| 92 | + } |
| 93 | + return "https://books.toscrape.com/catalogue/" . $next_page; |
| 94 | +} |
| 95 | +``` |
| 96 | + |
| 97 | +### Writing Data to CSV |
| 98 | + |
| 99 | +```php |
| 100 | +function scrapePage($url, $client, $file) |
| 101 | +{ |
| 102 | + $crawler = $client->request('GET', $url); |
| 103 | + $crawler->filter('.product_pod')->each(function ($node) use ($file) { |
| 104 | + $title = $node->filter('.image_container img')->attr('alt'); |
| 105 | + $price = $node->filter('.price_color')->text(); |
| 106 | + fputcsv($file, [$title, $price]); |
| 107 | + }); |
| 108 | + try { |
| 109 | + $next_page = $crawler->filter('.next > a')->attr('href'); |
| 110 | + } catch (InvalidArgumentException) { //Next page not found |
| 111 | + return null; |
| 112 | + } |
| 113 | + return "https://books.toscrape.com/catalogue/" . $next_page; |
| 114 | +} |
| 115 | +$client = new Client(); |
| 116 | +$file = fopen("books.csv", "a"); |
| 117 | +$nextUrl = "https://books.toscrape.com/catalogue/page-1.html"; |
| 118 | +while ($nextUrl) { |
| 119 | + echo "<h2>" . $nextUrl . "</h2>" . PHP_EOL; |
| 120 | + $nextUrl = scrapePage($nextUrl, $client, $file); |
| 121 | +} |
| 122 | +fclose($file); |
| 123 | +``` |
| 124 | + |
| 125 | + |
| 126 | + |
| 127 | +## Web scraping with Symfony Panther |
| 128 | + |
| 129 | +```sh |
| 130 | +composer init --no-interaction --require="php >=7.1" |
| 131 | +composer require symfony/panther |
| 132 | +composer update |
| 133 | +brew install --cask chromedriver
| 134 | +``` |
| 135 | + |
| 136 | +### Sending HTTP requests with Panther |
| 137 | + |
| 138 | +```php |
| 139 | +<?php |
| 140 | +require 'vendor/autoload.php'; |
| 141 | +use \Symfony\Component\Panther\Client; |
| 142 | +$client = Client::createChromeClient(); |
| 143 | +$client->get('https://quotes.toscrape.com/js/'); |
| 144 | +``` |
| 145 | + |
| 146 | +### Locating HTML elements via CSS Selectors |
| 147 | + |
| 148 | +```php |
| 149 | + $crawler = $client->waitFor('.quote'); |
| 150 | + $crawler->filter('.quote')->each(function ($node) { |
| 151 | + $author = $node->filter('.author')->text(); |
| 152 | + $quote = $node->filter('.text')->text(); |
| 153 | +        echo $author . " - " . $quote . PHP_EOL;
| 154 | + }); |
| 155 | +``` |
| 156 | + |
| 157 | +### Handling pagination |
| 158 | + |
| 159 | +```php |
| 160 | +while (true) { |
| 161 | + $crawler = $client->waitFor('.quote'); |
| 162 | +… |
| 163 | + try { |
| 164 | + $client->clickLink('Next'); |
| 165 | + } catch (Exception) { |
| 166 | + break; |
| 167 | + } |
| 168 | +} |
| 169 | +``` |
| 170 | + |
| 171 | +### Writing data to a CSV file |
| 172 | + |
| 173 | +```php |
| 174 | +$file = fopen("quotes.csv", "a"); |
| 175 | +while (true) { |
| 176 | + $crawler = $client->waitFor('.quote'); |
| 177 | + $crawler->filter('.quote')->each(function ($node) use ($file) { |
| 178 | + $author = $node->filter('.author')->text(); |
| 179 | + $quote = $node->filter('.text')->text(); |
| 180 | + fputcsv($file, [$author, $quote]); |
| 181 | + }); |
| 182 | + try { |
| 183 | + $client->clickLink('Next'); |
| 184 | + } catch (Exception) { |
| 185 | + break; |
| 186 | + } |
| 187 | +} |
| 188 | +fclose($file); |
| 189 | +``` |
| 190 | + |
| 191 | + |
| 192 | + |
| 193 | +If you wish to find out more about web scraping with PHP, see our [blog post](https://oxy.yt/Jr3d). |
0 commit comments