diff --git a/docs/guides/code_examples/http_crawlers/__init__.py b/docs/guides/code_examples/http_crawlers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/guides/code_examples/http_crawlers/lexbor_parser.py b/docs/guides/code_examples/http_crawlers/lexbor_parser.py new file mode 100644 index 0000000000..ef279793ed --- /dev/null +++ b/docs/guides/code_examples/http_crawlers/lexbor_parser.py @@ -0,0 +1,63 @@ +import asyncio + +from pydantic import ValidationError +from selectolax.lexbor import LexborHTMLParser +from yarl import URL + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse the HTML content using Selectolax with Lexbor backend. + parsed_html = LexborHTMLParser(await context.http_response.read()) + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': parsed_html.css_first('title').text(), + 'h1s': [h1.text() for h1 in parsed_html.css('h1')], + 'h2s': [h2.text() for h2 in parsed_html.css('h2')], + 'h3s': [h3.text() for h3 in parsed_html.css('h3')], + } + await context.push_data(data) + + # Css selector to extract valid href attributes. + links_selector = ( + 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])' + ) + base_url = URL(context.request.url) + extracted_requests = [] + + # Extract links. + for item in parsed_html.css(links_selector): + href = item.attributes.get('href') + if not href: + continue + + # Convert relative URLs to absolute if needed. + url = str(base_url.join(URL(href))) + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. + await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/http_crawlers/lxml_parser.py b/docs/guides/code_examples/http_crawlers/lxml_parser.py new file mode 100644 index 0000000000..b50fda4293 --- /dev/null +++ b/docs/guides/code_examples/http_crawlers/lxml_parser.py @@ -0,0 +1,61 @@ +import asyncio + +from lxml import html +from pydantic import ValidationError + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse the HTML content using lxml. + parsed_html = html.fromstring(await context.http_response.read()) + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': parsed_html.findtext('.//title'), + 'h1s': [h1.text_content() for h1 in parsed_html.findall('.//h1')], + 'h2s': [h2.text_content() for h2 in parsed_html.findall('.//h2')], + 'h3s': [h3.text_content() for h3 in parsed_html.findall('.//h3')], + } + await context.push_data(data) + + # Convert relative URLs to absolute before extracting links. 
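+ # make_links_absolute() rewrites the link attributes in place, and
+ # resolve_base_href=True also takes any <base href> element into account.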
+ parsed_html.make_links_absolute(context.request.url, resolve_base_href=True) + + # Xpath 1.0 selector for extracting valid href attributes. + links_xpath = ( + '//a/@href[not(starts-with(., "#")) ' + 'and not(starts-with(., "javascript:")) ' + 'and not(starts-with(., "mailto:"))]' + ) + + extracted_requests = [] + + # Extract links. + for url in parsed_html.xpath(links_xpath): + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. + await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/http_crawlers/lxml_saxonche_parser.py b/docs/guides/code_examples/http_crawlers/lxml_saxonche_parser.py new file mode 100644 index 0000000000..ac839a6164 --- /dev/null +++ b/docs/guides/code_examples/http_crawlers/lxml_saxonche_parser.py @@ -0,0 +1,77 @@ +import asyncio + +from lxml import html +from pydantic import ValidationError +from saxonche import PySaxonProcessor + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + # Create Saxon processor once and reuse across requests. + saxon_proc = PySaxonProcessor(license=False) + xpath_proc = saxon_proc.new_xpath_processor() + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse HTML with lxml. + parsed_html = html.fromstring(await context.http_response.read()) + # Convert relative URLs to absolute before extracting links. + parsed_html.make_links_absolute(context.request.url, resolve_base_href=True) + # Convert parsed HTML to XML for Saxon processing. + xml = html.tostring(parsed_html, encoding='unicode', method='xml') + # Parse XML with Saxon. + parsed_xml = saxon_proc.parse_xml(xml_text=xml) + # Set the parsed context for XPath evaluation. + xpath_proc.set_context(xdm_item=parsed_xml) + + # Extract data using XPath 2.0 string() function. + data = { + 'url': context.request.url, + 'title': xpath_proc.evaluate_single('.//title/string()'), + 'h1s': [str(h) for h in (xpath_proc.evaluate('//h1/string()') or [])], + 'h2s': [str(h) for h in (xpath_proc.evaluate('//h2/string()') or [])], + 'h3s': [str(h) for h in (xpath_proc.evaluate('//h3/string()') or [])], + } + await context.push_data(data) + + # XPath 2.0 with distinct-values() to get unique links and remove fragments. + links_xpath = """ + distinct-values( + for $href in //a/@href[ + not(starts-with(., "#")) + and not(starts-with(., "javascript:")) + and not(starts-with(., "mailto:")) + ] + return replace($href, "#.*$", "") + ) + """ + + extracted_requests = [] + + # Extract links. + for item in xpath_proc.evaluate(links_xpath) or []: + url = item.string_value + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. 
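+ # The 'same-domain' strategy filters out extracted requests that would leave
+ # the domain of the page they were found on.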
+ await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/http_crawlers/pyquery_parser.py b/docs/guides/code_examples/http_crawlers/pyquery_parser.py new file mode 100644 index 0000000000..1e15e9cb5b --- /dev/null +++ b/docs/guides/code_examples/http_crawlers/pyquery_parser.py @@ -0,0 +1,64 @@ +import asyncio + +from pydantic import ValidationError +from pyquery import PyQuery +from yarl import URL + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse the HTML content using PyQuery. + parsed_html = PyQuery(await context.http_response.read()) + + # Extract data using jQuery-style selectors. + data = { + 'url': context.request.url, + 'title': parsed_html('title').text(), + 'h1s': [h1.text() for h1 in parsed_html('h1').items()], + 'h2s': [h2.text() for h2 in parsed_html('h2').items()], + 'h3s': [h3.text() for h3 in parsed_html('h3').items()], + } + await context.push_data(data) + + # Css selector to extract valid href attributes. + links_selector = ( + 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])' + ) + base_url = URL(context.request.url) + + extracted_requests = [] + + # Extract links. + for item in parsed_html(links_selector).items(): + href = item.attr('href') + if not href: + continue + + # Convert relative URLs to absolute if needed. + url = str(base_url.join(URL(str(href)))) + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. + await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/http_crawlers/scrapling_parser.py b/docs/guides/code_examples/http_crawlers/scrapling_parser.py new file mode 100644 index 0000000000..201b9b0cbf --- /dev/null +++ b/docs/guides/code_examples/http_crawlers/scrapling_parser.py @@ -0,0 +1,74 @@ +import asyncio + +from pydantic import ValidationError +from scrapling.parser import Selector +from yarl import URL + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse the HTML content using Scrapling. + page = Selector(await context.http_response.read(), url=context.request.url) + + # Extract data using Xpath selectors with .get_all_text method for full text + # content. 
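+ # Scrapling's xpath()/xpath_first() can return plain values as well as
+ # Selector nodes, hence the isinstance checks below.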
+ title_el = page.xpath_first('//title') + data = { + 'url': context.request.url, + 'title': title_el.text if isinstance(title_el, Selector) else title_el, + 'h1s': [ + h1.get_all_text() if isinstance(h1, Selector) else h1 + for h1 in page.xpath('//h1') + ], + 'h2s': [ + h2.get_all_text() if isinstance(h2, Selector) else h2 + for h2 in page.xpath('//h2') + ], + 'h3s': [ + h3.get_all_text() if isinstance(h3, Selector) else h3 + for h3 in page.xpath('//h3') + ], + } + await context.push_data(data) + + # Css selector to extract valid href attributes. + links_selector = ( + 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])' + ) + base_url = URL(context.request.url) + extracted_requests = [] + + # Extract links. + for item in page.css(links_selector): + href = item.attrib.get('href') if isinstance(item, Selector) else None + if not href: + continue + + # Convert relative URLs to absolute if needed. + url = str(base_url.join(URL(href))) + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. + await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py b/docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py new file mode 100644 index 0000000000..1419a51fed --- /dev/null +++ b/docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py @@ -0,0 +1,38 @@ +import asyncio + +from crawlee.crawlers import ( + AdaptivePlaywrightCrawler, + AdaptivePlaywrightCrawlerStatisticState, + AdaptivePlaywrightCrawlingContext, +) +from crawlee.statistics import Statistics + +from .selectolax_parser import SelectolaxLexborParser + + +async def main() -> None: + crawler: AdaptivePlaywrightCrawler = AdaptivePlaywrightCrawler( + max_requests_per_crawl=10, + # Use custom Selectolax parser for static content parsing. + static_parser=SelectolaxLexborParser(), + # Set up statistics with AdaptivePlaywrightCrawlerStatisticState. + statistics=Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState), + ) + + @crawler.router.default_handler + async def handle_request(context: AdaptivePlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + data = { + 'url': context.request.url, + 'title': await context.query_selector_one('title'), + } + + await context.push_data(data) + + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/http_crawlers/selectolax_context.py b/docs/guides/code_examples/http_crawlers/selectolax_context.py new file mode 100644 index 0000000000..8fe36c039c --- /dev/null +++ b/docs/guides/code_examples/http_crawlers/selectolax_context.py @@ -0,0 +1,33 @@ +from dataclasses import dataclass, fields + +from selectolax.lexbor import LexborHTMLParser +from typing_extensions import Self + +from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext + + +@dataclass(frozen=True) +class SelectolaxLexborContext(ParsedHttpCrawlingContext[LexborHTMLParser]): + """Crawling context providing access to the parsed page. 
+ + This context is passed to request handlers and includes all standard + context methods (push_data, enqueue_links, etc.) plus custom helpers. + """ + + @property + def parser(self) -> LexborHTMLParser: + """Convenient alias for accessing the parsed document.""" + return self.parsed_content + + @classmethod + def from_parsed_http_crawling_context( + cls, context: ParsedHttpCrawlingContext[LexborHTMLParser] + ) -> Self: + """Create custom context from the base context. + + Copies all fields from the base context to preserve framework + functionality while adding custom interface. + """ + return cls( + **{field.name: getattr(context, field.name) for field in fields(context)} + ) diff --git a/docs/guides/code_examples/http_crawlers/selectolax_crawler.py b/docs/guides/code_examples/http_crawlers/selectolax_crawler.py new file mode 100644 index 0000000000..d5efc466e6 --- /dev/null +++ b/docs/guides/code_examples/http_crawlers/selectolax_crawler.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from selectolax.lexbor import LexborHTMLParser, LexborNode + +from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions + +from .selectolax_context import SelectolaxLexborContext +from .selectolax_parser import SelectolaxLexborParser + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + from typing_extensions import Unpack + + from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext + + +class SelectolaxLexborCrawler( + AbstractHttpCrawler[SelectolaxLexborContext, LexborHTMLParser, LexborNode] +): + """Custom crawler using Selectolax Lexbor for HTML parsing.""" + + def __init__( + self, + **kwargs: Unpack[HttpCrawlerOptions[SelectolaxLexborContext]], + ) -> None: + # Final step converts the base context to custom context type. + async def final_step( + context: ParsedHttpCrawlingContext[LexborHTMLParser], + ) -> AsyncGenerator[SelectolaxLexborContext, None]: + yield SelectolaxLexborContext.from_parsed_http_crawling_context(context) + + # Build context pipeline: HTTP request -> parsing -> custom context. 
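+ # _create_static_content_crawler_pipeline() provides the standard fetch and
+ # parse steps; compose(final_step) appends the conversion to the custom
+ # SelectolaxLexborContext.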
+ kwargs['_context_pipeline'] = ( + self._create_static_content_crawler_pipeline().compose(final_step) + ) + super().__init__( + parser=SelectolaxLexborParser(), + **kwargs, + ) diff --git a/docs/guides/code_examples/http_crawlers/selectolax_crawler_run.py b/docs/guides/code_examples/http_crawlers/selectolax_crawler_run.py new file mode 100644 index 0000000000..52c25ac4da --- /dev/null +++ b/docs/guides/code_examples/http_crawlers/selectolax_crawler_run.py @@ -0,0 +1,27 @@ +import asyncio + +from .selectolax_crawler import SelectolaxLexborContext, SelectolaxLexborCrawler + + +async def main() -> None: + crawler = SelectolaxLexborCrawler( + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def handle_request(context: SelectolaxLexborContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + data = { + 'url': context.request.url, + 'title': context.parser.css_first('title').text(), + } + + await context.push_data(data) + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/http_crawlers/selectolax_parser.py b/docs/guides/code_examples/http_crawlers/selectolax_parser.py new file mode 100644 index 0000000000..1627a3b220 --- /dev/null +++ b/docs/guides/code_examples/http_crawlers/selectolax_parser.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import asyncio +from typing import TYPE_CHECKING + +from selectolax.lexbor import LexborHTMLParser, LexborNode +from typing_extensions import override + +from crawlee.crawlers._abstract_http import AbstractHttpParser + +if TYPE_CHECKING: + from collections.abc import Iterable, Sequence + + from crawlee.http_clients import HttpResponse + + +class SelectolaxLexborParser(AbstractHttpParser[LexborHTMLParser, LexborNode]): + """Parser for parsing HTTP response using Selectolax Lexbor.""" + + @override + async def parse(self, response: HttpResponse) -> LexborHTMLParser: + """Parse HTTP response body into a document object.""" + response_body = await response.read() + # Run parsing in a thread to avoid blocking the event loop. + return await asyncio.to_thread(lambda: LexborHTMLParser(response_body)) + + @override + async def parse_text(self, text: str) -> LexborHTMLParser: + """Parse raw HTML string into a document object.""" + return LexborHTMLParser(text) + + @override + async def select( + self, parsed_content: LexborHTMLParser, selector: str + ) -> Sequence[LexborNode]: + """Select elements matching a CSS selector.""" + return tuple(item for item in parsed_content.css(selector)) + + @override + def is_matching_selector( + self, parsed_content: LexborHTMLParser, selector: str + ) -> bool: + """Check if any element matches the selector.""" + return parsed_content.css_first(selector) is not None + + @override + def find_links( + self, parsed_content: LexborHTMLParser, selector: str + ) -> Iterable[str]: + """Extract href attributes from elements matching the selector. + + Used by `enqueue_links` helper to discover URLs. 
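+
+ Only the raw href strings are returned here; the crawler turns them into
+ absolute URLs and `Request` objects when enqueuing.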
+ """ + link: LexborNode + urls: list[str] = [] + for link in parsed_content.css(selector): + url = link.attributes.get('href') + if url: + urls.append(url.strip()) + return urls diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx index 3cd29ed314..73c36b9c9c 100644 --- a/docs/guides/http_crawlers.mdx +++ b/docs/guides/http_crawlers.mdx @@ -8,11 +8,24 @@ import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; +import CodeBlock from '@theme/CodeBlock'; import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/beautifulsoup_example.py'; import ParselExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/parsel_example.py'; import HttpExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/http_example.py'; +import LxmlParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lxml_parser.py'; +import LxmlSaxoncheParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lxml_saxonche_parser.py'; +import LexborParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lexbor_parser.py'; +import PyqueryParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/pyquery_parser.py'; +import ScraplingParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/scrapling_parser.py'; + +import SelectolaxParserSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_parser.py'; +import SelectolaxContextSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_context.py'; +import SelectolaxCrawlerSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_crawler.py'; +import SelectolaxCrawlerRunSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_crawler_run.py'; +import AdaptiveCrawlerRunSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_adaptive_run.py'; + HTTP crawlers are ideal for extracting data from server-rendered websites that don't require JavaScript execution. These crawlers make requests via HTTP clients to fetch HTML content and then parse it using various parsing libraries. For client-side rendered content, where you need to execute JavaScript consider using [Playwright crawler](https://crawlee.dev/python/docs/guides/playwright-crawler) instead. ## Overview @@ -84,7 +97,41 @@ The `HttpCrawler` provides direct acce {HttpExample} -## Creating custom HTTP crawler +### Using custom parsers + +Since `HttpCrawler` provides raw HTTP responses, you can integrate any parsing library. Note that helpers like `enqueue_links` and `extract_links` are not available with this approach. + +The following examples demonstrate how to integrate with several popular parsing libraries, including [lxml](https://lxml.de/) (high-performance parsing with XPath 1.0), [lxml with SaxonC-HE](https://pypi.org/project/saxonche/) (XPath 3.1 support), [selectolax](https://github.com/rushter/selectolax) (high-speed CSS selectors), [PyQuery](https://pyquery.readthedocs.io/) (jQuery-like syntax), and [scrapling](https://github.com/D4Vinci/Scrapling) (a Scrapy/Parsel-style API offering BeautifulSoup-like methods). + + + + + {LxmlParser} + + + + + {LxmlSaxoncheParser} + + + + + {LexborParser} + + + + + {PyqueryParser} + + + + + {ScraplingParser} + + + + +## Custom HTTP crawler While the built-in crawlers cover most use cases, you might need a custom HTTP crawler for specialized parsing requirements. 
To create a custom HTTP crawler, inherit directly from `AbstractHttpCrawler`. This approach requires implementing: @@ -94,8 +141,53 @@ While the built-in crawlers cover most use cases, you might need a custom HTTP c This approach is recommended when you need tight integration between parsing and the crawling context, or when you're building a reusable crawler for a specific technology or format. +The following example demonstrates how to create a custom crawler using `selectolax` with the `Lexbor` engine. + +### Parser implementation + +The parser converts HTTP responses into a parsed document and provides methods for element selection. Implement `AbstractHttpParser` using `selectolax` with required methods for parsing and querying: + + + {SelectolaxParserSource} + + +This is enough to use your parser with `AbstractHttpCrawler.create_parsed_http_crawler_class` factory method. For more control, continue with custom context and crawler classes below. + +### Crawling context definition + +The crawling context is passed to request handlers and provides access to the parsed content. Extend `ParsedHttpCrawlingContext` to define the interface your handlers will work with. Here you can implement additional helpers for the crawler context. + + + {SelectolaxContextSource} + + +### Crawler composition + +The crawler class connects the parser and context. Extend `AbstractHttpCrawler` and configure the context pipeline to use your custom components: + + + {SelectolaxCrawlerSource} + + +### Crawler usage + +The custom crawler works like any built-in crawler. Request handlers receive your custom context with full access to framework helpers like `enqueue_links`. Additionally, the custom parser can be used with `AdaptivePlaywrightCrawler` for adaptive crawling: + + + + + {SelectolaxCrawlerRunSource} + + + + + {AdaptiveCrawlerRunSource} + + + + ## Conclusion -This guide provided a comprehensive overview of HTTP crawlers in Crawlee. You learned about the three main crawler types - `BeautifulSoupCrawler` for fault-tolerant HTML parsing, `ParselCrawler` for high-performance extraction with XPath and CSS selectors, and `HttpCrawler` for raw response processing. You also discovered how to create custom crawlers for specific use cases. +This guide provided a comprehensive overview of HTTP crawlers in Crawlee. You learned about the three main crawler types - `BeautifulSoupCrawler` for fault-tolerant HTML parsing, `ParselCrawler` for high-performance extraction with XPath and CSS selectors, and `HttpCrawler` for raw response processing. You also discovered how to integrate third-party parsing libraries with `HttpCrawler` and how to create fully custom crawlers using `AbstractHttpCrawler` for specialized parsing requirements. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/pyproject.toml b/pyproject.toml index b7a9d72679..1593212972 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -256,6 +256,9 @@ module = [ "apify_fingerprint_datapoints", # Untyped and stubs not available "camoufox", # Example code shows integration of camoufox and crawlee. "fastapi", # Example code shows running in webserver. + "saxonche", # Example code shows HttpCrawler with custom parser. + "scrapling.*", # Example code shows HttpCrawler with custom parser. + "selectolax.*", # Example code shows HttpCrawler with custom parser. 
"stagehand.*", # Example code shows integration of Stagehand and crawlee. "starlette.*", # Example code shows running in webserver. "flask", # Example code shows deploy on Google Cloud. @@ -263,9 +266,11 @@ module = [ "jaro", # Untyped and stubs not available "litestar", # Example code shows deploy on Google Cloud Run. "loguru", # Example code shows integration of loguru and crawlee for JSON logging. + "lxml.*", # Example code shows HttpCrawler with custom parser. "sklearn.linear_model", # Untyped and stubs not available "cookiecutter.*", # Untyped and stubs not available "inquirer.*", # Untyped and stubs not available + "pyquery", # Example code shows HttpCrawler with custom parser. "warcio.*", # Example code shows WARC files creation. "wrapt" # Untyped and stubs not available ]