diff --git a/docs/guides/code_examples/http_crawlers/__init__.py b/docs/guides/code_examples/http_crawlers/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/docs/guides/code_examples/http_crawlers/lexbor_parser.py b/docs/guides/code_examples/http_crawlers/lexbor_parser.py
new file mode 100644
index 0000000000..ef279793ed
--- /dev/null
+++ b/docs/guides/code_examples/http_crawlers/lexbor_parser.py
@@ -0,0 +1,63 @@
+import asyncio
+
+from pydantic import ValidationError
+from selectolax.lexbor import LexborHTMLParser
+from yarl import URL
+
+from crawlee import Request
+from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
+
+
+async def main() -> None:
+ crawler = HttpCrawler(
+ max_request_retries=1,
+ max_requests_per_crawl=10,
+ )
+
+ @crawler.router.default_handler
+ async def request_handler(context: HttpCrawlingContext) -> None:
+ context.log.info(f'Processing {context.request.url} ...')
+
+        # Parse the HTML content using Selectolax with the Lexbor backend.
+ parsed_html = LexborHTMLParser(await context.http_response.read())
+
+ # Extract data from the page.
+ data = {
+ 'url': context.request.url,
+ 'title': parsed_html.css_first('title').text(),
+ 'h1s': [h1.text() for h1 in parsed_html.css('h1')],
+ 'h2s': [h2.text() for h2 in parsed_html.css('h2')],
+ 'h3s': [h3.text() for h3 in parsed_html.css('h3')],
+ }
+ await context.push_data(data)
+
+        # CSS selector to extract valid href attributes.
+ links_selector = (
+ 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
+ )
+ base_url = URL(context.request.url)
+ extracted_requests = []
+
+ # Extract links.
+ for item in parsed_html.css(links_selector):
+ href = item.attributes.get('href')
+ if not href:
+ continue
+
+ # Convert relative URLs to absolute if needed.
+ url = str(base_url.join(URL(href)))
+ try:
+ request = Request.from_url(url)
+ except ValidationError as exc:
+ context.log.warning(f'Skipping invalid URL "{url}": {exc}')
+ continue
+ extracted_requests.append(request)
+
+ # Add extracted requests to the queue with the same-domain strategy.
+ await context.add_requests(extracted_requests, strategy='same-domain')
+
+ await crawler.run(['https://crawlee.dev'])
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/docs/guides/code_examples/http_crawlers/lxml_parser.py b/docs/guides/code_examples/http_crawlers/lxml_parser.py
new file mode 100644
index 0000000000..b50fda4293
--- /dev/null
+++ b/docs/guides/code_examples/http_crawlers/lxml_parser.py
@@ -0,0 +1,61 @@
+import asyncio
+
+from lxml import html
+from pydantic import ValidationError
+
+from crawlee import Request
+from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
+
+
+async def main() -> None:
+ crawler = HttpCrawler(
+ max_request_retries=1,
+ max_requests_per_crawl=10,
+ )
+
+ @crawler.router.default_handler
+ async def request_handler(context: HttpCrawlingContext) -> None:
+ context.log.info(f'Processing {context.request.url} ...')
+
+ # Parse the HTML content using lxml.
+ parsed_html = html.fromstring(await context.http_response.read())
+
+ # Extract data from the page.
+ data = {
+ 'url': context.request.url,
+ 'title': parsed_html.findtext('.//title'),
+ 'h1s': [h1.text_content() for h1 in parsed_html.findall('.//h1')],
+ 'h2s': [h2.text_content() for h2 in parsed_html.findall('.//h2')],
+ 'h3s': [h3.text_content() for h3 in parsed_html.findall('.//h3')],
+ }
+ await context.push_data(data)
+
+ # Convert relative URLs to absolute before extracting links.
+ parsed_html.make_links_absolute(context.request.url, resolve_base_href=True)
+
+        # XPath 1.0 selector for extracting valid href attributes.
+ links_xpath = (
+ '//a/@href[not(starts-with(., "#")) '
+ 'and not(starts-with(., "javascript:")) '
+ 'and not(starts-with(., "mailto:"))]'
+ )
+
+ extracted_requests = []
+
+ # Extract links.
+ for url in parsed_html.xpath(links_xpath):
+ try:
+ request = Request.from_url(url)
+ except ValidationError as exc:
+ context.log.warning(f'Skipping invalid URL "{url}": {exc}')
+ continue
+ extracted_requests.append(request)
+
+ # Add extracted requests to the queue with the same-domain strategy.
+ await context.add_requests(extracted_requests, strategy='same-domain')
+
+ await crawler.run(['https://crawlee.dev'])
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/docs/guides/code_examples/http_crawlers/lxml_saxonche_parser.py b/docs/guides/code_examples/http_crawlers/lxml_saxonche_parser.py
new file mode 100644
index 0000000000..ac839a6164
--- /dev/null
+++ b/docs/guides/code_examples/http_crawlers/lxml_saxonche_parser.py
@@ -0,0 +1,77 @@
+import asyncio
+
+from lxml import html
+from pydantic import ValidationError
+from saxonche import PySaxonProcessor
+
+from crawlee import Request
+from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
+
+
+async def main() -> None:
+ crawler = HttpCrawler(
+ max_request_retries=1,
+ max_requests_per_crawl=10,
+ )
+
+ # Create Saxon processor once and reuse across requests.
+ saxon_proc = PySaxonProcessor(license=False)
+ xpath_proc = saxon_proc.new_xpath_processor()
+
+ @crawler.router.default_handler
+ async def request_handler(context: HttpCrawlingContext) -> None:
+ context.log.info(f'Processing {context.request.url} ...')
+
+ # Parse HTML with lxml.
+ parsed_html = html.fromstring(await context.http_response.read())
+ # Convert relative URLs to absolute before extracting links.
+ parsed_html.make_links_absolute(context.request.url, resolve_base_href=True)
+ # Convert parsed HTML to XML for Saxon processing.
+ xml = html.tostring(parsed_html, encoding='unicode', method='xml')
+ # Parse XML with Saxon.
+ parsed_xml = saxon_proc.parse_xml(xml_text=xml)
+ # Set the parsed context for XPath evaluation.
+ xpath_proc.set_context(xdm_item=parsed_xml)
+
+        # Extract data using the XPath 2.0 string() function.
+        title_item = xpath_proc.evaluate_single('.//title/string()')
+        data = {
+            'url': context.request.url,
+            'title': title_item.string_value if title_item else None,
+ 'h1s': [str(h) for h in (xpath_proc.evaluate('//h1/string()') or [])],
+ 'h2s': [str(h) for h in (xpath_proc.evaluate('//h2/string()') or [])],
+ 'h3s': [str(h) for h in (xpath_proc.evaluate('//h3/string()') or [])],
+ }
+ await context.push_data(data)
+
+ # XPath 2.0 with distinct-values() to get unique links and remove fragments.
+ links_xpath = """
+ distinct-values(
+ for $href in //a/@href[
+ not(starts-with(., "#"))
+ and not(starts-with(., "javascript:"))
+ and not(starts-with(., "mailto:"))
+ ]
+ return replace($href, "#.*$", "")
+ )
+ """
+
+ extracted_requests = []
+
+ # Extract links.
+ for item in xpath_proc.evaluate(links_xpath) or []:
+ url = item.string_value
+ try:
+ request = Request.from_url(url)
+ except ValidationError as exc:
+ context.log.warning(f'Skipping invalid URL "{url}": {exc}')
+ continue
+ extracted_requests.append(request)
+
+ # Add extracted requests to the queue with the same-domain strategy.
+ await context.add_requests(extracted_requests, strategy='same-domain')
+
+ await crawler.run(['https://crawlee.dev'])
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/docs/guides/code_examples/http_crawlers/pyquery_parser.py b/docs/guides/code_examples/http_crawlers/pyquery_parser.py
new file mode 100644
index 0000000000..1e15e9cb5b
--- /dev/null
+++ b/docs/guides/code_examples/http_crawlers/pyquery_parser.py
@@ -0,0 +1,64 @@
+import asyncio
+
+from pydantic import ValidationError
+from pyquery import PyQuery
+from yarl import URL
+
+from crawlee import Request
+from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
+
+
+async def main() -> None:
+ crawler = HttpCrawler(
+ max_request_retries=1,
+ max_requests_per_crawl=10,
+ )
+
+ @crawler.router.default_handler
+ async def request_handler(context: HttpCrawlingContext) -> None:
+ context.log.info(f'Processing {context.request.url} ...')
+
+ # Parse the HTML content using PyQuery.
+ parsed_html = PyQuery(await context.http_response.read())
+
+ # Extract data using jQuery-style selectors.
+ data = {
+ 'url': context.request.url,
+ 'title': parsed_html('title').text(),
+ 'h1s': [h1.text() for h1 in parsed_html('h1').items()],
+ 'h2s': [h2.text() for h2 in parsed_html('h2').items()],
+ 'h3s': [h3.text() for h3 in parsed_html('h3').items()],
+ }
+ await context.push_data(data)
+
+        # CSS selector to extract valid href attributes.
+ links_selector = (
+ 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
+ )
+ base_url = URL(context.request.url)
+
+ extracted_requests = []
+
+ # Extract links.
+ for item in parsed_html(links_selector).items():
+ href = item.attr('href')
+ if not href:
+ continue
+
+ # Convert relative URLs to absolute if needed.
+ url = str(base_url.join(URL(str(href))))
+ try:
+ request = Request.from_url(url)
+ except ValidationError as exc:
+ context.log.warning(f'Skipping invalid URL "{url}": {exc}')
+ continue
+ extracted_requests.append(request)
+
+ # Add extracted requests to the queue with the same-domain strategy.
+ await context.add_requests(extracted_requests, strategy='same-domain')
+
+ await crawler.run(['https://crawlee.dev'])
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/docs/guides/code_examples/http_crawlers/scrapling_parser.py b/docs/guides/code_examples/http_crawlers/scrapling_parser.py
new file mode 100644
index 0000000000..201b9b0cbf
--- /dev/null
+++ b/docs/guides/code_examples/http_crawlers/scrapling_parser.py
@@ -0,0 +1,74 @@
+import asyncio
+
+from pydantic import ValidationError
+from scrapling.parser import Selector
+from yarl import URL
+
+from crawlee import Request
+from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
+
+
+async def main() -> None:
+ crawler = HttpCrawler(
+ max_request_retries=1,
+ max_requests_per_crawl=10,
+ )
+
+ @crawler.router.default_handler
+ async def request_handler(context: HttpCrawlingContext) -> None:
+ context.log.info(f'Processing {context.request.url} ...')
+
+ # Parse the HTML content using Scrapling.
+ page = Selector(await context.http_response.read(), url=context.request.url)
+
+        # Extract data using XPath selectors. The .get_all_text() method returns
+        # the full text content of an element.
+ title_el = page.xpath_first('//title')
+ data = {
+ 'url': context.request.url,
+ 'title': title_el.text if isinstance(title_el, Selector) else title_el,
+ 'h1s': [
+ h1.get_all_text() if isinstance(h1, Selector) else h1
+ for h1 in page.xpath('//h1')
+ ],
+ 'h2s': [
+ h2.get_all_text() if isinstance(h2, Selector) else h2
+ for h2 in page.xpath('//h2')
+ ],
+ 'h3s': [
+ h3.get_all_text() if isinstance(h3, Selector) else h3
+ for h3 in page.xpath('//h3')
+ ],
+ }
+ await context.push_data(data)
+
+        # CSS selector to extract valid href attributes.
+ links_selector = (
+ 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])'
+ )
+ base_url = URL(context.request.url)
+ extracted_requests = []
+
+ # Extract links.
+ for item in page.css(links_selector):
+ href = item.attrib.get('href') if isinstance(item, Selector) else None
+ if not href:
+ continue
+
+ # Convert relative URLs to absolute if needed.
+ url = str(base_url.join(URL(href)))
+ try:
+ request = Request.from_url(url)
+ except ValidationError as exc:
+ context.log.warning(f'Skipping invalid URL "{url}": {exc}')
+ continue
+ extracted_requests.append(request)
+
+ # Add extracted requests to the queue with the same-domain strategy.
+ await context.add_requests(extracted_requests, strategy='same-domain')
+
+ await crawler.run(['https://crawlee.dev'])
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py b/docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py
new file mode 100644
index 0000000000..1419a51fed
--- /dev/null
+++ b/docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py
@@ -0,0 +1,38 @@
+import asyncio
+
+from crawlee.crawlers import (
+ AdaptivePlaywrightCrawler,
+ AdaptivePlaywrightCrawlerStatisticState,
+ AdaptivePlaywrightCrawlingContext,
+)
+from crawlee.statistics import Statistics
+
+from .selectolax_parser import SelectolaxLexborParser
+
+
+async def main() -> None:
+ crawler: AdaptivePlaywrightCrawler = AdaptivePlaywrightCrawler(
+ max_requests_per_crawl=10,
+ # Use custom Selectolax parser for static content parsing.
+ static_parser=SelectolaxLexborParser(),
+ # Set up statistics with AdaptivePlaywrightCrawlerStatisticState.
+ statistics=Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState),
+ )
+
+ @crawler.router.default_handler
+ async def handle_request(context: AdaptivePlaywrightCrawlingContext) -> None:
+ context.log.info(f'Processing {context.request.url} ...')
+        title_node = await context.query_selector_one('title')
+        data = {
+            'url': context.request.url,
+            'title': title_node.text() if title_node else None,
+        }
+
+ await context.push_data(data)
+
+ await context.enqueue_links()
+
+ await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/docs/guides/code_examples/http_crawlers/selectolax_context.py b/docs/guides/code_examples/http_crawlers/selectolax_context.py
new file mode 100644
index 0000000000..8fe36c039c
--- /dev/null
+++ b/docs/guides/code_examples/http_crawlers/selectolax_context.py
@@ -0,0 +1,33 @@
+from dataclasses import dataclass, fields
+
+from selectolax.lexbor import LexborHTMLParser
+from typing_extensions import Self
+
+from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext
+
+
+@dataclass(frozen=True)
+class SelectolaxLexborContext(ParsedHttpCrawlingContext[LexborHTMLParser]):
+ """Crawling context providing access to the parsed page.
+
+ This context is passed to request handlers and includes all standard
+ context methods (push_data, enqueue_links, etc.) plus custom helpers.
+ """
+
+ @property
+ def parser(self) -> LexborHTMLParser:
+ """Convenient alias for accessing the parsed document."""
+ return self.parsed_content
+
+ @classmethod
+ def from_parsed_http_crawling_context(
+ cls, context: ParsedHttpCrawlingContext[LexborHTMLParser]
+ ) -> Self:
+ """Create custom context from the base context.
+
+ Copies all fields from the base context to preserve framework
+        functionality while adding the custom interface.
+ """
+ return cls(
+ **{field.name: getattr(context, field.name) for field in fields(context)}
+ )
diff --git a/docs/guides/code_examples/http_crawlers/selectolax_crawler.py b/docs/guides/code_examples/http_crawlers/selectolax_crawler.py
new file mode 100644
index 0000000000..d5efc466e6
--- /dev/null
+++ b/docs/guides/code_examples/http_crawlers/selectolax_crawler.py
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from selectolax.lexbor import LexborHTMLParser, LexborNode
+
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
+
+from .selectolax_context import SelectolaxLexborContext
+from .selectolax_parser import SelectolaxLexborParser
+
+if TYPE_CHECKING:
+ from collections.abc import AsyncGenerator
+
+ from typing_extensions import Unpack
+
+ from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext
+
+
+class SelectolaxLexborCrawler(
+ AbstractHttpCrawler[SelectolaxLexborContext, LexborHTMLParser, LexborNode]
+):
+ """Custom crawler using Selectolax Lexbor for HTML parsing."""
+
+ def __init__(
+ self,
+ **kwargs: Unpack[HttpCrawlerOptions[SelectolaxLexborContext]],
+ ) -> None:
+ # Final step converts the base context to custom context type.
+ async def final_step(
+ context: ParsedHttpCrawlingContext[LexborHTMLParser],
+ ) -> AsyncGenerator[SelectolaxLexborContext, None]:
+ yield SelectolaxLexborContext.from_parsed_http_crawling_context(context)
+
+ # Build context pipeline: HTTP request -> parsing -> custom context.
+ kwargs['_context_pipeline'] = (
+ self._create_static_content_crawler_pipeline().compose(final_step)
+ )
+ super().__init__(
+ parser=SelectolaxLexborParser(),
+ **kwargs,
+ )
diff --git a/docs/guides/code_examples/http_crawlers/selectolax_crawler_run.py b/docs/guides/code_examples/http_crawlers/selectolax_crawler_run.py
new file mode 100644
index 0000000000..52c25ac4da
--- /dev/null
+++ b/docs/guides/code_examples/http_crawlers/selectolax_crawler_run.py
@@ -0,0 +1,27 @@
+import asyncio
+
+from .selectolax_crawler import SelectolaxLexborContext, SelectolaxLexborCrawler
+
+
+async def main() -> None:
+ crawler = SelectolaxLexborCrawler(
+ max_requests_per_crawl=10,
+ )
+
+ @crawler.router.default_handler
+ async def handle_request(context: SelectolaxLexborContext) -> None:
+ context.log.info(f'Processing {context.request.url} ...')
+
+ data = {
+ 'url': context.request.url,
+ 'title': context.parser.css_first('title').text(),
+ }
+
+ await context.push_data(data)
+ await context.enqueue_links()
+
+ await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/docs/guides/code_examples/http_crawlers/selectolax_parser.py b/docs/guides/code_examples/http_crawlers/selectolax_parser.py
new file mode 100644
index 0000000000..1627a3b220
--- /dev/null
+++ b/docs/guides/code_examples/http_crawlers/selectolax_parser.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+import asyncio
+from typing import TYPE_CHECKING
+
+from selectolax.lexbor import LexborHTMLParser, LexborNode
+from typing_extensions import override
+
+from crawlee.crawlers._abstract_http import AbstractHttpParser
+
+if TYPE_CHECKING:
+ from collections.abc import Iterable, Sequence
+
+ from crawlee.http_clients import HttpResponse
+
+
+class SelectolaxLexborParser(AbstractHttpParser[LexborHTMLParser, LexborNode]):
+ """Parser for parsing HTTP response using Selectolax Lexbor."""
+
+ @override
+ async def parse(self, response: HttpResponse) -> LexborHTMLParser:
+ """Parse HTTP response body into a document object."""
+ response_body = await response.read()
+ # Run parsing in a thread to avoid blocking the event loop.
+        return await asyncio.to_thread(LexborHTMLParser, response_body)
+
+ @override
+ async def parse_text(self, text: str) -> LexborHTMLParser:
+ """Parse raw HTML string into a document object."""
+ return LexborHTMLParser(text)
+
+ @override
+ async def select(
+ self, parsed_content: LexborHTMLParser, selector: str
+ ) -> Sequence[LexborNode]:
+ """Select elements matching a CSS selector."""
+        return tuple(parsed_content.css(selector))
+
+ @override
+ def is_matching_selector(
+ self, parsed_content: LexborHTMLParser, selector: str
+ ) -> bool:
+ """Check if any element matches the selector."""
+ return parsed_content.css_first(selector) is not None
+
+ @override
+ def find_links(
+ self, parsed_content: LexborHTMLParser, selector: str
+ ) -> Iterable[str]:
+ """Extract href attributes from elements matching the selector.
+
+ Used by `enqueue_links` helper to discover URLs.
+ """
+ link: LexborNode
+ urls: list[str] = []
+ for link in parsed_content.css(selector):
+ url = link.attributes.get('href')
+ if url:
+ urls.append(url.strip())
+ return urls
diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx
index 3cd29ed314..73c36b9c9c 100644
--- a/docs/guides/http_crawlers.mdx
+++ b/docs/guides/http_crawlers.mdx
@@ -8,11 +8,24 @@ import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
+import CodeBlock from '@theme/CodeBlock';
import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/beautifulsoup_example.py';
import ParselExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/parsel_example.py';
import HttpExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/http_example.py';
+import LxmlParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lxml_parser.py';
+import LxmlSaxoncheParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lxml_saxonche_parser.py';
+import LexborParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lexbor_parser.py';
+import PyqueryParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/pyquery_parser.py';
+import ScraplingParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/scrapling_parser.py';
+
+import SelectolaxParserSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_parser.py';
+import SelectolaxContextSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_context.py';
+import SelectolaxCrawlerSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_crawler.py';
+import SelectolaxCrawlerRunSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_crawler_run.py';
+import AdaptiveCrawlerRunSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_adaptive_run.py';
+
+HTTP crawlers are ideal for extracting data from server-rendered websites that don't require JavaScript execution. These crawlers make requests via HTTP clients to fetch HTML content and then parse it using various parsing libraries. For client-side rendered content where you need to execute JavaScript, consider using the [Playwright crawler](https://crawlee.dev/python/docs/guides/playwright-crawler) instead.
## Overview
@@ -84,7 +97,41 @@ The `HttpCrawler` provides direct acce
{HttpExample}
-## Creating custom HTTP crawler
+### Using custom parsers
+
+Since `HttpCrawler` provides raw HTTP responses, you can integrate any parsing library. Note that helpers like `enqueue_links` and `extract_links` are not available with this approach, so you need to extract links yourself and enqueue them, for example with `add_requests`.
+
+The following examples demonstrate how to integrate with several popular parsing libraries, including [lxml](https://lxml.de/) (high-performance parsing with XPath 1.0), [lxml with SaxonC-HE](https://pypi.org/project/saxonche/) (XPath 3.1 support), [selectolax](https://github.com/rushter/selectolax) (high-speed CSS selectors), [PyQuery](https://pyquery.readthedocs.io/) (jQuery-like syntax), and [scrapling](https://github.com/D4Vinci/Scrapling) (a Scrapy/Parsel-style API offering BeautifulSoup-like methods).
+
+<Tabs>
+    <TabItem value="lxml" label="lxml">
+        <RunnableCodeBlock className="language-python" language="python">
+            {LxmlParser}
+        </RunnableCodeBlock>
+    </TabItem>
+    <TabItem value="lxml-saxonche" label="lxml + SaxonC-HE">
+        <RunnableCodeBlock className="language-python" language="python">
+            {LxmlSaxoncheParser}
+        </RunnableCodeBlock>
+    </TabItem>
+    <TabItem value="selectolax" label="Selectolax (Lexbor)">
+        <RunnableCodeBlock className="language-python" language="python">
+            {LexborParser}
+        </RunnableCodeBlock>
+    </TabItem>
+    <TabItem value="pyquery" label="PyQuery">
+        <RunnableCodeBlock className="language-python" language="python">
+            {PyqueryParser}
+        </RunnableCodeBlock>
+    </TabItem>
+    <TabItem value="scrapling" label="Scrapling">
+        <RunnableCodeBlock className="language-python" language="python">
+            {ScraplingParser}
+        </RunnableCodeBlock>
+    </TabItem>
+</Tabs>
+
+## Custom HTTP crawler
While the built-in crawlers cover most use cases, you might need a custom HTTP crawler for specialized parsing requirements. To create a custom HTTP crawler, inherit directly from `AbstractHttpCrawler`. This approach requires implementing:
@@ -94,8 +141,53 @@ While the built-in crawlers cover most use cases, you might need a custom HTTP c
This approach is recommended when you need tight integration between parsing and the crawling context, or when you're building a reusable crawler for a specific technology or format.
+The following example demonstrates how to create a custom crawler using `selectolax` with the `Lexbor` engine.
+
+### Parser implementation
+
+The parser converts HTTP responses into a parsed document and provides methods for element selection. Implement `AbstractHttpParser` for `selectolax`, providing the required parsing and querying methods:
+
+
+<CodeBlock className="language-python">
+    {SelectolaxParserSource}
+</CodeBlock>
+
+This is enough to use your parser with the `AbstractHttpCrawler.create_parsed_http_crawler_class` factory method. For more control, continue with the custom context and crawler classes below.
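+
+As a quick illustration, a minimal sketch of this shortcut might look like the following. The snippet is illustrative only: it assumes the factory accepts the parser instance as its `static_parser` argument and returns a ready-to-use crawler class, so check the API reference for the exact signature.
+
+```python
+import asyncio
+
+from selectolax.lexbor import LexborHTMLParser
+
+from crawlee.crawlers import AbstractHttpCrawler
+from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext
+
+from .selectolax_parser import SelectolaxLexborParser
+
+# Assumption: the factory takes the parser instance as `static_parser`.
+SelectolaxCrawler = AbstractHttpCrawler.create_parsed_http_crawler_class(
+    static_parser=SelectolaxLexborParser(),
+)
+
+
+async def main() -> None:
+    crawler = SelectolaxCrawler(max_requests_per_crawl=10)
+
+    @crawler.router.default_handler
+    async def handler(context: ParsedHttpCrawlingContext[LexborHTMLParser]) -> None:
+        # The generated context exposes the parsed document as `parsed_content`.
+        title = context.parsed_content.css_first('title')
+        await context.push_data(
+            {
+                'url': context.request.url,
+                'title': title.text() if title else None,
+            }
+        )
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
+```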
+
+### Crawling context definition
+
+The crawling context is passed to request handlers and provides access to the parsed content. Extend `ParsedHttpCrawlingContext` to define the interface your handlers will work with. This is also the place to implement additional helpers for your handlers.
+
+<CodeBlock className="language-python">
+    {SelectolaxContextSource}
+</CodeBlock>
+
+### Crawler composition
+
+The crawler class connects the parser and context. Extend `AbstractHttpCrawler` and configure the context pipeline to use your custom components:
+
+<CodeBlock className="language-python">
+    {SelectolaxCrawlerSource}
+</CodeBlock>
+
+### Crawler usage
+
+The custom crawler works like any built-in crawler. Request handlers receive your custom context with full access to framework helpers like `enqueue_links`. Additionally, the custom parser can be used with `AdaptivePlaywrightCrawler` for adaptive crawling:
+
+<Tabs>
+    <TabItem value="selectolax-crawler" label="SelectolaxLexborCrawler">
+        <CodeBlock className="language-python">
+            {SelectolaxCrawlerRunSource}
+        </CodeBlock>
+    </TabItem>
+    <TabItem value="adaptive-crawler" label="AdaptivePlaywrightCrawler">
+        <CodeBlock className="language-python">
+            {AdaptiveCrawlerRunSource}
+        </CodeBlock>
+    </TabItem>
+</Tabs>
+
## Conclusion
-This guide provided a comprehensive overview of HTTP crawlers in Crawlee. You learned about the three main crawler types - `BeautifulSoupCrawler` for fault-tolerant HTML parsing, `ParselCrawler` for high-performance extraction with XPath and CSS selectors, and `HttpCrawler` for raw response processing. You also discovered how to create custom crawlers for specific use cases.
+This guide provided a comprehensive overview of HTTP crawlers in Crawlee. You learned about the three main crawler types - `BeautifulSoupCrawler` for fault-tolerant HTML parsing, `ParselCrawler` for high-performance extraction with XPath and CSS selectors, and `HttpCrawler` for raw response processing. You also discovered how to integrate third-party parsing libraries with `HttpCrawler` and how to create fully custom crawlers using `AbstractHttpCrawler` for specialized parsing requirements.
If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
diff --git a/pyproject.toml b/pyproject.toml
index b7a9d72679..1593212972 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -256,6 +256,9 @@ module = [
"apify_fingerprint_datapoints", # Untyped and stubs not available
"camoufox", # Example code shows integration of camoufox and crawlee.
"fastapi", # Example code shows running in webserver.
+ "saxonche", # Example code shows HttpCrawler with custom parser.
+ "scrapling.*", # Example code shows HttpCrawler with custom parser.
+ "selectolax.*", # Example code shows HttpCrawler with custom parser.
"stagehand.*", # Example code shows integration of Stagehand and crawlee.
"starlette.*", # Example code shows running in webserver.
"flask", # Example code shows deploy on Google Cloud.
@@ -263,9 +266,11 @@ module = [
"jaro", # Untyped and stubs not available
"litestar", # Example code shows deploy on Google Cloud Run.
"loguru", # Example code shows integration of loguru and crawlee for JSON logging.
+ "lxml.*", # Example code shows HttpCrawler with custom parser.
"sklearn.linear_model", # Untyped and stubs not available
"cookiecutter.*", # Untyped and stubs not available
"inquirer.*", # Untyped and stubs not available
+ "pyquery", # Example code shows HttpCrawler with custom parser.
"warcio.*", # Example code shows WARC files creation.
"wrapt" # Untyped and stubs not available
]