diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 74ab4a5..4f41e86 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -93,6 +93,9 @@ jobs: artifact: dist/*.tar.gz - source: wheel artifact: dist/*.whl + - opts: "" + - python-version: graalpy-24 + opts: "--experimental-options --engine.CompileOnly='~tregex re'" steps: - name: Checkout working copy uses: actions/checkout@v4 @@ -127,6 +130,6 @@ jobs: name: ${{ matrix.source }} path: dist/ - name: install package in environment - run: pip install ${{ matrix.artifact || '.' }} + run: python -m pip install ${{ matrix.artifact || '.' }} - name: run tests - run: pytest -v -Werror -Wignore::ImportWarning --doctest-glob="*.rst" -ra + run: python ${{ matrix.opts }} -m pytest -v -Werror -Wignore::ImportWarning --doctest-glob="*.rst" -ra diff --git a/README.rst b/README.rst index 096a647..d3805ea 100644 --- a/README.rst +++ b/README.rst @@ -30,17 +30,20 @@ Just add ``ua-parser`` to your project's dependencies, or run to install in the current environment. -Installing `google-re2 `_ is -*strongly* recommended as it leads to *significantly* better -performances. This can be done directly via the ``re2`` optional -dependency: +Installing `ua-parser-rs `_ or +`google-re2 `_ is *strongly* +recommended as they yield *significantly* better performances. This +can be done directly via the ``regex`` and ``re2`` optional +dependencies respectively: .. code-block:: sh + $ pip install 'ua_parser[regex]' $ pip install 'ua_parser[re2]' -If ``re2`` is available, ``ua-parser`` will simply use it by default -instead of the pure-python resolver. +If either dependency is already available (e.g. because the software +makes use of re2 for other reasons) ``ua-parser`` will use the +corresponding resolver automatically. Quick Start ----------- diff --git a/doc/api.rst b/doc/api.rst index 18a7d48..6f984a4 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -75,6 +75,19 @@ from user agent strings. .. 
warning:: Only available if |re2|_ is installed. +.. class::ua_parser.regex.Resolver(Matchers) + + An advanced resolver based on |regex|_ and a bespoke implementation + of regex prefiltering, by the sibling project `uap-rust + _ is + installed. + Eager Matchers '''''''''''''' diff --git a/doc/guides.rst b/doc/guides.rst index b216d18..039bd24 100644 --- a/doc/guides.rst +++ b/doc/guides.rst @@ -129,6 +129,103 @@ from here on:: :class:`~ua_parser.caching.Local`, which is also caching-related, and serves to use thread-local caches rather than a shared cache. +Builtin Resolvers +================= + +.. list-table:: + :header-rows: 1 + :stub-columns: 1 + + * - + - speed + - portability + - memory use + - safety + * - ``regex`` + - great + - good + - bad + - great + * - ``re2`` + - good + - bad + - good + - good + * - ``basic`` + - terrible + - great + - great + - great + +``regex`` +--------- + +The ``regex`` resolver is a bespoke effort as part of the `uap-rust +`_ sibling project, built on +`rust-regex `_ and `a bespoke +regex-prefiltering implementation +`_, +it: + +- Is the fastest available resolver, usually edging out ``re2`` by a + significant margin (when that is even available). +- Is fully controlled by the project, and thus can be built for all + interpreters and platforms supported by pyo3 (currently: cpython, + pypy, and graalpy, on linux, macos and windows, intel and arm). It is + also built as a cpython abi3 wheel and should thus suffer from no + compatibility issues with new releases. +- Built entirely out of safe rust code, its safety risks are entirely + in ``regex`` and ``pyo3``. +- Its biggest drawback is that it is a lot more memory intensive than + the other resolvers, because ``regex`` tends to trade memory for + speed (~155MB high water mark on a real-world dataset). + +If available, it is the default resolver, without a cache. + +``re2`` +------- + +The ``re2`` resolver is built atop the widely used `google-re2 +`_ via its built-in Python bindings.
+It: + +- Is extremely fast, though around 80% slower than ``regex`` on + real-world data. +- Is only compatible with CPython, and uses pure API wheels, so needs + a different release for each cpython version, for each OS, for each + architecture. +- Is built entirely in C++, but by experienced Google developers. +- Is more memory intensive than the pure-python ``basic`` resolver, + but quite slim all things considered (~55MB high water mark on a + real-world dataset). + +If available, it is the second-preferred resolver, without a cache. + +``basic`` +--------- + +The ``basic`` resolver is a naive linear traversal of all rules, using +the standard library's ``re``. It: + +- Is *extremely* slow, about 10x slower than ``re2`` in cpython, and + pypy and graal's regex implementations do *not* like the workload + and are behind cpython by a factor of 3~4. +- Has perfect compatibility, with the caveat above, by virtue of being + built entirely out of standard library code. +- Is basically as safe as Python software can be by virtue of being + just Python, with the native code being the standard library's. +- Is the slimmest resolver at about 40MB. + +This is caveated by a hard requirement to use caches which makes it +workably faster on real-world datasets (if still nowhere near +*uncached* ``re2`` or ``regex``) but increases its memory requirement +significantly, e.g. using "sieve" and a cache size of 20000 on a +real-world dataset, it is about 4x slower than ``re2`` for about the +same memory requirements. + +It is the fallback and least preferred resolver, with a medium +(currently 2000 entries) cache by default.
+ Writing Custom Resolvers ======================== diff --git a/doc/installation.rst b/doc/installation.rst index d4bf7ba..ac6b311 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -35,3 +35,9 @@ if installed, but can also be installed via and alongside ua-parser: $ pip install 'ua-parser[yaml]' $ pip install 'ua-parser[regex,yaml]' +``yaml`` simply enables the ability to :func:`load yaml rulesets +`. + +The other two dependencies enable more efficient resolvers. By +default, ``ua-parser`` will select the fastest resolver it finds out +of the available set. For more, see :ref:`builtin resolvers`. diff --git a/pyproject.toml b/pyproject.toml index 1979ebb..65271a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,7 +86,7 @@ warn_redundant_casts = true # these can be overridden (maybe?) strict_equality = true -strict_concatenate = true +# strict_concatenate = true check_untyped_defs = true disallow_subclassing_any = true disallow_untyped_decorators = true @@ -110,6 +110,7 @@ module = [ "test_core", "test_caches", "test_parsers_basics", + "test_fa_simplifier", ] #check_untyped_defs = false diff --git a/src/ua_parser/__init__.py b/src/ua_parser/__init__.py index a9a09b4..19b6faa 100644 --- a/src/ua_parser/__init__.py +++ b/src/ua_parser/__init__.py @@ -57,10 +57,25 @@ UserAgent, ) from .loaders import load_builtins, load_lazy_builtins +from .utils import IS_GRAAL -Re2Resolver: Optional[Callable[[Matchers], Resolver]] = None +_ResolverCtor = Callable[[Matchers], Resolver] +Re2Resolver: Optional[_ResolverCtor] = None if importlib.util.find_spec("re2"): from .re2 import Resolver as Re2Resolver +RegexResolver: Optional[_ResolverCtor] = None +if importlib.util.find_spec("ua_parser_rs"): + from .regex import Resolver as RegexResolver +BestAvailableResolver: _ResolverCtor = next( + filter( + None, + ( + RegexResolver, + Re2Resolver, + lambda m: CachingResolver(BasicResolver(m), Cache(2000)), + ), + ) +) VERSION = (1, 0, 0) @@ -81,15 +96,7 @@ def 
def __getattr__(name: str) -> Parser:
    """Lazily create the module-level default ``parser`` on first access.

    This is the module ``__getattr__`` hook: Python only calls it when
    normal attribute lookup on the module fails.

    :param name: the attribute being looked up on the module.
    :returns: the default :class:`Parser` when ``name == "parser"``.
    :raises AttributeError: for any other attribute name.
    """
    global parser
    if name == "parser":
        # The lazy rule set is used when a native resolver is available
        # or when running on GraalPy; otherwise the eagerly compiled
        # builtins are loaded.
        # NOTE(review): presumably those resolvers only need the regex
        # source rather than precompiled patterns -- confirm.
        if RegexResolver or Re2Resolver or IS_GRAAL:
            matchers = load_lazy_builtins()
        else:
            matchers = load_builtins()
        # Bind the instance to the module global so it is built only
        # once: after this assignment normal lookup succeeds and this
        # hook is no longer invoked for ``parser``. Without it every
        # access would reload the rules and build a new resolver.
        parser = Parser.from_matchers(matchers)
        return parser
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
# Large bounded repetitions starting at 0 or 1 (``{0,100}``, ``{1, 255}``),
# i.e. an upper bound of at least three digits.
REPETITION_PATTERN = re.compile(r"\{(0|1)\s*,\s*\d{3,}\}")

# Tokenises a regex just enough to locate ``\d`` / ``\w`` reliably.
# Alternatives are tried in order at each position, so escape pairs are
# consumed *before* a ``[`` can be mistaken for the start of a class
# (``\[``) and before the second half of ``\\`` can be read as the
# start of ``\d``.
CLASS_PATTERN = re.compile(
    r"""
\\[^dw]                      # any escape other than \d or \w: skipped as-is
|
\[\^?\]?(?:\\.|[^]\\])*\]    # a whole character class, escapes included
|
\\[dw]                       # a bare \d or \w outside any class
""",
    re.VERBOSE | re.DOTALL,
)

# Explicit ASCII ranges substituted for the shorthand classes.
_CLASS_RANGES = {"d": "0-9", "w": "A-Za-z0-9_"}


def class_replacer(m: re.Match[str]) -> str:
    """Return the replacement text for one :data:`CLASS_PATTERN` match.

    Only ``m[0]`` is inspected:

    - a character class (``[...]``) has every ``\\d`` / ``\\w`` inside it
      replaced by the bare range (``0-9`` / ``A-Za-z0-9_``), all other
      escapes being left untouched;
    - a bare ``\\d`` / ``\\w`` becomes a bracketed class (``[0-9]`` /
      ``[A-Za-z0-9_]``);
    - any other escape is returned unchanged.
    """
    token = m[0]
    if token[0] == "[":
        # Walk the class escape by escape so ``\\d`` (an escaped
        # backslash followed by a literal "d") is not rewritten.
        return re.sub(
            r"\\(.)",
            lambda esc: _CLASS_RANGES.get(esc[1], esc[0]),
            token,
            flags=re.DOTALL,
        )
    expansion = _CLASS_RANGES.get(token[1])
    return token if expansion is None else f"[{expansion}]"


def fa_simplifier(pattern: str) -> str:
    """Rewrite *pattern* into a form cheaper for finite-automaton engines.

    uap-core makes significant use of large bounded repetitions, to
    mitigate catastrophic backtracking.

    However this explodes the number of states (and thus graph size)
    for finite automaton engines, which significantly increases their
    memory use, and for those which use JITs it can exceed the JIT
    threshold and force fallback to a slower engine (seems to be the
    case for graal's TRegex).

    Two rewrites are applied, in order:

    1. ``{0,N}`` / ``{1,N}`` with ``N`` of at least three digits become
       ``*`` / ``+``. The result is therefore *unbounded* and may match
       longer inputs than the original pattern.
    2. ``\\d`` and ``\\w`` are replaced by explicit ASCII ranges, both
       inside and outside character classes.
       NOTE(review): presumably because the ASCII ranges are much
       smaller than the engines' Unicode-aware classes -- confirm.

    Escaped characters (``\\[``, ``\\]``, ``\\\\`` ...) are recognised, so
    literal brackets and backslashes are never mistaken for class
    delimiters or shorthand classes.

    :param pattern: regex source to simplify.
    :returns: the rewritten regex source.
    """
    pattern = REPETITION_PATTERN.sub(lambda m: "*" if m[1] == "0" else "+", pattern)
    return CLASS_PATTERN.sub(class_replacer, pattern)