Source code for web_poet.framework._api

from __future__ import annotations

from typing import Annotated, Any, TypeAlias, get_args, get_origin

import andi
from andi.typeutils import strip_annotated
from playwright.async_api import async_playwright

from web_poet import default_registry
from web_poet.annotated import annotation_encode
from web_poet.page_inputs import (
    BrowserHtml,
    BrowserResponse,
    HttpRequest,
)
from web_poet.page_inputs.stats import DictStatCollector, StatCollector
from web_poet.page_inputs.url import RequestUrl, ResponseUrl
from web_poet.pages import ItemPage, is_injectable
from web_poet.rules import RulesRegistry
from web_poet.utils import ensure_awaitable

from ._providers import DEFAULT_PLAYWRIGHT_ENGINE, PROVIDERS, ResponseFetcher

# Prefix marking an ``Annotated`` metadata string as a Playwright engine
# selector: playwright_engine() prepends it to the engine name, and
# Framework.get_page() looks for it when inspecting browser dependencies.
ANNOTATION_PREFIX = "playwright_engine."


def playwright_engine(name: str) -> str:
    """Build the hashable ``Annotated`` metadata value for a Playwright engine.

    The engine *name* is prefixed with :data:`ANNOTATION_PREFIX` and the
    resulting string is passed through
    :func:`~web_poet.annotated.annotation_encode`.

    Example usage:

    .. code-block:: python

        Annotated[BrowserResponse, playwright_engine("firefox")]
    """
    tagged_name = f"{ANNOTATION_PREFIX}{name}"
    return annotation_encode(tagged_name)
# Anything accepted where a request is expected: a ready-made HttpRequest,
# a URL object, or a plain URL string.
RequestLike: TypeAlias = HttpRequest | RequestUrl | ResponseUrl | str


def _normalize_request(request: RequestLike) -> HttpRequest:
    """Return *request* as an :class:`HttpRequest`.

    An existing :class:`HttpRequest` is returned unchanged; any other value
    is used as the ``url`` of a new :class:`HttpRequest`.
    """
    if isinstance(request, HttpRequest):
        return request
    return HttpRequest(url=request)
class Framework:
    """Manager of the :ref:`built-in framework <framework>`.

    *registry* is the :class:`~web_poet.rules.RulesRegistry` from where page
    objects resolve their dependencies. If ``None``,
    :data:`~web_poet.default_registry` is used.

    *default_playwright_engine* is the Playwright browser engine to use when
    browser inputs do not specify one. Examples: ``"chromium"``,
    ``"firefox"``, ``"webkit"``.

    *stats* is a :class:`~web_poet.page_inputs.stats.StatCollector` instance
    to collect stats written by the page object through the
    :class:`~web_poet.page_inputs.stats.Stats` dependency. If not specified,
    a :class:`~web_poet.page_inputs.stats.DictStatCollector` is used. You can
    access the collector through the :attr:`stats` attribute, e.g. to read
    its data.
    """

    def __init__(
        self,
        *,
        registry: RulesRegistry | None = None,
        default_playwright_engine: str | None = None,
        stats: StatCollector | None = None,
    ) -> None:
        # Compare against None explicitly: an object supplied by the caller
        # must be kept even if it evaluates as falsy (e.g. a collector that
        # defines __len__ and is currently empty).
        self._registry = default_registry if registry is None else registry
        self._default_playwright_engine = default_playwright_engine
        self.stats: StatCollector = DictStatCollector() if stats is None else stats

    @staticmethod
    def _annotated_engine(dep: Any) -> str | None:
        """Return the engine explicitly requested by an annotated browser dep.

        Returns ``None`` unless *dep* is an ``Annotated`` form of
        ``BrowserResponse`` or ``BrowserHtml`` whose first metadata item is a
        string starting with ``ANNOTATION_PREFIX``.
        """
        if get_origin(dep) is not Annotated:
            return None
        if strip_annotated(dep) not in {BrowserResponse, BrowserHtml}:
            return None
        metadata = get_args(dep)[1:]
        marker = metadata[0] if metadata else None
        if isinstance(marker, str) and marker.startswith(ANNOTATION_PREFIX):
            # Strip the exact prefix rather than splitting on the first dot,
            # so the result stays correct whatever the prefix contains.
            return marker.removeprefix(ANNOTATION_PREFIX)
        return None

    async def get_page(
        self,
        request: RequestLike,
        page_cls: type[ItemPage],
        *,
        page_params: dict[Any, Any] | None = None,
    ) -> ItemPage:
        """Return a page object built from *request* and *page_cls*.

        *page_params* is a dict that the page object may access through the
        :class:`~web_poet.page_inputs.PageParams` dependency.

        Raises :exc:`ValueError` if a required Playwright engine is not
        exposed by Playwright.
        """
        request = _normalize_request(request)
        plan = andi.plan(
            page_cls,
            is_injectable=is_injectable,
            externally_provided=set(PROVIDERS),
        )
        instances: dict[Any, Any] = {}

        required_deps: set[type] = set()
        for fn_or_cls, _ in plan:
            base = strip_annotated(fn_or_cls)
            assert isinstance(base, type)
            required_deps.add(base)

        response_fetcher = ResponseFetcher(
            required_deps=required_deps,
            default_playwright_engine=self._default_playwright_engine,
        )

        # First pass: collect the engine names explicitly requested through
        # Annotated browser dependencies.
        explicit_engines: set[str] = set()
        for fn_or_cls, _ in plan:
            engine = self._annotated_engine(fn_or_cls)
            if engine is not None:
                explicit_engines.add(engine)

        # Pick the engine for browser dependencies without an annotation:
        # the configured default when nothing is explicit or when it is among
        # the explicit ones, otherwise the alphabetically first explicit one.
        if not explicit_engines:
            chosen_engine_for_unannotated = (
                self._default_playwright_engine or DEFAULT_PLAYWRIGHT_ENGINE
            )
        elif (
            self._default_playwright_engine
            and self._default_playwright_engine in explicit_engines
        ):
            chosen_engine_for_unannotated = self._default_playwright_engine
        else:
            chosen_engine_for_unannotated = min(explicit_engines)

        # Validate that the requested engines exist in Playwright before
        # doing any work.
        needed_browsers = set(explicit_engines)
        if required_deps & {BrowserResponse, BrowserHtml}:
            needed_browsers.add(chosen_engine_for_unannotated)
        if needed_browsers:
            async with async_playwright() as playwright:
                # Sorted so that the engine named in the error is
                # deterministic when several are unavailable.
                for b in sorted(needed_browsers):
                    if getattr(playwright, b, None) is None:
                        raise ValueError(
                            f"Playwright does not provide engine '{b}'"
                        )

        # Second pass: instantiate dependencies in plan order, forwarding the
        # engine name to providers when one applies.
        for fn_or_cls, kwargs_spec in plan:
            kwargs = kwargs_spec.kwargs(instances)
            base = strip_annotated(fn_or_cls)
            assert isinstance(base, type)

            playwright_engine_kw = self._annotated_engine(fn_or_cls)
            if (
                playwright_engine_kw is None
                and base in {BrowserResponse, BrowserHtml}
                and get_origin(fn_or_cls) is not Annotated
            ):
                # Only unannotated browser deps fall back to the chosen
                # engine; an Annotated one without a recognized engine marker
                # gets no engine keyword at all.
                playwright_engine_kw = chosen_engine_for_unannotated

            provider = PROVIDERS.get(base)
            if provider is not None:
                call_kwargs = {
                    "request": request,
                    "page_params": page_params,
                    "page_cls": page_cls,
                    "registry": self._registry,
                    "response_fetcher": response_fetcher,
                    "stats": self.stats,
                    **kwargs,
                }
                if playwright_engine_kw is not None:
                    call_kwargs["playwright_engine"] = playwright_engine_kw
                value = await ensure_awaitable(provider(**call_kwargs))
            else:
                value = await ensure_awaitable(base(**kwargs))
            instances[fn_or_cls] = value

        return instances[page_cls]

    async def get_item(
        self,
        request: RequestLike,
        item_or_page_cls: type,
        *,
        page_params: dict[Any, Any] | None = None,
    ) -> Any:
        """Return an item built from *request*.

        *item_or_page_cls* is either an item class or a page object class. If
        it is an item class, the page class to use is determined by the
        :class:`~web_poet.rules.RulesRegistry` passed to
        :class:`~web_poet.framework.Framework`.

        *page_params* is a dict that the page object may access through the
        :class:`~web_poet.page_inputs.PageParams` dependency.

        Raises :exc:`ValueError` if *item_or_page_cls* is an item class and
        the registry has no page object class for it at the request URL.
        """
        request = _normalize_request(request)
        page_cls: type | None
        if issubclass(item_or_page_cls, ItemPage):
            page_cls = item_or_page_cls
        else:
            page_cls = self._registry.page_cls_for_item(
                request.url, item_or_page_cls
            )
        if page_cls is None:
            raise ValueError(f"No page object class found for URL: {request.url}")
        page = await self.get_page(request, page_cls, page_params=page_params)
        return await ensure_awaitable(page.to_item())