# Source code for web_poet.mixins

from __future__ import annotations

import abc
from typing import TYPE_CHECKING, Protocol, Union
from urllib.parse import urljoin

import parsel
from w3lib.html import get_base_url

if TYPE_CHECKING:
    from web_poet.page_inputs.http import HttpResponse  # pragma: nocover
    from web_poet.page_inputs.url import RequestUrl, ResponseUrl  # pragma: nocover


class SelectorShortcutsMixin:
    """Query shortcuts that forward to the object's ``.selector``.

    This mixin does not define ``selector`` itself; every method reads it
    from ``self``, so the host class has to provide it.
    """

    def xpath(self, query, **kwargs) -> parsel.SelectorList:
        """Forward *query* to ``.selector.xpath()`` and return its result.

        Extra keyword arguments are passed through unchanged.
        """
        selector = self.selector  # type: ignore[attr-defined]
        return selector.xpath(query, **kwargs)

    def css(self, query) -> parsel.SelectorList:
        """Forward *query* to ``.selector.css()`` and return its result."""
        selector = self.selector  # type: ignore[attr-defined]
        return selector.css(query)

    def jmespath(self, query: str, **kwargs) -> parsel.SelectorList:
        """Forward *query* to ``.selector.jmespath()`` and return its result.

        Extra keyword arguments are passed through unchanged.

        Raises :class:`AttributeError` when the selector object has no
        ``jmespath`` attribute.
        """
        selector = self.selector  # type: ignore[attr-defined]
        if hasattr(selector, "jmespath"):
            return selector.jmespath(query, **kwargs)
        raise AttributeError(
            "Please install parsel >= 1.8.1 to get jmespath support"
        )


class SelectableMixin(abc.ABC, SelectorShortcutsMixin):
    """
    Inherit from this mixin, implement ``._selector_input`` method,
    get ``.selector`` property and ``.xpath`` / ``.css`` / ``.jmespath`` methods.
    """

    # Cache slot for the ``selector`` property; ``None`` means the selector
    # has not been built yet.
    __cached_selector = None

    @abc.abstractmethod
    def _selector_input(self) -> str:
        """Return the text that the ``selector`` property hands to parsel."""
        raise NotImplementedError()  # pragma: nocover

    @property
    def selector(self) -> parsel.Selector:
        """Cached instance of :external:class:`parsel.selector.Selector`.

        The selector is built on first access from ``._selector_input()``.
        When the object has a ``url`` attribute, ``str(self.url)`` is passed
        as the selector's ``base_url``; otherwise ``base_url`` is ``None``.
        Later accesses return the object built the first time.
        """
        # caching is implemented in a manual way to avoid issues with
        # non-hashable classes, where memoizemethod_noargs doesn't work
        if self.__cached_selector is not None:
            return self.__cached_selector
        base_url = str(self.url) if hasattr(self, "url") else None
        sel = parsel.Selector(text=self._selector_input(), base_url=base_url)
        self.__cached_selector = sel
        return sel
class UrlShortcutsMixin:
    """Provide ``.urljoin()`` resolved against a lazily computed base URL.

    The host class must expose a ``url`` attribute and either a
    ``_selector_input()`` method or its own ``_url_shortcuts_input()``.
    """

    # Cache slot for ``_base_url``; ``None`` means it has not been computed.
    _cached_base_url = None

    def _url_shortcuts_input(self) -> str:
        """Return the text from which the base URL is extracted.

        Defaults to whatever ``_selector_input()`` returns.
        """
        return self._selector_input()  # type: ignore[attr-defined]

    @property
    def _base_url(self) -> str:
        """Base URL used by :meth:`urljoin`, computed once and then cached.

        It is obtained by passing the first 4096 characters of
        ``_url_shortcuts_input()`` and ``str(self.url)`` to
        ``w3lib.html.get_base_url``.
        """
        if self._cached_base_url is None:
            # Only the leading 4096 characters of the input are inspected.
            text = self._url_shortcuts_input()[:4096]
            self._cached_base_url = get_base_url(text, str(self.url))  # type: ignore[attr-defined]
        return self._cached_base_url

    def urljoin(self, url: Union[str, RequestUrl, ResponseUrl]) -> RequestUrl:
        """Return *url* as an absolute URL.

        If *url* is relative, it is made absolute relative to the base URL of
        *self*."""
        # Imported here because the module-level import of RequestUrl is
        # guarded by TYPE_CHECKING and is therefore unavailable at runtime.
        from web_poet.page_inputs.url import RequestUrl

        return RequestUrl(urljoin(self._base_url, str(url)))


class ResponseProtocol(Protocol):
    """Structural type for objects that carry a ``response`` attribute."""

    response: HttpResponse
class ResponseShortcutsMixin(ResponseProtocol, SelectableMixin, UrlShortcutsMixin):
    """Common shortcut methods for working with HTML responses.
    This mixin could be used with Page Object base classes.

    It requires "response" attribute to be present.
    """

    # Cache slot read and written by the inherited ``_base_url`` property.
    _cached_base_url = None

    @property
    def url(self) -> str:
        """Shortcut to HTML Response's URL, as a string."""
        return str(self.response.url)

    @property
    def html(self) -> str:
        """Shortcut to HTML Response's content."""
        return self.response.text

    def _selector_input(self) -> str:
        """Return the response text as the input for ``.selector``."""
        return self.html

    @property
    def base_url(self) -> str:
        """Return the base url of the given response"""
        return self._base_url

    def urljoin(self, url: str) -> str:  # type: ignore[override]
        """Convert url to absolute, taking into account
        url and baseurl of the response.

        Unlike the inherited implementation, the result is returned as a
        plain string."""
        return str(super().urljoin(url))