import asyncio
import logging
from dataclasses import dataclass
from http import HTTPStatus
from typing import Callable, Dict, Iterable, List, Optional, Union, cast
from web_poet.exceptions import HttpError, HttpResponseError
from web_poet.exceptions.core import NoSavedHttpResponse
from web_poet.page_inputs.http import (
HttpRequest,
HttpRequestBody,
HttpRequestHeaders,
HttpResponse,
request_fingerprint,
)
from web_poet.page_inputs.url import _Url
from web_poet.requests import _perform_request
from web_poet.utils import as_list
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Plain mapping of string keys to string values.
_StrMapping = Dict[str, str]
# Accepted type of the ``headers`` argument: a plain string mapping or an
# HttpRequestHeaders instance.
_Headers = Union[_StrMapping, HttpRequestHeaders]
# Accepted type of the ``body`` argument: raw bytes or an HttpRequestBody
# instance.
_Body = Union[bytes, HttpRequestBody]
# Accepted type of the ``allow_status`` argument: a single status given as a
# string or an int, or a list of such values.
_StatusList = Union[str, int, List[Union[str, int]]]
@dataclass
class _SavedResponseData:
    """Pair an HTTP request with the outcome of executing it.

    The outcome is either the response that was received or the
    :class:`~.HttpError` that was raised instead of one.
    """

    # The request that was executed.
    request: HttpRequest
    # The response obtained for the request, if there is one.
    response: Optional[HttpResponse]
    # The error raised while executing the request, if there is one.
    exception: Optional[HttpError] = None

    def fingerprint(self) -> str:
        """Compute the fingerprint of the stored request."""
        request_fp = request_fingerprint(self.request)
        return request_fp
class HttpClient:
    """Async HTTP client to be used in Page Objects.

    See :ref:`additional-requests` for the usage information.

    HttpClient doesn't make HTTP requests by itself. It uses either the
    request function assigned to the ``web_poet.request_downloader_var``
    :mod:`contextvar <contextvars>`, or a function passed via
    ``request_downloader`` argument of the :meth:`~.HttpClient.__init__` method.

    Either way, this function should be an ``async def`` function which
    receives an :class:`~.HttpRequest` instance, and either returns a
    :class:`~.HttpResponse` instance, or raises a subclass of
    :class:`~.HttpError`. You can read more in the
    :ref:`advanced-downloader-impl` documentation.
    """

    def __init__(
        self,
        request_downloader: Optional[Callable] = None,
        *,
        save_responses: bool = False,
        return_only_saved_responses: bool = False,
        responses: Optional[Iterable[_SavedResponseData]] = None,
    ):
        """Configure the downloader and the store of saved responses.

        :param request_downloader: function awaited with each
            :class:`~.HttpRequest`; ``_perform_request`` is used when it is
            omitted.
        :param save_responses: if ``True``, :meth:`~.HttpClient.execute`
            records every request together with its response, or with the
            :class:`~.HttpError` it raised.
        :param return_only_saved_responses: if ``True``,
            :meth:`~.HttpClient.execute` never calls the downloader and
            answers from the saved data only.
        :param responses: initial saved data; it is indexed by request
            fingerprint.
        """
        self._request_downloader = request_downloader or _perform_request
        self.save_responses = save_responses
        self.return_only_saved_responses = return_only_saved_responses
        # Keyed by request fingerprint so lookups in execute() are direct.
        self._saved_responses: Dict[str, _SavedResponseData] = {
            data.fingerprint(): data for data in responses or []
        }

    @staticmethod
    def _handle_status(
        response: HttpResponse,
        request: HttpRequest,
        *,
        allow_status: Optional[_StatusList] = None,
    ) -> None:
        """Raise :class:`~.HttpResponseError` for a disallowed error status.

        Nothing is raised when ``allow_status`` contains ``"*"``, when the
        response has no status, when the status is below 400, or when the
        status is listed in ``allow_status``.

        :param response: the response whose status is checked.
        :param request: the request attached to the raised exception.
        :param allow_status: status code(s) that must not raise.
        :raises HttpResponseError: for any other status.
        """
        allow_status_normalized = list(map(str, as_list(allow_status)))
        allow_all_status = any(s.strip() == "*" for s in allow_status_normalized)
        if (
            allow_all_status
            or response.status is None  # allows serialized responses from tests
            or response.status < 400
            or str(response.status) in allow_status_normalized
        ):
            return
        status_name = _http_status_name(response.status)
        msg = f"{response.status} {status_name} response for {response.url}"
        raise HttpResponseError(msg, request=request, response=response)

    async def request(
        self,
        url: Union[str, _Url],
        *,
        method: str = "GET",
        headers: Optional[_Headers] = None,
        body: Optional[_Body] = None,
        allow_status: Optional[_StatusList] = None,
    ) -> HttpResponse:
        """This is a shortcut for creating an :class:`~.HttpRequest` instance and
        executing that request.

        :class:`~.HttpRequestError` is raised for
        *connection errors*, *connection and read timeouts*, etc.

        An :class:`~.HttpResponse` instance is returned for successful
        responses in the ``100-3xx`` status code range.

        Otherwise, an exception of type :class:`~.HttpResponseError` is raised.

        Raising :class:`~.HttpResponseError` can be suppressed for certain
        status codes using the ``allow_status`` param - it is
        a list of status code values for which :class:`~.HttpResponse`
        should be returned instead of raising :class:`~.HttpResponseError`.

        There is a special "*" ``allow_status`` value which allows
        any status code.

        There is no need to include ``100-3xx`` status codes in ``allow_status``,
        because :class:`~.HttpResponseError` is not raised for them.
        """
        # Missing headers/body are sent as an empty mapping / empty bytes.
        headers = headers or {}
        body = body or b""
        req = HttpRequest(url=url, method=method, headers=headers, body=body)
        return await self.execute(req, allow_status=allow_status)

    async def get(
        self,
        url: Union[str, _Url],
        *,
        headers: Optional[_Headers] = None,
        allow_status: Optional[_StatusList] = None,
    ) -> HttpResponse:
        """Similar to :meth:`~.HttpClient.request` but performing a ``GET``
        request.
        """
        return await self.request(
            url=url,
            method="GET",
            headers=headers,
            allow_status=allow_status,
        )

    async def post(
        self,
        url: Union[str, _Url],
        *,
        headers: Optional[_Headers] = None,
        body: Optional[_Body] = None,
        allow_status: Optional[_StatusList] = None,
    ) -> HttpResponse:
        """Similar to :meth:`~.HttpClient.request` but performing a ``POST``
        request.
        """
        return await self.request(
            url=url,
            method="POST",
            headers=headers,
            body=body,
            allow_status=allow_status,
        )

    async def execute(
        self, request: HttpRequest, *, allow_status: Optional[_StatusList] = None
    ) -> HttpResponse:
        """Execute the specified :class:`~.HttpRequest` instance using the
        request implementation configured in the :class:`~.HttpClient`
        instance.

        :class:`~.HttpRequestError` is raised for
        *connection errors*, *connection and read timeouts*, etc.

        :class:`~.HttpResponse` instance is returned for successful
        responses in the ``100-3xx`` status code range.

        Otherwise, an exception of type :class:`~.HttpResponseError` is raised.

        Raising :class:`~.HttpResponseError` can be suppressed for certain
        status codes using the ``allow_status`` param - it is
        a list of status code values for which :class:`~.HttpResponse`
        should be returned instead of raising :class:`~.HttpResponseError`.

        There is a special "*" ``allow_status`` value which allows
        any status code.

        There is no need to include ``100-3xx`` status codes in ``allow_status``,
        because :class:`~.HttpResponseError` is not raised for them.

        When ``return_only_saved_responses`` is enabled, no request is sent:
        the saved outcome for the request fingerprint is replayed (its
        exception is re-raised, or its response is status-checked and
        returned), and ``NoSavedHttpResponse`` is raised if nothing was saved
        for that fingerprint.

        When ``save_responses`` is enabled, the request is saved together
        with its response, or with the :class:`~.HttpError` that was raised.
        """
        if self.return_only_saved_responses:
            # The store is keyed by fingerprint, so one lookup is enough.
            saved_data = self._saved_responses.get(request_fingerprint(request))
            if saved_data is None:
                raise NoSavedHttpResponse(request=request)
            if saved_data.exception:
                raise saved_data.exception
            # A saved entry without an exception is expected to hold a response.
            assert saved_data.response
            self._handle_status(
                saved_data.response,
                saved_data.request,
                allow_status=allow_status,
            )
            return saved_data.response

        try:
            response = await self._request_downloader(request)
        except HttpError as ex:
            if self.save_responses:
                # Record the failure so that it can be replayed later.
                self._saved_responses[request_fingerprint(request)] = (
                    _SavedResponseData(request, None, ex)
                )
            raise

        if self.save_responses:
            # Saved before the status check, so error responses are kept too.
            self._saved_responses[request_fingerprint(request)] = _SavedResponseData(
                request, response
            )
        self._handle_status(response, request, allow_status=allow_status)
        return response

    async def batch_execute(
        self,
        *requests: HttpRequest,
        return_exceptions: bool = False,
        allow_status: Optional[_StatusList] = None,
    ) -> List[Union[HttpResponse, HttpResponseError]]:
        """Similar to :meth:`~.HttpClient.execute` but accepts a collection of
        :class:`~.HttpRequest` instances that would be batch executed.

        The order of the :class:`~.HttpResponse` instances would correspond to
        the order of :class:`~.HttpRequest` passed.

        If any of the :class:`~.HttpRequest` raises an exception upon execution,
        the exception is raised.

        To prevent this, the actual exception can be returned alongside any
        successful :class:`~.HttpResponse`. This enables salvaging any usable
        responses despite any possible failures. This can be done by setting
        ``True`` to the ``return_exceptions`` parameter.

        Like :meth:`~.HttpClient.execute`, :class:`~.HttpResponseError`
        will be raised for responses with status codes in the ``400-5xx`` range.
        The ``allow_status`` parameter could be used the same way here to prevent
        these exceptions from being raised.

        You can omit ``allow_status="*"`` if you're passing ``return_exceptions=True``.
        However, it would be returning :class:`~.HttpResponseError`
        instead of :class:`~.HttpResponse`.

        Lastly, a :class:`~.HttpRequestError` may be raised
        on cases like *connection errors*, *connection and read timeouts*, etc.
        """
        coroutines = [self.execute(r, allow_status=allow_status) for r in requests]
        responses = await asyncio.gather(
            *coroutines, return_exceptions=return_exceptions
        )
        return cast(List[Union[HttpResponse, HttpResponseError]], responses)

    def get_saved_responses(self) -> Iterable[_SavedResponseData]:
        """Return saved requests and responses."""
        return self._saved_responses.values()
def _http_status_name(status: int):
"""
>>> _http_status_name(200)
'OK'
>>> _http_status_name(404)
'NOT_FOUND'
>>> _http_status_name(999)
'UNKNOWN'
"""
try:
return HTTPStatus(status).name
except ValueError:
return "UNKNOWN"