Source code for web_poet.page_inputs.http

import json
from hashlib import sha1
from typing import Any, Optional, TypeVar, Union
from urllib.parse import urljoin

import attrs
from w3lib.encoding import (
    html_body_declared_encoding,
    html_to_unicode,
    http_content_type_encoding,
    read_bom,
    resolve_encoding,
)
from w3lib.url import canonicalize_url

from web_poet._base import _HttpHeaders
from web_poet.mixins import SelectableMixin, UrlShortcutsMixin
from web_poet.utils import _create_deprecated_class, memoizemethod_noargs

from .url import RequestUrl as _RequestUrl
from .url import ResponseUrl as _ResponseUrl

T_headers = TypeVar("T_headers", bound=_HttpHeaders)


RequestUrl = _create_deprecated_class("RequestUrl", _RequestUrl)
ResponseUrl = _create_deprecated_class("ResponseUrl", _ResponseUrl)


[docs]class HttpRequestBody(bytes): """A container for holding the raw HTTP request body in bytes format.""" pass
[docs]class HttpResponseBody(bytes): """A container for holding the raw HTTP response body in bytes format."""
[docs] def bom_encoding(self) -> Optional[str]: """Returns the encoding from the byte order mark if present.""" return read_bom(self)[0]
[docs] def declared_encoding(self) -> Optional[str]: """Return the encoding specified in meta tags in the html body, or ``None`` if no suitable encoding was found""" return html_body_declared_encoding(self)
[docs] def json(self) -> Any: """ Deserialize a JSON document to a Python object. """ return json.loads(self)
[docs]class HttpRequestHeaders(_HttpHeaders): """A container for holding the HTTP request headers. It's able to accept instantiation via an Iterable of Tuples: >>> pairs = [("Content-Encoding", "gzip"), ("content-length", "648")] >>> HttpRequestHeaders(pairs) <HttpRequestHeaders('Content-Encoding': 'gzip', 'content-length': '648')> It's also accepts a mapping of key-value pairs as well: >>> pairs = {"Content-Encoding": "gzip", "content-length": "648"} >>> headers = HttpRequestHeaders(pairs) >>> headers <HttpRequestHeaders('Content-Encoding': 'gzip', 'content-length': '648')> Note that this also supports case insensitive header-key lookups: >>> headers.get("content-encoding") 'gzip' >>> headers.get("Content-Length") '648' These are just a few of the functionalities it inherits from :class:`multidict.CIMultiDict`. For more info on its other features, read the API spec of :class:`multidict.CIMultiDict`. """ pass
[docs]class HttpResponseHeaders(_HttpHeaders): """A container for holding the HTTP response headers. It's able to accept instantiation via an Iterable of Tuples: >>> pairs = [("Content-Encoding", "gzip"), ("content-length", "648")] >>> HttpResponseHeaders(pairs) <HttpResponseHeaders('Content-Encoding': 'gzip', 'content-length': '648')> It's also accepts a mapping of key-value pairs as well: >>> pairs = {"Content-Encoding": "gzip", "content-length": "648"} >>> headers = HttpResponseHeaders(pairs) >>> headers <HttpResponseHeaders('Content-Encoding': 'gzip', 'content-length': '648')> Note that this also supports case insensitive header-key lookups: >>> headers.get("content-encoding") 'gzip' >>> headers.get("Content-Length") '648' These are just a few of the functionalities it inherits from :class:`multidict.CIMultiDict`. For more info on its other features, read the API spec of :class:`multidict.CIMultiDict`. """
[docs] def declared_encoding(self) -> Optional[str]: """Return encoding detected from the Content-Type header, or None if encoding is not found""" content_type = self.get("Content-Type", "") return http_content_type_encoding(content_type)
[docs]@attrs.define(auto_attribs=False, slots=False, eq=False) class HttpRequest: """Represents a generic HTTP request used by other functionalities in **web-poet** like :class:`~.HttpClient`. """ url: _RequestUrl = attrs.field(converter=_RequestUrl) method: str = attrs.field(default="GET", kw_only=True) headers: HttpRequestHeaders = attrs.field( factory=HttpRequestHeaders, converter=HttpRequestHeaders, kw_only=True ) body: HttpRequestBody = attrs.field( factory=HttpRequestBody, converter=HttpRequestBody, kw_only=True )
[docs] def urljoin(self, url: Union[str, _RequestUrl, _ResponseUrl]) -> _RequestUrl: """Return *url* as an absolute URL. If *url* is relative, it is made absolute relative to :attr:`url`.""" return _RequestUrl(urljoin(str(self.url), str(url)))
[docs]@attrs.define(auto_attribs=False, slots=False, eq=False) class HttpResponse(SelectableMixin, UrlShortcutsMixin): """A container for the contents of a response, downloaded directly using an HTTP client. ``url`` should be a URL of the response (after all redirects), not a URL of the request, if possible. ``body`` contains the raw HTTP response body. The following are optional since it would depend on the source of the ``HttpResponse`` if these are available or not. For example, the responses could simply come off from a local HTML file which doesn't contain ``headers`` and ``status``. ``status`` should represent the int status code of the HTTP response. ``headers`` should contain the HTTP response headers. ``encoding`` encoding of the response. If None (default), encoding is auto-detected from headers and body content. """ url: _ResponseUrl = attrs.field(converter=_ResponseUrl) body: HttpResponseBody = attrs.field(converter=HttpResponseBody) status: Optional[int] = attrs.field(default=None, kw_only=True) headers: HttpResponseHeaders = attrs.field( factory=HttpResponseHeaders, converter=HttpResponseHeaders, kw_only=True ) _encoding: Optional[str] = attrs.field(default=None, kw_only=True) _DEFAULT_ENCODING = "ascii" _cached_text: Optional[str] = None @property def text(self) -> str: """ Content of the HTTP body, converted to unicode using the detected encoding of the response, according to the web browser rules (respecting Content-Type header, etc.) """ # Access self.encoding before self._cached_text, because # there is a chance self._cached_text would be already populated # while detecting the encoding encoding = self.encoding if self._cached_text is None: fake_content_type_header = f"charset={encoding}" encoding, text = html_to_unicode(fake_content_type_header, self.body) self._cached_text = text return self._cached_text def _selector_input(self) -> str: return self.text @property def encoding(self) -> Optional[str]: """Encoding of the response""" return ( self._encoding or self._body_bom_encoding() or self._headers_declared_encoding() or self._body_declared_encoding() or self._body_inferred_encoding() )
[docs] @memoizemethod_noargs def json(self) -> Any: """Deserialize a JSON document to a Python object.""" return self.body.json()
@memoizemethod_noargs def _body_bom_encoding(self) -> Optional[str]: return self.body.bom_encoding() @memoizemethod_noargs def _headers_declared_encoding(self) -> Optional[str]: return self.headers.declared_encoding() @memoizemethod_noargs def _body_declared_encoding(self) -> Optional[str]: return self.body.declared_encoding() @memoizemethod_noargs def _body_inferred_encoding(self) -> Optional[str]: content_type = self.headers.get("Content-Type", "") body_encoding, text = html_to_unicode( content_type, self.body, # FIXME: type ignore can be removed when the following is released: # https://github.com/scrapy/w3lib/pull/190 auto_detect_fun=self._auto_detect_fun, # type: ignore[arg-type] default_encoding=self._DEFAULT_ENCODING, ) self._cached_text = text return body_encoding def _auto_detect_fun(self, body: bytes) -> Optional[str]: for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"): try: body.decode(enc) except UnicodeError: continue return resolve_encoding(enc)
[docs]def request_fingerprint(req: HttpRequest) -> str: """Return the fingerprint of the request.""" fp = sha1() fp.update(req.method.encode() + b"\n") fp.update(canonicalize_url(str(req.url)).encode() + b"\n") for name, value in sorted(req.headers.items()): fp.update(f"{name.title()}:{value}\n".encode()) fp.update(b"\n") fp.update(req.body) return fp.hexdigest()