From 736080f5c0881279536dcc6e44b51d1acfaf9351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 26 Sep 2023 12:00:49 +0200 Subject: [PATCH 1/7] Drop the Set-Cookie header --- scrapy_zyte_api/responses.py | 6 +++++- tests/test_responses.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 18b63401..7456e799 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -22,7 +22,11 @@ class ZyteAPIMixin: # Zyte API already decompresses the HTTP Response Body. Scrapy's # HttpCompressionMiddleware will error out when it attempts to # decompress an already decompressed body based on this header. - "content-encoding" + "content-encoding", + # Cookies should be fetched from experimental.responseCookies only, the + # Set-Cookie may define cookies from the main HTTP response that were + # overriden later on during browser rendering, for example. + "set-cookie", } def __init__(self, *args, raw_api_response: Optional[Dict] = None, **kwargs): diff --git a/tests/test_responses.py b/tests/test_responses.py index 9705a6fc..836f31d3 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -235,6 +235,7 @@ def test_response_headers_removal(api_response, cls): """ additional_headers = [ {"name": "Content-Encoding", "value": "gzip"}, + {"name": "Set-Cookie", "value": "a=b"}, {"name": "X-Some-Other-Value", "value": "123"}, ] raw_response = api_response() From 5d25576f2295c57a191c53159a912562ce814175 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 26 Sep 2023 12:03:35 +0200 Subject: [PATCH 2/7] Fix a typo, reword a bit --- scrapy_zyte_api/responses.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 7456e799..964dd939 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -24,8 +24,8 @@ class ZyteAPIMixin: # decompress an already decompressed body based on this header. "content-encoding", # Cookies should be fetched from experimental.responseCookies only, the - # Set-Cookie may define cookies from the main HTTP response that were - # overriden later on during browser rendering, for example. + # Set-Cookie header may define cookies from the main HTTP response that + # were overriden later on, e.g. during browser rendering. "set-cookie", } From 867cf7780311c1704c34663c58340fe66b73c115 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 4 Oct 2023 09:13:53 +0200 Subject: [PATCH 3/7] Do not drop Set-Cookie for HTTP responses without experimenta.responseCookies --- scrapy_zyte_api/responses.py | 27 +++++++---- tests/test_responses.py | 94 +++++++++++++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 11 deletions(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 964dd939..25af55a8 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -1,4 +1,5 @@ from base64 import b64decode +from copy import copy from datetime import datetime from typing import Any, Dict, List, Optional, Tuple, Union @@ -23,10 +24,6 @@ class ZyteAPIMixin: # HttpCompressionMiddleware will error out when it attempts to # decompress an already decompressed body based on this header. "content-encoding", - # Cookies should be fetched from experimental.responseCookies only, the - # Set-Cookie header may define cookies from the main HTTP response that - # were overriden later on, e.g. during browser rendering. - "set-cookie", } def __init__(self, *args, raw_api_response: Optional[Dict] = None, **kwargs): @@ -94,18 +91,28 @@ def _prepare_headers(cls, api_response: Dict[str, Any]): input_headers: Optional[List[Dict[str, str]]] = api_response.get( "httpResponseHeaders" ) + response_cookies: Optional[List[Dict[str, str]]] = api_response.get( + "experimental", {} + ).get("responseCookies") if input_headers: + headers_to_remove = copy(cls.REMOVE_HEADERS) + if response_cookies or "httpResponseBody" not in api_response: + # Only the experimental.responseCookies response field should be + # used in general to get cookies. since the Set-Cookie header may + # define cookies from the main HTTP response that were overridden + # later on, e.g. during browser rendering. + # However, for HTTP requests (i.e. with httpResponseBody), it + # is OK to keep the Set-Cookie header if + # experimental.responseCookies was not received. + headers_to_remove.add("set-cookie") result = { h["name"]: [h["value"]] for h in input_headers - if h["name"].lower() not in cls.REMOVE_HEADERS + if h["name"].lower() not in headers_to_remove } - input_cookies: Optional[List[Dict[str, str]]] = api_response.get( - "experimental", {} - ).get("responseCookies") - if input_cookies: + if response_cookies: result["Set-Cookie"] = [] - for cookie in input_cookies: + for cookie in response_cookies: result["Set-Cookie"].append( cls._response_cookie_to_header_value(cookie) ) diff --git a/tests/test_responses.py b/tests/test_responses.py index 836f31d3..bf8341c3 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -243,16 +243,108 @@ def test_response_headers_removal(api_response, cls): response = cls.from_api_response(raw_response) - assert response.headers == { + expected_headers = { b"X-Some-Other-Value": [b"123"], **OUTPUT_COOKIE_HEADERS, } + assert response.headers == expected_headers assert ( response.raw_api_response["httpResponseHeaders"] == raw_response["httpResponseHeaders"] ) +INPUT_COOKIES_SIMPLE = [{"name": "c", "value": "d"}] + + +@pytest.mark.parametrize( + "fields,cls,keep", + [ + # For HTTP requests, whether the response Set-Cookie header is kept or + # not depends on whether experimental.responseCookies is received. + *( + ( + { + "httpResponseBody": b64encode(PAGE_CONTENT.encode("utf-8")), + "httpResponseHeaders": [ + {"name": "Content-Type", "value": "text/html"}, + {"name": "Content-Length", "value": str(len(PAGE_CONTENT))}, + ], + **cookie_fields, + }, + ZyteAPIResponse, + keep, + ) + for cookie_fields, keep in ( + # No response cookies, so Set-Cookie is kept. + ( + {}, + True, + ), + # Response cookies, so Set-Cookie is not kept. + ( + { + "experimental": { + "responseCookies": INPUT_COOKIES_SIMPLE, + }, + }, + False, + ), + ) + ), + # For non-HTTP requests, the response Set-Cookie header is always + # dropped. + *( + ( + { + "browserHtml": PAGE_CONTENT, + "httpResponseHeaders": [ + {"name": "Content-Type", "value": "text/html"}, + {"name": "Content-Length", "value": str(len(PAGE_CONTENT))}, + ], + **cookie_fields, + }, + ZyteAPITextResponse, + False, + ) + for cookie_fields in ( + {}, + { + "experimental": { + "responseCookies": INPUT_COOKIES_SIMPLE, + }, + }, + ) + ), + ], +) +def test_response_cookie_header(fields, cls, keep): + """Test the logic to keep or not the Set-Cookie header in response + headers.""" + expected_headers = { + **{ + header["name"].encode(): [header["value"].encode()] + for header in fields["httpResponseHeaders"] + }, + } + if keep: + expected_headers[b"Set-Cookie"] = [b"a=b"] + elif "experimental" in fields: + expected_headers[b"Set-Cookie"] = [b"c=d"] + + fields["url"] = "https://example.com" + fields["statusCode"] = 200 + fields["httpResponseHeaders"].append({"name": "Set-Cookie", "value": "a=b"}) + + response = cls.from_api_response(fields) + + assert response.headers == expected_headers + assert ( + response.raw_api_response["httpResponseHeaders"] + == fields["httpResponseHeaders"] + ) + + def test__process_response_no_body(): """The _process_response() function should handle missing 'browserHtml' or 'httpResponseBody'. From 97acb86e0ac627e0d93efa7c1db213789b021812 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 4 Oct 2023 09:19:46 +0200 Subject: [PATCH 4/7] Comment improvements --- scrapy_zyte_api/responses.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 25af55a8..a384e8bb 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -97,13 +97,15 @@ def _prepare_headers(cls, api_response: Dict[str, Any]): if input_headers: headers_to_remove = copy(cls.REMOVE_HEADERS) if response_cookies or "httpResponseBody" not in api_response: - # Only the experimental.responseCookies response field should be - # used in general to get cookies. since the Set-Cookie header may - # define cookies from the main HTTP response that were overridden - # later on, e.g. during browser rendering. + # Only the experimental.responseCookies response field should + # be used in general to get cookies, since the Set-Cookie + # header may define cookies from the main HTTP response that + # were overridden later on, e.g. during browser rendering. # However, for HTTP requests (i.e. with httpResponseBody), it # is OK to keep the Set-Cookie header if - # experimental.responseCookies was not received. + # experimental.responseCookies was not received, as it should + # be identical to what experimental.responseCookies would + # contain. headers_to_remove.add("set-cookie") result = { h["name"]: [h["value"]] From ce2da624bb82687e392f9fca1739f4316025e353 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 4 Oct 2023 09:25:28 +0200 Subject: [PATCH 5/7] Silence mypy issues in new tests --- tests/test_responses.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_responses.py b/tests/test_responses.py index bf8341c3..9299ef5b 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -270,7 +270,7 @@ def test_response_headers_removal(api_response, cls): {"name": "Content-Type", "value": "text/html"}, {"name": "Content-Length", "value": str(len(PAGE_CONTENT))}, ], - **cookie_fields, + **cookie_fields, # type: ignore[dict-item] }, ZyteAPIResponse, keep, @@ -302,7 +302,7 @@ def test_response_headers_removal(api_response, cls): {"name": "Content-Type", "value": "text/html"}, {"name": "Content-Length", "value": str(len(PAGE_CONTENT))}, ], - **cookie_fields, + **cookie_fields, # type: ignore[dict-item] }, ZyteAPITextResponse, False, From ad1444a07f341db8c234b0565ba40f0c87ad01c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 23 Nov 2023 11:54:11 +0100 Subject: [PATCH 6/7] Always drop Set-Cookie if responseCookies are received --- scrapy_zyte_api/responses.py | 11 +-------- tests/test_responses.py | 44 +++++++++++++----------------------- 2 files changed, 17 insertions(+), 38 deletions(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index a384e8bb..95ee28fd 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -96,16 +96,7 @@ def _prepare_headers(cls, api_response: Dict[str, Any]): ).get("responseCookies") if input_headers: headers_to_remove = copy(cls.REMOVE_HEADERS) - if response_cookies or "httpResponseBody" not in api_response: - # Only the experimental.responseCookies response field should - # be used in general to get cookies, since the Set-Cookie - # header may define cookies from the main HTTP response that - # were overridden later on, e.g. during browser rendering. - # However, for HTTP requests (i.e. with httpResponseBody), it - # is OK to keep the Set-Cookie header if - # experimental.responseCookies was not received, as it should - # be identical to what experimental.responseCookies would - # contain. + if response_cookies: headers_to_remove.add("set-cookie") result = { h["name"]: [h["value"]] diff --git a/tests/test_responses.py b/tests/test_responses.py index 9299ef5b..b5958123 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -260,21 +260,33 @@ def test_response_headers_removal(api_response, cls): @pytest.mark.parametrize( "fields,cls,keep", [ - # For HTTP requests, whether the response Set-Cookie header is kept or - # not depends on whether experimental.responseCookies is received. + # Only keep the Set-Cookie header if experimental.responseCookies is + # not received. *( ( { - "httpResponseBody": b64encode(PAGE_CONTENT.encode("utf-8")), + **output_fields, "httpResponseHeaders": [ {"name": "Content-Type", "value": "text/html"}, {"name": "Content-Length", "value": str(len(PAGE_CONTENT))}, ], **cookie_fields, # type: ignore[dict-item] }, - ZyteAPIResponse, + response_cls, keep, ) + for output_fields, response_cls in ( + ( + {"httpResponseBody": b64encode(PAGE_CONTENT.encode("utf-8"))}, + ZyteAPIResponse, + ), + ( + { + "browserHtml": PAGE_CONTENT, + }, + ZyteAPITextResponse, + ), + ) for cookie_fields, keep in ( # No response cookies, so Set-Cookie is kept. ( @@ -292,30 +304,6 @@ def test_response_headers_removal(api_response, cls): ), ) ), - # For non-HTTP requests, the response Set-Cookie header is always - # dropped. - *( - ( - { - "browserHtml": PAGE_CONTENT, - "httpResponseHeaders": [ - {"name": "Content-Type", "value": "text/html"}, - {"name": "Content-Length", "value": str(len(PAGE_CONTENT))}, - ], - **cookie_fields, # type: ignore[dict-item] - }, - ZyteAPITextResponse, - False, - ) - for cookie_fields in ( - {}, - { - "experimental": { - "responseCookies": INPUT_COOKIES_SIMPLE, - }, - }, - ) - ), ], ) def test_response_cookie_header(fields, cls, keep): From f9fb461a08b47a81a5f9a697626c47ae5c5a6edc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 24 Nov 2023 09:42:09 +0100 Subject: [PATCH 7/7] Fix CI issues --- docs/usage/manual.rst | 2 +- scrapy_zyte_api/responses.py | 2 +- tests/test_responses.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/usage/manual.rst b/docs/usage/manual.rst index 525c530c..8293caf2 100644 --- a/docs/usage/manual.rst +++ b/docs/usage/manual.rst @@ -64,4 +64,4 @@ remember to also request :http:`request:httpResponseHeaders`: # "…" To learn more about Zyte API parameters, see the upstream :ref:`usage -` and :ref:`API reference ` pages. +` and :ref:`API reference ` pages. diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 6de4b666..1a8c4cef 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -57,7 +57,7 @@ def replace(self, *args, **kwargs): def raw_api_response(self) -> Optional[Dict]: """Contains the raw API response from Zyte API. - For the full list of parameters, see :ref:`zyte-api-http-api`. + For the full list of parameters, see :ref:`zyte-api-reference`. """ return self._raw_api_response diff --git a/tests/test_responses.py b/tests/test_responses.py index b5958123..e35a079f 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -1,6 +1,7 @@ from base64 import b64encode from collections import defaultdict from functools import partial +from typing import Any, Dict, cast import pytest from scrapy import Request @@ -265,7 +266,7 @@ def test_response_headers_removal(api_response, cls): *( ( { - **output_fields, + **cast(Dict[Any, Any], output_fields), "httpResponseHeaders": [ {"name": "Content-Type", "value": "text/html"}, {"name": "Content-Length", "value": str(len(PAGE_CONTENT))},