| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175 |
- """
- Tests for the pandas custom headers in http(s) requests
- """
- from functools import partial
- import gzip
- from io import BytesIO
- import pytest
- from pandas._config import using_string_dtype
- import pandas.util._test_decorators as td
- import pandas as pd
- import pandas._testing as tm
# Marks applied to every test collected from this module.
pytestmark = [
    # Project marker "single_cpu" (presumably used by the test runner to
    # keep these tests out of parallel workers -- confirm in the pytest config).
    pytest.mark.single_cpu,
    # Project marker "network": the tests below talk to an HTTP server.
    pytest.mark.network,
    # Silence the DeprecationWarning whose message starts with
    # "Passing a BlockManager to DataFrame".
    pytest.mark.filterwarnings(
        "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
    ),
]
def gzip_bytes(response_bytes):
    """Return ``response_bytes`` compressed as a complete gzip stream."""
    buffer = BytesIO()
    with buffer:
        compressor = gzip.GzipFile(fileobj=buffer, mode="w")
        with compressor:
            compressor.write(response_bytes)
        # The GzipFile has to be closed before reading so the gzip trailer is
        # written; the BytesIO has to still be open for getvalue() to work.
        compressed = buffer.getvalue()
    return compressed
def csv_responder(df):
    """Serialize ``df`` to CSV without the index and return UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")
def gz_csv_responder(df):
    """Return the ``csv_responder`` payload for ``df``, gzip-compressed."""
    plain_csv = csv_responder(df)
    return gzip_bytes(plain_csv)
def json_responder(df):
    """Serialize ``df`` with ``DataFrame.to_json`` and return UTF-8 bytes."""
    json_text = df.to_json()
    return json_text.encode("utf-8")
def gz_json_responder(df):
    """Return the ``json_responder`` payload for ``df``, gzip-compressed."""
    plain_json = json_responder(df)
    return gzip_bytes(plain_json)
def html_responder(df):
    """Render ``df`` with ``DataFrame.to_html`` (no index) as UTF-8 bytes."""
    html_text = df.to_html(index=False)
    return html_text.encode("utf-8")
def parquetpyarrow_reponder(df):
    """
    Return ``df`` written as parquet by the pyarrow engine, without the index.

    No path is passed to ``to_parquet``, so the serialized payload is returned
    directly instead of being written to a file.

    NOTE: the misspelled name ("reponder") is kept because the test
    parametrization refers to it.
    """
    payload = df.to_parquet(index=False, engine="pyarrow")
    return payload
def parquetfastparquet_responder(df):
    """
    Return ``df`` written as uncompressed parquet by the fastparquet engine.

    The frame is written to an fsspec ``memory://`` URL and the bytes are read
    back from there, rather than writing into an in-memory buffer. Per the
    original author's note, fastparquet does not cope well with writing to a
    buffer: it closes the buffer itself, which wipes the contents.

    The fsspec import is local; the test that uses this responder skips when
    fsspec or fastparquet is not installed.
    """
    import fsspec

    memory_url = "memory://fastparquet_user_agent.parquet"
    df.to_parquet(memory_url, index=False, engine="fastparquet", compression=None)
    with fsspec.open(memory_url, "rb") as stored:
        payload = stored.read()
    return payload
def pickle_respnder(df):
    """
    Return the bytes produced by pickling ``df`` with ``DataFrame.to_pickle``.

    NOTE: the misspelled name ("respnder") is kept because the test
    parametrization refers to it.
    """
    buffer = BytesIO()
    try:
        df.to_pickle(buffer)
        payload = buffer.getvalue()
    finally:
        buffer.close()
    return payload
def stata_responder(df):
    """Return ``df`` serialized by ``DataFrame.to_stata`` without the index."""
    buffer = BytesIO()
    try:
        df.to_stata(buffer, write_index=False)
        payload = buffer.getvalue()
    finally:
        buffer.close()
    return payload
@pytest.mark.parametrize(
    "responder, read_method",
    [
        (csv_responder, pd.read_csv),
        (json_responder, pd.read_json),
        (
            html_responder,
            lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0],
        ),
        pytest.param(
            parquetpyarrow_reponder,
            partial(pd.read_parquet, engine="pyarrow"),
            marks=td.skip_if_no("pyarrow"),
        ),
        pytest.param(
            parquetfastparquet_responder,
            partial(pd.read_parquet, engine="fastparquet"),
            # TODO(ArrayManager) fastparquet
            marks=[
                td.skip_if_no("fastparquet"),
                td.skip_if_no("fsspec"),
                td.skip_array_manager_not_yet_implemented,
                pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string"),
            ],
        ),
        (pickle_respnder, pd.read_pickle),
        (stata_responder, pd.read_stata),
        (gz_csv_responder, pd.read_csv),
        (gz_json_responder, pd.read_json),
    ],
)
@pytest.mark.parametrize(
    "storage_options",
    [
        None,
        {"User-Agent": "foo"},
        {"User-Agent": "foo", "Auth": "bar"},
    ],
)
def test_request_headers(responder, read_method, httpserver, storage_options):
    """
    Check that ``storage_options`` are sent as HTTP request headers.

    The frame is serialized by ``responder``, served by ``httpserver`` and read
    back with ``read_method``. The test then asserts that the frame
    round-trips, that every header given in ``storage_options`` arrived with
    the requested value, and that the request carried no headers beyond the
    defaults and the requested ones.

    Parameters
    ----------
    responder : callable
        Takes a DataFrame and returns the bytes the server should respond with.
        Responders with "gz" in their name produce gzip-compressed payloads.
    read_method : callable
        Reader called as ``read_method(url, storage_options=...)``.
    httpserver : fixture
        Local HTTP server fixture exposing ``serve_content``, ``url`` and
        ``requests`` (presumably from pytest-localserver -- confirm).
    storage_options : dict or None
        Extra request headers to pass to the reader.
    """
    expected = pd.DataFrame({"a": ["b"]})
    default_headers = ["Accept-Encoding", "Host", "Connection", "User-Agent"]
    if "gz" in responder.__name__:
        extra = {"Content-Encoding": "gzip"}
        # Build a new dict rather than updating in place: the dict objects in
        # the parametrize list are shared by every test case that uses them,
        # so an in-place update would leak "Content-Encoding" into later cases.
        storage_options = {**(storage_options or {}), **extra}
    else:
        extra = None
    expected_headers = set(default_headers).union(
        storage_options.keys() if storage_options else []
    )
    # For gzip payloads the server also announces the encoding in its response.
    httpserver.serve_content(content=responder(expected), headers=extra)
    result = read_method(httpserver.url, storage_options=storage_options)
    tm.assert_frame_equal(result, expected)

    request_headers = dict(httpserver.requests[0].headers)
    for header in expected_headers:
        # pop() raises KeyError if an expected header was not sent at all.
        sent_value = request_headers.pop(header)
        if storage_options and header in storage_options:
            assert sent_value == storage_options[header]
    # No extra headers added
    assert not request_headers
@pytest.mark.parametrize("engine", ["pyarrow", "fastparquet"])
def test_to_parquet_to_disk_with_storage_options(engine):
    """
    ``to_parquet`` must reject ``storage_options`` for a plain local path.

    The test is skipped when ``engine`` is not importable. Otherwise writing to
    a local file path with ``storage_options`` set is expected to raise
    ``ValueError`` with one of the two accepted messages.
    """
    custom_headers = {"User-Agent": "custom", "Auth": "other_custom"}

    pytest.importorskip(engine)

    frame = pd.DataFrame({"column_name": ["column_value"]})
    local_path = "/tmp/junk.parquet"
    # Regex alternation: either wording of the error is accepted.
    expected_msg = (
        "storage_options passed with file object or non-fsspec file path|"
        "storage_options passed with buffer, or non-supported URL"
    )
    with pytest.raises(ValueError, match=expected_msg):
        frame.to_parquet(local_path, storage_options=custom_headers, engine=engine)
|