test_compression.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378
  1. import gzip
  2. import io
  3. import os
  4. from pathlib import Path
  5. import subprocess
  6. import sys
  7. import tarfile
  8. import textwrap
  9. import time
  10. import zipfile
  11. import numpy as np
  12. import pytest
  13. from pandas.compat import is_platform_windows
  14. import pandas as pd
  15. import pandas._testing as tm
  16. import pandas.io.common as icom
  17. @pytest.mark.parametrize(
  18. "obj",
  19. [
  20. pd.DataFrame(
  21. 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
  22. columns=["X", "Y", "Z"],
  23. ),
  24. pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
  25. ],
  26. )
  27. @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
  28. def test_compression_size(obj, method, compression_only):
  29. if compression_only == "tar":
  30. compression_only = {"method": "tar", "mode": "w:gz"}
  31. with tm.ensure_clean() as path:
  32. getattr(obj, method)(path, compression=compression_only)
  33. compressed_size = os.path.getsize(path)
  34. getattr(obj, method)(path, compression=None)
  35. uncompressed_size = os.path.getsize(path)
  36. assert uncompressed_size > compressed_size
  37. @pytest.mark.parametrize(
  38. "obj",
  39. [
  40. pd.DataFrame(
  41. 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
  42. columns=["X", "Y", "Z"],
  43. ),
  44. pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
  45. ],
  46. )
  47. @pytest.mark.parametrize("method", ["to_csv", "to_json"])
  48. def test_compression_size_fh(obj, method, compression_only):
  49. with tm.ensure_clean() as path:
  50. with icom.get_handle(
  51. path,
  52. "w:gz" if compression_only == "tar" else "w",
  53. compression=compression_only,
  54. ) as handles:
  55. getattr(obj, method)(handles.handle)
  56. assert not handles.handle.closed
  57. compressed_size = os.path.getsize(path)
  58. with tm.ensure_clean() as path:
  59. with icom.get_handle(path, "w", compression=None) as handles:
  60. getattr(obj, method)(handles.handle)
  61. assert not handles.handle.closed
  62. uncompressed_size = os.path.getsize(path)
  63. assert uncompressed_size > compressed_size
  64. @pytest.mark.parametrize(
  65. "write_method, write_kwargs, read_method",
  66. [
  67. ("to_csv", {"index": False}, pd.read_csv),
  68. ("to_json", {}, pd.read_json),
  69. ("to_pickle", {}, pd.read_pickle),
  70. ],
  71. )
  72. def test_dataframe_compression_defaults_to_infer(
  73. write_method, write_kwargs, read_method, compression_only, compression_to_extension
  74. ):
  75. # GH22004
  76. input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"])
  77. extension = compression_to_extension[compression_only]
  78. with tm.ensure_clean("compressed" + extension) as path:
  79. getattr(input, write_method)(path, **write_kwargs)
  80. output = read_method(path, compression=compression_only)
  81. tm.assert_frame_equal(output, input)
  82. @pytest.mark.parametrize(
  83. "write_method,write_kwargs,read_method,read_kwargs",
  84. [
  85. ("to_csv", {"index": False, "header": True}, pd.read_csv, {"squeeze": True}),
  86. ("to_json", {}, pd.read_json, {"typ": "series"}),
  87. ("to_pickle", {}, pd.read_pickle, {}),
  88. ],
  89. )
  90. def test_series_compression_defaults_to_infer(
  91. write_method,
  92. write_kwargs,
  93. read_method,
  94. read_kwargs,
  95. compression_only,
  96. compression_to_extension,
  97. ):
  98. # GH22004
  99. input = pd.Series([0, 5, -2, 10], name="X")
  100. extension = compression_to_extension[compression_only]
  101. with tm.ensure_clean("compressed" + extension) as path:
  102. getattr(input, write_method)(path, **write_kwargs)
  103. if "squeeze" in read_kwargs:
  104. kwargs = read_kwargs.copy()
  105. del kwargs["squeeze"]
  106. output = read_method(path, compression=compression_only, **kwargs).squeeze(
  107. "columns"
  108. )
  109. else:
  110. output = read_method(path, compression=compression_only, **read_kwargs)
  111. tm.assert_series_equal(output, input, check_names=False)
  112. def test_compression_warning(compression_only):
  113. # Assert that passing a file object to to_csv while explicitly specifying a
  114. # compression protocol triggers a RuntimeWarning, as per GH21227.
  115. df = pd.DataFrame(
  116. 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
  117. columns=["X", "Y", "Z"],
  118. )
  119. with tm.ensure_clean() as path:
  120. with icom.get_handle(path, "w", compression=compression_only) as handles:
  121. with tm.assert_produces_warning(RuntimeWarning):
  122. df.to_csv(handles.handle, compression=compression_only)
  123. def test_compression_binary(compression_only):
  124. """
  125. Binary file handles support compression.
  126. GH22555
  127. """
  128. df = pd.DataFrame(
  129. 1.1 * np.arange(120).reshape((30, 4)),
  130. columns=pd.Index(list("ABCD")),
  131. index=pd.Index([f"i-{i}" for i in range(30)]),
  132. )
  133. # with a file
  134. with tm.ensure_clean() as path:
  135. with open(path, mode="wb") as file:
  136. df.to_csv(file, mode="wb", compression=compression_only)
  137. file.seek(0) # file shouldn't be closed
  138. tm.assert_frame_equal(
  139. df, pd.read_csv(path, index_col=0, compression=compression_only)
  140. )
  141. # with BytesIO
  142. file = io.BytesIO()
  143. df.to_csv(file, mode="wb", compression=compression_only)
  144. file.seek(0) # file shouldn't be closed
  145. tm.assert_frame_equal(
  146. df, pd.read_csv(file, index_col=0, compression=compression_only)
  147. )
  148. def test_gzip_reproducibility_file_name():
  149. """
  150. Gzip should create reproducible archives with mtime.
  151. Note: Archives created with different filenames will still be different!
  152. GH 28103
  153. """
  154. df = pd.DataFrame(
  155. 1.1 * np.arange(120).reshape((30, 4)),
  156. columns=pd.Index(list("ABCD")),
  157. index=pd.Index([f"i-{i}" for i in range(30)]),
  158. )
  159. compression_options = {"method": "gzip", "mtime": 1}
  160. # test for filename
  161. with tm.ensure_clean() as path:
  162. path = Path(path)
  163. df.to_csv(path, compression=compression_options)
  164. time.sleep(0.1)
  165. output = path.read_bytes()
  166. df.to_csv(path, compression=compression_options)
  167. assert output == path.read_bytes()
  168. def test_gzip_reproducibility_file_object():
  169. """
  170. Gzip should create reproducible archives with mtime.
  171. GH 28103
  172. """
  173. df = pd.DataFrame(
  174. 1.1 * np.arange(120).reshape((30, 4)),
  175. columns=pd.Index(list("ABCD")),
  176. index=pd.Index([f"i-{i}" for i in range(30)]),
  177. )
  178. compression_options = {"method": "gzip", "mtime": 1}
  179. # test for file object
  180. buffer = io.BytesIO()
  181. df.to_csv(buffer, compression=compression_options, mode="wb")
  182. output = buffer.getvalue()
  183. time.sleep(0.1)
  184. buffer = io.BytesIO()
  185. df.to_csv(buffer, compression=compression_options, mode="wb")
  186. assert output == buffer.getvalue()
  187. @pytest.mark.single_cpu
  188. def test_with_missing_lzma():
  189. """Tests if import pandas works when lzma is not present."""
  190. # https://github.com/pandas-dev/pandas/issues/27575
  191. code = textwrap.dedent(
  192. """\
  193. import sys
  194. sys.modules['lzma'] = None
  195. import pandas
  196. """
  197. )
  198. subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
  199. @pytest.mark.single_cpu
  200. def test_with_missing_lzma_runtime():
  201. """Tests if RuntimeError is hit when calling lzma without
  202. having the module available.
  203. """
  204. code = textwrap.dedent(
  205. """
  206. import sys
  207. import pytest
  208. sys.modules['lzma'] = None
  209. import pandas as pd
  210. df = pd.DataFrame()
  211. with pytest.raises(RuntimeError, match='lzma module'):
  212. df.to_csv('foo.csv', compression='xz')
  213. """
  214. )
  215. subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
  216. @pytest.mark.parametrize(
  217. "obj",
  218. [
  219. pd.DataFrame(
  220. 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
  221. columns=["X", "Y", "Z"],
  222. ),
  223. pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
  224. ],
  225. )
  226. @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
  227. def test_gzip_compression_level(obj, method):
  228. # GH33196
  229. with tm.ensure_clean() as path:
  230. getattr(obj, method)(path, compression="gzip")
  231. compressed_size_default = os.path.getsize(path)
  232. getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1})
  233. compressed_size_fast = os.path.getsize(path)
  234. assert compressed_size_default < compressed_size_fast
  235. @pytest.mark.parametrize(
  236. "obj",
  237. [
  238. pd.DataFrame(
  239. 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
  240. columns=["X", "Y", "Z"],
  241. ),
  242. pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
  243. ],
  244. )
  245. @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
  246. def test_xz_compression_level_read(obj, method):
  247. with tm.ensure_clean() as path:
  248. getattr(obj, method)(path, compression="xz")
  249. compressed_size_default = os.path.getsize(path)
  250. getattr(obj, method)(path, compression={"method": "xz", "preset": 1})
  251. compressed_size_fast = os.path.getsize(path)
  252. assert compressed_size_default < compressed_size_fast
  253. if method == "to_csv":
  254. pd.read_csv(path, compression="xz")
  255. @pytest.mark.parametrize(
  256. "obj",
  257. [
  258. pd.DataFrame(
  259. 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
  260. columns=["X", "Y", "Z"],
  261. ),
  262. pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
  263. ],
  264. )
  265. @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
  266. def test_bzip_compression_level(obj, method):
  267. """GH33196 bzip needs file size > 100k to show a size difference between
  268. compression levels, so here we just check if the call works when
  269. compression is passed as a dict.
  270. """
  271. with tm.ensure_clean() as path:
  272. getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1})
  273. @pytest.mark.parametrize(
  274. "suffix,archive",
  275. [
  276. (".zip", zipfile.ZipFile),
  277. (".tar", tarfile.TarFile),
  278. ],
  279. )
  280. def test_empty_archive_zip(suffix, archive):
  281. with tm.ensure_clean(filename=suffix) as path:
  282. with archive(path, "w"):
  283. pass
  284. with pytest.raises(ValueError, match="Zero files found"):
  285. pd.read_csv(path)
  286. def test_ambiguous_archive_zip():
  287. with tm.ensure_clean(filename=".zip") as path:
  288. with zipfile.ZipFile(path, "w") as file:
  289. file.writestr("a.csv", "foo,bar")
  290. file.writestr("b.csv", "foo,bar")
  291. with pytest.raises(ValueError, match="Multiple files found in ZIP file"):
  292. pd.read_csv(path)
  293. def test_ambiguous_archive_tar(tmp_path):
  294. csvAPath = tmp_path / "a.csv"
  295. with open(csvAPath, "w", encoding="utf-8") as a:
  296. a.write("foo,bar\n")
  297. csvBPath = tmp_path / "b.csv"
  298. with open(csvBPath, "w", encoding="utf-8") as b:
  299. b.write("foo,bar\n")
  300. tarpath = tmp_path / "archive.tar"
  301. with tarfile.TarFile(tarpath, "w") as tar:
  302. tar.add(csvAPath, "a.csv")
  303. tar.add(csvBPath, "b.csv")
  304. with pytest.raises(ValueError, match="Multiple files found in TAR archive"):
  305. pd.read_csv(tarpath)
  306. def test_tar_gz_to_different_filename():
  307. with tm.ensure_clean(filename=".foo") as file:
  308. pd.DataFrame(
  309. [["1", "2"]],
  310. columns=["foo", "bar"],
  311. ).to_csv(file, compression={"method": "tar", "mode": "w:gz"}, index=False)
  312. with gzip.open(file) as uncompressed:
  313. with tarfile.TarFile(fileobj=uncompressed) as archive:
  314. members = archive.getmembers()
  315. assert len(members) == 1
  316. content = archive.extractfile(members[0]).read().decode("utf8")
  317. if is_platform_windows():
  318. expected = "foo,bar\r\n1,2\r\n"
  319. else:
  320. expected = "foo,bar\n1,2\n"
  321. assert content == expected
  322. def test_tar_no_error_on_close():
  323. with io.BytesIO() as buffer:
  324. with icom._BytesTarFile(fileobj=buffer, mode="w"):
  325. pass