test_info.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589
  1. from io import StringIO
  2. import re
  3. from string import ascii_uppercase
  4. import sys
  5. import textwrap
  6. import numpy as np
  7. import pytest
  8. from pandas._config import using_string_dtype
  9. from pandas.compat import (
  10. HAS_PYARROW,
  11. IS64,
  12. PYPY,
  13. is_platform_arm,
  14. )
  15. from pandas import (
  16. CategoricalIndex,
  17. DataFrame,
  18. Index,
  19. MultiIndex,
  20. Series,
  21. date_range,
  22. option_context,
  23. )
  24. import pandas._testing as tm
  25. from pandas.util.version import Version
  26. @pytest.fixture
  27. def duplicate_columns_frame():
  28. """Dataframe with duplicate column names."""
  29. return DataFrame(
  30. np.random.default_rng(2).standard_normal((1500, 4)),
  31. columns=["a", "a", "b", "b"],
  32. )
  33. def test_info_empty():
  34. # GH #45494
  35. df = DataFrame()
  36. buf = StringIO()
  37. df.info(buf=buf)
  38. result = buf.getvalue()
  39. expected = textwrap.dedent(
  40. """\
  41. <class 'pandas.core.frame.DataFrame'>
  42. RangeIndex: 0 entries
  43. Empty DataFrame\n"""
  44. )
  45. assert result == expected
  46. def test_info_categorical_column_smoke_test():
  47. n = 2500
  48. df = DataFrame({"int64": np.random.default_rng(2).integers(100, size=n, dtype=int)})
  49. df["category"] = Series(
  50. np.array(list("abcdefghij")).take(
  51. np.random.default_rng(2).integers(0, 10, size=n, dtype=int)
  52. )
  53. ).astype("category")
  54. df.isna()
  55. buf = StringIO()
  56. df.info(buf=buf)
  57. df2 = df[df["category"] == "d"]
  58. buf = StringIO()
  59. df2.info(buf=buf)
  60. @pytest.mark.parametrize(
  61. "fixture_func_name",
  62. [
  63. "int_frame",
  64. "float_frame",
  65. "datetime_frame",
  66. "duplicate_columns_frame",
  67. "float_string_frame",
  68. ],
  69. )
  70. def test_info_smoke_test(fixture_func_name, request):
  71. frame = request.getfixturevalue(fixture_func_name)
  72. buf = StringIO()
  73. frame.info(buf=buf)
  74. result = buf.getvalue().splitlines()
  75. assert len(result) > 10
  76. buf = StringIO()
  77. frame.info(buf=buf, verbose=False)
  78. def test_info_smoke_test2(float_frame):
  79. # pretty useless test, used to be mixed into the repr tests
  80. buf = StringIO()
  81. float_frame.reindex(columns=["A"]).info(verbose=False, buf=buf)
  82. float_frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf)
  83. # no columns or index
  84. DataFrame().info(buf=buf)
  85. @pytest.mark.parametrize(
  86. "num_columns, max_info_columns, verbose",
  87. [
  88. (10, 100, True),
  89. (10, 11, True),
  90. (10, 10, True),
  91. (10, 9, False),
  92. (10, 1, False),
  93. ],
  94. )
  95. def test_info_default_verbose_selection(num_columns, max_info_columns, verbose):
  96. frame = DataFrame(np.random.default_rng(2).standard_normal((5, num_columns)))
  97. with option_context("display.max_info_columns", max_info_columns):
  98. io_default = StringIO()
  99. frame.info(buf=io_default)
  100. result = io_default.getvalue()
  101. io_explicit = StringIO()
  102. frame.info(buf=io_explicit, verbose=verbose)
  103. expected = io_explicit.getvalue()
  104. assert result == expected
  105. def test_info_verbose_check_header_separator_body():
  106. buf = StringIO()
  107. size = 1001
  108. start = 5
  109. frame = DataFrame(np.random.default_rng(2).standard_normal((3, size)))
  110. frame.info(verbose=True, buf=buf)
  111. res = buf.getvalue()
  112. header = " # Column Dtype \n--- ------ ----- "
  113. assert header in res
  114. frame.info(verbose=True, buf=buf)
  115. buf.seek(0)
  116. lines = buf.readlines()
  117. assert len(lines) > 0
  118. for i, line in enumerate(lines):
  119. if start <= i < start + size:
  120. line_nr = f" {i - start} "
  121. assert line.startswith(line_nr)
  122. @pytest.mark.parametrize(
  123. "size, header_exp, separator_exp, first_line_exp, last_line_exp",
  124. [
  125. (
  126. 4,
  127. " # Column Non-Null Count Dtype ",
  128. "--- ------ -------------- ----- ",
  129. " 0 0 3 non-null float64",
  130. " 3 3 3 non-null float64",
  131. ),
  132. (
  133. 11,
  134. " # Column Non-Null Count Dtype ",
  135. "--- ------ -------------- ----- ",
  136. " 0 0 3 non-null float64",
  137. " 10 10 3 non-null float64",
  138. ),
  139. (
  140. 101,
  141. " # Column Non-Null Count Dtype ",
  142. "--- ------ -------------- ----- ",
  143. " 0 0 3 non-null float64",
  144. " 100 100 3 non-null float64",
  145. ),
  146. (
  147. 1001,
  148. " # Column Non-Null Count Dtype ",
  149. "--- ------ -------------- ----- ",
  150. " 0 0 3 non-null float64",
  151. " 1000 1000 3 non-null float64",
  152. ),
  153. (
  154. 10001,
  155. " # Column Non-Null Count Dtype ",
  156. "--- ------ -------------- ----- ",
  157. " 0 0 3 non-null float64",
  158. " 10000 10000 3 non-null float64",
  159. ),
  160. ],
  161. )
  162. def test_info_verbose_with_counts_spacing(
  163. size, header_exp, separator_exp, first_line_exp, last_line_exp
  164. ):
  165. """Test header column, spacer, first line and last line in verbose mode."""
  166. frame = DataFrame(np.random.default_rng(2).standard_normal((3, size)))
  167. with StringIO() as buf:
  168. frame.info(verbose=True, show_counts=True, buf=buf)
  169. all_lines = buf.getvalue().splitlines()
  170. # Here table would contain only header, separator and table lines
  171. # dframe repr, index summary, memory usage and dtypes are excluded
  172. table = all_lines[3:-2]
  173. header, separator, first_line, *rest, last_line = table
  174. assert header == header_exp
  175. assert separator == separator_exp
  176. assert first_line == first_line_exp
  177. assert last_line == last_line_exp
  178. def test_info_memory():
  179. # https://github.com/pandas-dev/pandas/issues/21056
  180. df = DataFrame({"a": Series([1, 2], dtype="i8")})
  181. buf = StringIO()
  182. df.info(buf=buf)
  183. result = buf.getvalue()
  184. bytes = float(df.memory_usage().sum())
  185. expected = textwrap.dedent(
  186. f"""\
  187. <class 'pandas.core.frame.DataFrame'>
  188. RangeIndex: 2 entries, 0 to 1
  189. Data columns (total 1 columns):
  190. # Column Non-Null Count Dtype
  191. --- ------ -------------- -----
  192. 0 a 2 non-null int64
  193. dtypes: int64(1)
  194. memory usage: {bytes} bytes
  195. """
  196. )
  197. assert result == expected
  198. def test_info_wide():
  199. io = StringIO()
  200. df = DataFrame(np.random.default_rng(2).standard_normal((5, 101)))
  201. df.info(buf=io)
  202. io = StringIO()
  203. df.info(buf=io, max_cols=101)
  204. result = io.getvalue()
  205. assert len(result.splitlines()) > 100
  206. expected = result
  207. with option_context("display.max_info_columns", 101):
  208. io = StringIO()
  209. df.info(buf=io)
  210. result = io.getvalue()
  211. assert result == expected
  212. def test_info_duplicate_columns_shows_correct_dtypes():
  213. # GH11761
  214. io = StringIO()
  215. frame = DataFrame([[1, 2.0]], columns=["a", "a"])
  216. frame.info(buf=io)
  217. lines = io.getvalue().splitlines(True)
  218. assert " 0 a 1 non-null int64 \n" == lines[5]
  219. assert " 1 a 1 non-null float64\n" == lines[6]
  220. def test_info_shows_column_dtypes():
  221. dtypes = [
  222. "int64",
  223. "float64",
  224. "datetime64[ns]",
  225. "timedelta64[ns]",
  226. "complex128",
  227. "object",
  228. "bool",
  229. ]
  230. data = {}
  231. n = 10
  232. for i, dtype in enumerate(dtypes):
  233. data[i] = np.random.default_rng(2).integers(2, size=n).astype(dtype)
  234. df = DataFrame(data)
  235. buf = StringIO()
  236. df.info(buf=buf)
  237. res = buf.getvalue()
  238. header = (
  239. " # Column Non-Null Count Dtype \n"
  240. "--- ------ -------------- ----- "
  241. )
  242. assert header in res
  243. for i, dtype in enumerate(dtypes):
  244. name = f" {i:d} {i:d} {n:d} non-null {dtype}"
  245. assert name in res
  246. def test_info_max_cols():
  247. df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)))
  248. for len_, verbose in [(5, None), (5, False), (12, True)]:
  249. # For verbose always ^ setting ^ summarize ^ full output
  250. with option_context("max_info_columns", 4):
  251. buf = StringIO()
  252. df.info(buf=buf, verbose=verbose)
  253. res = buf.getvalue()
  254. assert len(res.strip().split("\n")) == len_
  255. for len_, verbose in [(12, None), (5, False), (12, True)]:
  256. # max_cols not exceeded
  257. with option_context("max_info_columns", 5):
  258. buf = StringIO()
  259. df.info(buf=buf, verbose=verbose)
  260. res = buf.getvalue()
  261. assert len(res.strip().split("\n")) == len_
  262. for len_, max_cols in [(12, 5), (5, 4)]:
  263. # setting truncates
  264. with option_context("max_info_columns", 4):
  265. buf = StringIO()
  266. df.info(buf=buf, max_cols=max_cols)
  267. res = buf.getvalue()
  268. assert len(res.strip().split("\n")) == len_
  269. # setting wouldn't truncate
  270. with option_context("max_info_columns", 5):
  271. buf = StringIO()
  272. df.info(buf=buf, max_cols=max_cols)
  273. res = buf.getvalue()
  274. assert len(res.strip().split("\n")) == len_
  275. def test_info_memory_usage():
  276. # Ensure memory usage is displayed, when asserted, on the last line
  277. dtypes = [
  278. "int64",
  279. "float64",
  280. "datetime64[ns]",
  281. "timedelta64[ns]",
  282. "complex128",
  283. "object",
  284. "bool",
  285. ]
  286. data = {}
  287. n = 10
  288. for i, dtype in enumerate(dtypes):
  289. data[i] = np.random.default_rng(2).integers(2, size=n).astype(dtype)
  290. df = DataFrame(data)
  291. buf = StringIO()
  292. # display memory usage case
  293. df.info(buf=buf, memory_usage=True)
  294. res = buf.getvalue().splitlines()
  295. assert "memory usage: " in res[-1]
  296. # do not display memory usage case
  297. df.info(buf=buf, memory_usage=False)
  298. res = buf.getvalue().splitlines()
  299. assert "memory usage: " not in res[-1]
  300. df.info(buf=buf, memory_usage=True)
  301. res = buf.getvalue().splitlines()
  302. # memory usage is a lower bound, so print it as XYZ+ MB
  303. assert re.match(r"memory usage: [^+]+\+", res[-1])
  304. df.iloc[:, :5].info(buf=buf, memory_usage=True)
  305. res = buf.getvalue().splitlines()
  306. # excluded column with object dtype, so estimate is accurate
  307. assert not re.match(r"memory usage: [^+]+\+", res[-1])
  308. # Test a DataFrame with duplicate columns
  309. dtypes = ["int64", "int64", "int64", "float64"]
  310. data = {}
  311. n = 100
  312. for i, dtype in enumerate(dtypes):
  313. data[i] = np.random.default_rng(2).integers(2, size=n).astype(dtype)
  314. df = DataFrame(data)
  315. df.columns = dtypes
  316. df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object))
  317. df_with_object_index.info(buf=buf, memory_usage=True)
  318. res = buf.getvalue().splitlines()
  319. assert re.match(r"memory usage: [^+]+\+", res[-1])
  320. df_with_object_index.info(buf=buf, memory_usage="deep")
  321. res = buf.getvalue().splitlines()
  322. assert re.match(r"memory usage: [^+]+$", res[-1])
  323. # Ensure df size is as expected
  324. # (cols * rows * bytes) + index size
  325. df_size = df.memory_usage().sum()
  326. exp_size = len(dtypes) * n * 8 + df.index.nbytes
  327. assert df_size == exp_size
  328. # Ensure number of cols in memory_usage is the same as df
  329. size_df = np.size(df.columns.values) + 1 # index=True; default
  330. assert size_df == np.size(df.memory_usage())
  331. # assert deep works only on object
  332. assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()
  333. # test for validity
  334. DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
  335. DataFrame(1, index=["a"], columns=["A"]).index.nbytes
  336. df = DataFrame(
  337. data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"]
  338. )
  339. df.index.nbytes
  340. df.memory_usage(index=True)
  341. df.index.values.nbytes
  342. mem = df.memory_usage(deep=True).sum()
  343. assert mem > 0
  344. @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
  345. def test_info_memory_usage_deep_not_pypy():
  346. df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object))
  347. assert (
  348. df_with_object_index.memory_usage(index=True, deep=True).sum()
  349. > df_with_object_index.memory_usage(index=True).sum()
  350. )
  351. df_object = DataFrame({"a": Series(["a"], dtype=object)})
  352. assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()
  353. @pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result")
  354. def test_info_memory_usage_deep_pypy():
  355. df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object))
  356. assert (
  357. df_with_object_index.memory_usage(index=True, deep=True).sum()
  358. == df_with_object_index.memory_usage(index=True).sum()
  359. )
  360. df_object = DataFrame({"a": Series(["a"], dtype=object)})
  361. assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum()
  362. @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
  363. def test_usage_via_getsizeof():
  364. df = DataFrame(
  365. data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"]
  366. )
  367. mem = df.memory_usage(deep=True).sum()
  368. # sys.getsizeof will call the .memory_usage with
  369. # deep=True, and add on some GC overhead
  370. diff = mem - sys.getsizeof(df)
  371. assert abs(diff) < 100
  372. def test_info_memory_usage_qualified(using_infer_string):
  373. buf = StringIO()
  374. df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
  375. df.info(buf=buf)
  376. assert "+" not in buf.getvalue()
  377. buf = StringIO()
  378. df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype=object))
  379. df.info(buf=buf)
  380. assert "+" in buf.getvalue()
  381. buf = StringIO()
  382. df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype="str"))
  383. df.info(buf=buf)
  384. if using_infer_string and HAS_PYARROW:
  385. assert "+" not in buf.getvalue()
  386. else:
  387. assert "+" in buf.getvalue()
  388. buf = StringIO()
  389. df = DataFrame(
  390. 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)])
  391. )
  392. df.info(buf=buf)
  393. assert "+" not in buf.getvalue()
  394. buf = StringIO()
  395. df = DataFrame(
  396. 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]])
  397. )
  398. df.info(buf=buf)
  399. if using_infer_string and HAS_PYARROW:
  400. assert "+" not in buf.getvalue()
  401. else:
  402. assert "+" in buf.getvalue()
  403. def test_info_memory_usage_bug_on_multiindex():
  404. # GH 14308
  405. # memory usage introspection should not materialize .values
  406. def memory_usage(f):
  407. return f.memory_usage(deep=True).sum()
  408. N = 100
  409. M = len(ascii_uppercase)
  410. index = MultiIndex.from_product(
  411. [list(ascii_uppercase), date_range("20160101", periods=N)],
  412. names=["id", "date"],
  413. )
  414. df = DataFrame(
  415. {"value": np.random.default_rng(2).standard_normal(N * M)}, index=index
  416. )
  417. unstacked = df.unstack("id")
  418. assert df.values.nbytes == unstacked.values.nbytes
  419. assert memory_usage(df) > memory_usage(unstacked)
  420. # high upper bound
  421. assert memory_usage(unstacked) - memory_usage(df) < 2000
  422. def test_info_categorical():
  423. # GH14298
  424. idx = CategoricalIndex(["a", "b"])
  425. df = DataFrame(np.zeros((2, 2)), index=idx, columns=idx)
  426. buf = StringIO()
  427. df.info(buf=buf)
  428. @pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system")
  429. def test_info_int_columns(using_infer_string):
  430. # GH#37245
  431. df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"])
  432. buf = StringIO()
  433. df.info(show_counts=True, buf=buf)
  434. result = buf.getvalue()
  435. expected = textwrap.dedent(
  436. f"""\
  437. <class 'pandas.core.frame.DataFrame'>
  438. Index: 2 entries, A to B
  439. Data columns (total 2 columns):
  440. # Column Non-Null Count Dtype
  441. --- ------ -------------- -----
  442. 0 1 2 non-null int64
  443. 1 2 2 non-null int64
  444. dtypes: int64(2)
  445. memory usage: {'50.0' if using_infer_string and HAS_PYARROW else '48.0+'} bytes
  446. """
  447. )
  448. assert result == expected
  449. @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
  450. def test_memory_usage_empty_no_warning(using_infer_string):
  451. # GH#50066
  452. df = DataFrame(index=["a", "b"])
  453. with tm.assert_produces_warning(None):
  454. result = df.memory_usage()
  455. if using_infer_string and HAS_PYARROW:
  456. value = 18
  457. else:
  458. value = 16 if IS64 else 8
  459. expected = Series(value, index=["Index"])
  460. tm.assert_series_equal(result, expected)
  461. @pytest.mark.single_cpu
  462. def test_info_compute_numba():
  463. # GH#51922
  464. numba = pytest.importorskip("numba")
  465. if Version(numba.__version__) == Version("0.61") and is_platform_arm():
  466. pytest.skip(f"Segfaults on ARM platforms with numba {numba.__version__}")
  467. df = DataFrame([[1, 2], [3, 4]])
  468. with option_context("compute.use_numba", True):
  469. buf = StringIO()
  470. df.info(buf=buf)
  471. result = buf.getvalue()
  472. buf = StringIO()
  473. df.info(buf=buf)
  474. expected = buf.getvalue()
  475. assert result == expected
  476. @pytest.mark.parametrize(
  477. "row, columns, show_counts, result",
  478. [
  479. [20, 20, None, True],
  480. [20, 20, True, True],
  481. [20, 20, False, False],
  482. [5, 5, None, False],
  483. [5, 5, True, False],
  484. [5, 5, False, False],
  485. ],
  486. )
  487. def test_info_show_counts(row, columns, show_counts, result):
  488. # Explicit cast to float to avoid implicit cast when setting nan
  489. df = DataFrame(1, columns=range(10), index=range(10)).astype({1: "float"})
  490. df.iloc[1, 1] = np.nan
  491. with option_context(
  492. "display.max_info_rows", row, "display.max_info_columns", columns
  493. ):
  494. with StringIO() as buf:
  495. df.info(buf=buf, show_counts=show_counts)
  496. assert ("non-null" in buf.getvalue()) is result