# info.py
  1. from __future__ import annotations
  2. from abc import (
  3. ABC,
  4. abstractmethod,
  5. )
  6. import sys
  7. from textwrap import dedent
  8. from typing import TYPE_CHECKING
  9. from pandas._config import get_option
  10. from pandas.io.formats import format as fmt
  11. from pandas.io.formats.printing import pprint_thing
  12. if TYPE_CHECKING:
  13. from collections.abc import (
  14. Iterable,
  15. Iterator,
  16. Mapping,
  17. Sequence,
  18. )
  19. from pandas._typing import (
  20. Dtype,
  21. WriteBuffer,
  22. )
  23. from pandas import (
  24. DataFrame,
  25. Index,
  26. Series,
  27. )
  28. frame_max_cols_sub = dedent(
  29. """\
  30. max_cols : int, optional
  31. When to switch from the verbose to the truncated output. If the
  32. DataFrame has more than `max_cols` columns, the truncated output
  33. is used. By default, the setting in
  34. ``pandas.options.display.max_info_columns`` is used."""
  35. )
  36. show_counts_sub = dedent(
  37. """\
  38. show_counts : bool, optional
  39. Whether to show the non-null counts. By default, this is shown
  40. only if the DataFrame is smaller than
  41. ``pandas.options.display.max_info_rows`` and
  42. ``pandas.options.display.max_info_columns``. A value of True always
  43. shows the counts, and False never shows the counts."""
  44. )
  45. frame_examples_sub = dedent(
  46. """\
  47. >>> int_values = [1, 2, 3, 4, 5]
  48. >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
  49. >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
  50. >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
  51. ... "float_col": float_values})
  52. >>> df
  53. int_col text_col float_col
  54. 0 1 alpha 0.00
  55. 1 2 beta 0.25
  56. 2 3 gamma 0.50
  57. 3 4 delta 0.75
  58. 4 5 epsilon 1.00
  59. Prints information of all columns:
  60. >>> df.info(verbose=True)
  61. <class 'pandas.core.frame.DataFrame'>
  62. RangeIndex: 5 entries, 0 to 4
  63. Data columns (total 3 columns):
  64. # Column Non-Null Count Dtype
  65. --- ------ -------------- -----
  66. 0 int_col 5 non-null int64
  67. 1 text_col 5 non-null object
  68. 2 float_col 5 non-null float64
  69. dtypes: float64(1), int64(1), object(1)
  70. memory usage: 248.0+ bytes
  71. Prints a summary of columns count and its dtypes but not per column
  72. information:
  73. >>> df.info(verbose=False)
  74. <class 'pandas.core.frame.DataFrame'>
  75. RangeIndex: 5 entries, 0 to 4
  76. Columns: 3 entries, int_col to float_col
  77. dtypes: float64(1), int64(1), object(1)
  78. memory usage: 248.0+ bytes
  79. Pipe output of DataFrame.info to buffer instead of sys.stdout, get
  80. buffer content and writes to a text file:
  81. >>> import io
  82. >>> buffer = io.StringIO()
  83. >>> df.info(buf=buffer)
  84. >>> s = buffer.getvalue()
  85. >>> with open("df_info.txt", "w",
  86. ... encoding="utf-8") as f: # doctest: +SKIP
  87. ... f.write(s)
  88. 260
  89. The `memory_usage` parameter allows deep introspection mode, specially
  90. useful for big DataFrames and fine-tune memory optimization:
  91. >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
  92. >>> df = pd.DataFrame({
  93. ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
  94. ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
  95. ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
  96. ... })
  97. >>> df.info()
  98. <class 'pandas.core.frame.DataFrame'>
  99. RangeIndex: 1000000 entries, 0 to 999999
  100. Data columns (total 3 columns):
  101. # Column Non-Null Count Dtype
  102. --- ------ -------------- -----
  103. 0 column_1 1000000 non-null object
  104. 1 column_2 1000000 non-null object
  105. 2 column_3 1000000 non-null object
  106. dtypes: object(3)
  107. memory usage: 22.9+ MB
  108. >>> df.info(memory_usage='deep')
  109. <class 'pandas.core.frame.DataFrame'>
  110. RangeIndex: 1000000 entries, 0 to 999999
  111. Data columns (total 3 columns):
  112. # Column Non-Null Count Dtype
  113. --- ------ -------------- -----
  114. 0 column_1 1000000 non-null object
  115. 1 column_2 1000000 non-null object
  116. 2 column_3 1000000 non-null object
  117. dtypes: object(3)
  118. memory usage: 165.9 MB"""
  119. )
  120. frame_see_also_sub = dedent(
  121. """\
  122. DataFrame.describe: Generate descriptive statistics of DataFrame
  123. columns.
  124. DataFrame.memory_usage: Memory usage of DataFrame columns."""
  125. )
  126. frame_sub_kwargs = {
  127. "klass": "DataFrame",
  128. "type_sub": " and columns",
  129. "max_cols_sub": frame_max_cols_sub,
  130. "show_counts_sub": show_counts_sub,
  131. "examples_sub": frame_examples_sub,
  132. "see_also_sub": frame_see_also_sub,
  133. "version_added_sub": "",
  134. }
  135. series_examples_sub = dedent(
  136. """\
  137. >>> int_values = [1, 2, 3, 4, 5]
  138. >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
  139. >>> s = pd.Series(text_values, index=int_values)
  140. >>> s.info()
  141. <class 'pandas.core.series.Series'>
  142. Index: 5 entries, 1 to 5
  143. Series name: None
  144. Non-Null Count Dtype
  145. -------------- -----
  146. 5 non-null object
  147. dtypes: object(1)
  148. memory usage: 80.0+ bytes
  149. Prints a summary excluding information about its values:
  150. >>> s.info(verbose=False)
  151. <class 'pandas.core.series.Series'>
  152. Index: 5 entries, 1 to 5
  153. dtypes: object(1)
  154. memory usage: 80.0+ bytes
  155. Pipe output of Series.info to buffer instead of sys.stdout, get
  156. buffer content and writes to a text file:
  157. >>> import io
  158. >>> buffer = io.StringIO()
  159. >>> s.info(buf=buffer)
  160. >>> s = buffer.getvalue()
  161. >>> with open("df_info.txt", "w",
  162. ... encoding="utf-8") as f: # doctest: +SKIP
  163. ... f.write(s)
  164. 260
  165. The `memory_usage` parameter allows deep introspection mode, specially
  166. useful for big Series and fine-tune memory optimization:
  167. >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
  168. >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6))
  169. >>> s.info()
  170. <class 'pandas.core.series.Series'>
  171. RangeIndex: 1000000 entries, 0 to 999999
  172. Series name: None
  173. Non-Null Count Dtype
  174. -------------- -----
  175. 1000000 non-null object
  176. dtypes: object(1)
  177. memory usage: 7.6+ MB
  178. >>> s.info(memory_usage='deep')
  179. <class 'pandas.core.series.Series'>
  180. RangeIndex: 1000000 entries, 0 to 999999
  181. Series name: None
  182. Non-Null Count Dtype
  183. -------------- -----
  184. 1000000 non-null object
  185. dtypes: object(1)
  186. memory usage: 55.3 MB"""
  187. )
  188. series_see_also_sub = dedent(
  189. """\
  190. Series.describe: Generate descriptive statistics of Series.
  191. Series.memory_usage: Memory usage of Series."""
  192. )
  193. series_sub_kwargs = {
  194. "klass": "Series",
  195. "type_sub": "",
  196. "max_cols_sub": "",
  197. "show_counts_sub": show_counts_sub,
  198. "examples_sub": series_examples_sub,
  199. "see_also_sub": series_see_also_sub,
  200. "version_added_sub": "\n.. versionadded:: 1.4.0\n",
  201. }
  202. INFO_DOCSTRING = dedent(
  203. """
  204. Print a concise summary of a {klass}.
  205. This method prints information about a {klass} including
  206. the index dtype{type_sub}, non-null values and memory usage.
  207. {version_added_sub}\
  208. Parameters
  209. ----------
  210. verbose : bool, optional
  211. Whether to print the full summary. By default, the setting in
  212. ``pandas.options.display.max_info_columns`` is followed.
  213. buf : writable buffer, defaults to sys.stdout
  214. Where to send the output. By default, the output is printed to
  215. sys.stdout. Pass a writable buffer if you need to further process
  216. the output.
  217. {max_cols_sub}
  218. memory_usage : bool, str, optional
  219. Specifies whether total memory usage of the {klass}
  220. elements (including the index) should be displayed. By default,
  221. this follows the ``pandas.options.display.memory_usage`` setting.
  222. True always show memory usage. False never shows memory usage.
  223. A value of 'deep' is equivalent to "True with deep introspection".
  224. Memory usage is shown in human-readable units (base-2
  225. representation). Without deep introspection a memory estimation is
  226. made based in column dtype and number of rows assuming values
  227. consume the same memory amount for corresponding dtypes. With deep
  228. memory introspection, a real memory usage calculation is performed
  229. at the cost of computational resources. See the
  230. :ref:`Frequently Asked Questions <df-memory-usage>` for more
  231. details.
  232. {show_counts_sub}
  233. Returns
  234. -------
  235. None
  236. This method prints a summary of a {klass} and returns None.
  237. See Also
  238. --------
  239. {see_also_sub}
  240. Examples
  241. --------
  242. {examples_sub}
  243. """
  244. )
  245. def _put_str(s: str | Dtype, space: int) -> str:
  246. """
  247. Make string of specified length, padding to the right if necessary.
  248. Parameters
  249. ----------
  250. s : Union[str, Dtype]
  251. String to be formatted.
  252. space : int
  253. Length to force string to be of.
  254. Returns
  255. -------
  256. str
  257. String coerced to given length.
  258. Examples
  259. --------
  260. >>> pd.io.formats.info._put_str("panda", 6)
  261. 'panda '
  262. >>> pd.io.formats.info._put_str("panda", 4)
  263. 'pand'
  264. """
  265. return str(s)[:space].ljust(space)
  266. def _sizeof_fmt(num: float, size_qualifier: str) -> str:
  267. """
  268. Return size in human readable format.
  269. Parameters
  270. ----------
  271. num : int
  272. Size in bytes.
  273. size_qualifier : str
  274. Either empty, or '+' (if lower bound).
  275. Returns
  276. -------
  277. str
  278. Size in human readable format.
  279. Examples
  280. --------
  281. >>> _sizeof_fmt(23028, '')
  282. '22.5 KB'
  283. >>> _sizeof_fmt(23028, '+')
  284. '22.5+ KB'
  285. """
  286. for x in ["bytes", "KB", "MB", "GB", "TB"]:
  287. if num < 1024.0:
  288. return f"{num:3.1f}{size_qualifier} {x}"
  289. num /= 1024.0
  290. return f"{num:3.1f}{size_qualifier} PB"
  291. def _initialize_memory_usage(
  292. memory_usage: bool | str | None = None,
  293. ) -> bool | str:
  294. """Get memory usage based on inputs and display options."""
  295. if memory_usage is None:
  296. memory_usage = get_option("display.memory_usage")
  297. return memory_usage
  298. class _BaseInfo(ABC):
  299. """
  300. Base class for DataFrameInfo and SeriesInfo.
  301. Parameters
  302. ----------
  303. data : DataFrame or Series
  304. Either dataframe or series.
  305. memory_usage : bool or str, optional
  306. If "deep", introspect the data deeply by interrogating object dtypes
  307. for system-level memory consumption, and include it in the returned
  308. values.
  309. """
  310. data: DataFrame | Series
  311. memory_usage: bool | str
  312. @property
  313. @abstractmethod
  314. def dtypes(self) -> Iterable[Dtype]:
  315. """
  316. Dtypes.
  317. Returns
  318. -------
  319. dtypes : sequence
  320. Dtype of each of the DataFrame's columns (or one series column).
  321. """
  322. @property
  323. @abstractmethod
  324. def dtype_counts(self) -> Mapping[str, int]:
  325. """Mapping dtype - number of counts."""
  326. @property
  327. @abstractmethod
  328. def non_null_counts(self) -> Sequence[int]:
  329. """Sequence of non-null counts for all columns or column (if series)."""
  330. @property
  331. @abstractmethod
  332. def memory_usage_bytes(self) -> int:
  333. """
  334. Memory usage in bytes.
  335. Returns
  336. -------
  337. memory_usage_bytes : int
  338. Object's total memory usage in bytes.
  339. """
  340. @property
  341. def memory_usage_string(self) -> str:
  342. """Memory usage in a form of human readable string."""
  343. return f"{_sizeof_fmt(self.memory_usage_bytes, self.size_qualifier)}\n"
  344. @property
  345. def size_qualifier(self) -> str:
  346. size_qualifier = ""
  347. if self.memory_usage:
  348. if self.memory_usage != "deep":
  349. # size_qualifier is just a best effort; not guaranteed to catch
  350. # all cases (e.g., it misses categorical data even with object
  351. # categories)
  352. if (
  353. "object" in self.dtype_counts
  354. or self.data.index._is_memory_usage_qualified()
  355. ):
  356. size_qualifier = "+"
  357. return size_qualifier
  358. @abstractmethod
  359. def render(
  360. self,
  361. *,
  362. buf: WriteBuffer[str] | None,
  363. max_cols: int | None,
  364. verbose: bool | None,
  365. show_counts: bool | None,
  366. ) -> None:
  367. pass
  368. class DataFrameInfo(_BaseInfo):
  369. """
  370. Class storing dataframe-specific info.
  371. """
  372. def __init__(
  373. self,
  374. data: DataFrame,
  375. memory_usage: bool | str | None = None,
  376. ) -> None:
  377. self.data: DataFrame = data
  378. self.memory_usage = _initialize_memory_usage(memory_usage)
  379. @property
  380. def dtype_counts(self) -> Mapping[str, int]:
  381. return _get_dataframe_dtype_counts(self.data)
  382. @property
  383. def dtypes(self) -> Iterable[Dtype]:
  384. """
  385. Dtypes.
  386. Returns
  387. -------
  388. dtypes
  389. Dtype of each of the DataFrame's columns.
  390. """
  391. return self.data.dtypes
  392. @property
  393. def ids(self) -> Index:
  394. """
  395. Column names.
  396. Returns
  397. -------
  398. ids : Index
  399. DataFrame's column names.
  400. """
  401. return self.data.columns
  402. @property
  403. def col_count(self) -> int:
  404. """Number of columns to be summarized."""
  405. return len(self.ids)
  406. @property
  407. def non_null_counts(self) -> Sequence[int]:
  408. """Sequence of non-null counts for all columns or column (if series)."""
  409. return self.data.count()
  410. @property
  411. def memory_usage_bytes(self) -> int:
  412. deep = self.memory_usage == "deep"
  413. return self.data.memory_usage(index=True, deep=deep).sum()
  414. def render(
  415. self,
  416. *,
  417. buf: WriteBuffer[str] | None,
  418. max_cols: int | None,
  419. verbose: bool | None,
  420. show_counts: bool | None,
  421. ) -> None:
  422. printer = _DataFrameInfoPrinter(
  423. info=self,
  424. max_cols=max_cols,
  425. verbose=verbose,
  426. show_counts=show_counts,
  427. )
  428. printer.to_buffer(buf)
  429. class SeriesInfo(_BaseInfo):
  430. """
  431. Class storing series-specific info.
  432. """
  433. def __init__(
  434. self,
  435. data: Series,
  436. memory_usage: bool | str | None = None,
  437. ) -> None:
  438. self.data: Series = data
  439. self.memory_usage = _initialize_memory_usage(memory_usage)
  440. def render(
  441. self,
  442. *,
  443. buf: WriteBuffer[str] | None = None,
  444. max_cols: int | None = None,
  445. verbose: bool | None = None,
  446. show_counts: bool | None = None,
  447. ) -> None:
  448. if max_cols is not None:
  449. raise ValueError(
  450. "Argument `max_cols` can only be passed "
  451. "in DataFrame.info, not Series.info"
  452. )
  453. printer = _SeriesInfoPrinter(
  454. info=self,
  455. verbose=verbose,
  456. show_counts=show_counts,
  457. )
  458. printer.to_buffer(buf)
  459. @property
  460. def non_null_counts(self) -> Sequence[int]:
  461. return [self.data.count()]
  462. @property
  463. def dtypes(self) -> Iterable[Dtype]:
  464. return [self.data.dtypes]
  465. @property
  466. def dtype_counts(self) -> Mapping[str, int]:
  467. from pandas.core.frame import DataFrame
  468. return _get_dataframe_dtype_counts(DataFrame(self.data))
  469. @property
  470. def memory_usage_bytes(self) -> int:
  471. """Memory usage in bytes.
  472. Returns
  473. -------
  474. memory_usage_bytes : int
  475. Object's total memory usage in bytes.
  476. """
  477. deep = self.memory_usage == "deep"
  478. return self.data.memory_usage(index=True, deep=deep)
  479. class _InfoPrinterAbstract:
  480. """
  481. Class for printing dataframe or series info.
  482. """
  483. def to_buffer(self, buf: WriteBuffer[str] | None = None) -> None:
  484. """Save dataframe info into buffer."""
  485. table_builder = self._create_table_builder()
  486. lines = table_builder.get_lines()
  487. if buf is None: # pragma: no cover
  488. buf = sys.stdout
  489. fmt.buffer_put_lines(buf, lines)
  490. @abstractmethod
  491. def _create_table_builder(self) -> _TableBuilderAbstract:
  492. """Create instance of table builder."""
  493. class _DataFrameInfoPrinter(_InfoPrinterAbstract):
  494. """
  495. Class for printing dataframe info.
  496. Parameters
  497. ----------
  498. info : DataFrameInfo
  499. Instance of DataFrameInfo.
  500. max_cols : int, optional
  501. When to switch from the verbose to the truncated output.
  502. verbose : bool, optional
  503. Whether to print the full summary.
  504. show_counts : bool, optional
  505. Whether to show the non-null counts.
  506. """
  507. def __init__(
  508. self,
  509. info: DataFrameInfo,
  510. max_cols: int | None = None,
  511. verbose: bool | None = None,
  512. show_counts: bool | None = None,
  513. ) -> None:
  514. self.info = info
  515. self.data = info.data
  516. self.verbose = verbose
  517. self.max_cols = self._initialize_max_cols(max_cols)
  518. self.show_counts = self._initialize_show_counts(show_counts)
  519. @property
  520. def max_rows(self) -> int:
  521. """Maximum info rows to be displayed."""
  522. return get_option("display.max_info_rows", len(self.data) + 1)
  523. @property
  524. def exceeds_info_cols(self) -> bool:
  525. """Check if number of columns to be summarized does not exceed maximum."""
  526. return bool(self.col_count > self.max_cols)
  527. @property
  528. def exceeds_info_rows(self) -> bool:
  529. """Check if number of rows to be summarized does not exceed maximum."""
  530. return bool(len(self.data) > self.max_rows)
  531. @property
  532. def col_count(self) -> int:
  533. """Number of columns to be summarized."""
  534. return self.info.col_count
  535. def _initialize_max_cols(self, max_cols: int | None) -> int:
  536. if max_cols is None:
  537. return get_option("display.max_info_columns", self.col_count + 1)
  538. return max_cols
  539. def _initialize_show_counts(self, show_counts: bool | None) -> bool:
  540. if show_counts is None:
  541. return bool(not self.exceeds_info_cols and not self.exceeds_info_rows)
  542. else:
  543. return show_counts
  544. def _create_table_builder(self) -> _DataFrameTableBuilder:
  545. """
  546. Create instance of table builder based on verbosity and display settings.
  547. """
  548. if self.verbose:
  549. return _DataFrameTableBuilderVerbose(
  550. info=self.info,
  551. with_counts=self.show_counts,
  552. )
  553. elif self.verbose is False: # specifically set to False, not necessarily None
  554. return _DataFrameTableBuilderNonVerbose(info=self.info)
  555. elif self.exceeds_info_cols:
  556. return _DataFrameTableBuilderNonVerbose(info=self.info)
  557. else:
  558. return _DataFrameTableBuilderVerbose(
  559. info=self.info,
  560. with_counts=self.show_counts,
  561. )
  562. class _SeriesInfoPrinter(_InfoPrinterAbstract):
  563. """Class for printing series info.
  564. Parameters
  565. ----------
  566. info : SeriesInfo
  567. Instance of SeriesInfo.
  568. verbose : bool, optional
  569. Whether to print the full summary.
  570. show_counts : bool, optional
  571. Whether to show the non-null counts.
  572. """
  573. def __init__(
  574. self,
  575. info: SeriesInfo,
  576. verbose: bool | None = None,
  577. show_counts: bool | None = None,
  578. ) -> None:
  579. self.info = info
  580. self.data = info.data
  581. self.verbose = verbose
  582. self.show_counts = self._initialize_show_counts(show_counts)
  583. def _create_table_builder(self) -> _SeriesTableBuilder:
  584. """
  585. Create instance of table builder based on verbosity.
  586. """
  587. if self.verbose or self.verbose is None:
  588. return _SeriesTableBuilderVerbose(
  589. info=self.info,
  590. with_counts=self.show_counts,
  591. )
  592. else:
  593. return _SeriesTableBuilderNonVerbose(info=self.info)
  594. def _initialize_show_counts(self, show_counts: bool | None) -> bool:
  595. if show_counts is None:
  596. return True
  597. else:
  598. return show_counts
  599. class _TableBuilderAbstract(ABC):
  600. """
  601. Abstract builder for info table.
  602. """
  603. _lines: list[str]
  604. info: _BaseInfo
  605. @abstractmethod
  606. def get_lines(self) -> list[str]:
  607. """Product in a form of list of lines (strings)."""
  608. @property
  609. def data(self) -> DataFrame | Series:
  610. return self.info.data
  611. @property
  612. def dtypes(self) -> Iterable[Dtype]:
  613. """Dtypes of each of the DataFrame's columns."""
  614. return self.info.dtypes
  615. @property
  616. def dtype_counts(self) -> Mapping[str, int]:
  617. """Mapping dtype - number of counts."""
  618. return self.info.dtype_counts
  619. @property
  620. def display_memory_usage(self) -> bool:
  621. """Whether to display memory usage."""
  622. return bool(self.info.memory_usage)
  623. @property
  624. def memory_usage_string(self) -> str:
  625. """Memory usage string with proper size qualifier."""
  626. return self.info.memory_usage_string
  627. @property
  628. def non_null_counts(self) -> Sequence[int]:
  629. return self.info.non_null_counts
  630. def add_object_type_line(self) -> None:
  631. """Add line with string representation of dataframe to the table."""
  632. self._lines.append(str(type(self.data)))
  633. def add_index_range_line(self) -> None:
  634. """Add line with range of indices to the table."""
  635. self._lines.append(self.data.index._summary())
  636. def add_dtypes_line(self) -> None:
  637. """Add summary line with dtypes present in dataframe."""
  638. collected_dtypes = [
  639. f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items())
  640. ]
  641. self._lines.append(f"dtypes: {', '.join(collected_dtypes)}")
  642. class _DataFrameTableBuilder(_TableBuilderAbstract):
  643. """
  644. Abstract builder for dataframe info table.
  645. Parameters
  646. ----------
  647. info : DataFrameInfo.
  648. Instance of DataFrameInfo.
  649. """
  650. def __init__(self, *, info: DataFrameInfo) -> None:
  651. self.info: DataFrameInfo = info
  652. def get_lines(self) -> list[str]:
  653. self._lines = []
  654. if self.col_count == 0:
  655. self._fill_empty_info()
  656. else:
  657. self._fill_non_empty_info()
  658. return self._lines
  659. def _fill_empty_info(self) -> None:
  660. """Add lines to the info table, pertaining to empty dataframe."""
  661. self.add_object_type_line()
  662. self.add_index_range_line()
  663. self._lines.append(f"Empty {type(self.data).__name__}\n")
  664. @abstractmethod
  665. def _fill_non_empty_info(self) -> None:
  666. """Add lines to the info table, pertaining to non-empty dataframe."""
  667. @property
  668. def data(self) -> DataFrame:
  669. """DataFrame."""
  670. return self.info.data
  671. @property
  672. def ids(self) -> Index:
  673. """Dataframe columns."""
  674. return self.info.ids
  675. @property
  676. def col_count(self) -> int:
  677. """Number of dataframe columns to be summarized."""
  678. return self.info.col_count
  679. def add_memory_usage_line(self) -> None:
  680. """Add line containing memory usage."""
  681. self._lines.append(f"memory usage: {self.memory_usage_string}")
  682. class _DataFrameTableBuilderNonVerbose(_DataFrameTableBuilder):
  683. """
  684. Dataframe info table builder for non-verbose output.
  685. """
  686. def _fill_non_empty_info(self) -> None:
  687. """Add lines to the info table, pertaining to non-empty dataframe."""
  688. self.add_object_type_line()
  689. self.add_index_range_line()
  690. self.add_columns_summary_line()
  691. self.add_dtypes_line()
  692. if self.display_memory_usage:
  693. self.add_memory_usage_line()
  694. def add_columns_summary_line(self) -> None:
  695. self._lines.append(self.ids._summary(name="Columns"))
  696. class _TableBuilderVerboseMixin(_TableBuilderAbstract):
  697. """
  698. Mixin for verbose info output.
  699. """
  700. SPACING: str = " " * 2
  701. strrows: Sequence[Sequence[str]]
  702. gross_column_widths: Sequence[int]
  703. with_counts: bool
  704. @property
  705. @abstractmethod
  706. def headers(self) -> Sequence[str]:
  707. """Headers names of the columns in verbose table."""
  708. @property
  709. def header_column_widths(self) -> Sequence[int]:
  710. """Widths of header columns (only titles)."""
  711. return [len(col) for col in self.headers]
  712. def _get_gross_column_widths(self) -> Sequence[int]:
  713. """Get widths of columns containing both headers and actual content."""
  714. body_column_widths = self._get_body_column_widths()
  715. return [
  716. max(*widths)
  717. for widths in zip(self.header_column_widths, body_column_widths)
  718. ]
  719. def _get_body_column_widths(self) -> Sequence[int]:
  720. """Get widths of table content columns."""
  721. strcols: Sequence[Sequence[str]] = list(zip(*self.strrows))
  722. return [max(len(x) for x in col) for col in strcols]
  723. def _gen_rows(self) -> Iterator[Sequence[str]]:
  724. """
  725. Generator function yielding rows content.
  726. Each element represents a row comprising a sequence of strings.
  727. """
  728. if self.with_counts:
  729. return self._gen_rows_with_counts()
  730. else:
  731. return self._gen_rows_without_counts()
  732. @abstractmethod
  733. def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
  734. """Iterator with string representation of body data with counts."""
  735. @abstractmethod
  736. def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
  737. """Iterator with string representation of body data without counts."""
  738. def add_header_line(self) -> None:
  739. header_line = self.SPACING.join(
  740. [
  741. _put_str(header, col_width)
  742. for header, col_width in zip(self.headers, self.gross_column_widths)
  743. ]
  744. )
  745. self._lines.append(header_line)
  746. def add_separator_line(self) -> None:
  747. separator_line = self.SPACING.join(
  748. [
  749. _put_str("-" * header_colwidth, gross_colwidth)
  750. for header_colwidth, gross_colwidth in zip(
  751. self.header_column_widths, self.gross_column_widths
  752. )
  753. ]
  754. )
  755. self._lines.append(separator_line)
  756. def add_body_lines(self) -> None:
  757. for row in self.strrows:
  758. body_line = self.SPACING.join(
  759. [
  760. _put_str(col, gross_colwidth)
  761. for col, gross_colwidth in zip(row, self.gross_column_widths)
  762. ]
  763. )
  764. self._lines.append(body_line)
  765. def _gen_non_null_counts(self) -> Iterator[str]:
  766. """Iterator with string representation of non-null counts."""
  767. for count in self.non_null_counts:
  768. yield f"{count} non-null"
  769. def _gen_dtypes(self) -> Iterator[str]:
  770. """Iterator with string representation of column dtypes."""
  771. for dtype in self.dtypes:
  772. yield pprint_thing(dtype)
  class _DataFrameTableBuilderVerbose(_DataFrameTableBuilder, _TableBuilderVerboseMixin):
      """
      Dataframe info table builder for verbose output.

      Parameters
      ----------
      info : DataFrameInfo
          Stored on the instance as ``self.info``.
      with_counts : bool
          When true, the table gets an extra "Non-Null Count" column.
      """

      def __init__(
          self,
          *,
          info: DataFrameInfo,
          with_counts: bool,
      ) -> None:
          self.info = info
          self.with_counts = with_counts
          # The body rows are materialised before the widths are requested;
          # presumably the width computation reads ``strrows`` -- keep this order.
          rows = list(self._gen_rows())
          self.strrows: Sequence[Sequence[str]] = rows
          self.gross_column_widths: Sequence[int] = self._get_gross_column_widths()

      def _fill_non_empty_info(self) -> None:
          """Append every section of the verbose table for a non-empty dataframe."""
          sections = (
              self.add_object_type_line,
              self.add_index_range_line,
              self.add_columns_summary_line,
              self.add_header_line,
              self.add_separator_line,
              self.add_body_lines,
              self.add_dtypes_line,
          )
          for add_section in sections:
              add_section()
          # The memory usage line is optional and always comes last.
          if self.display_memory_usage:
              self.add_memory_usage_line()

      @property
      def headers(self) -> Sequence[str]:
          """Column titles of the verbose table, with or without the counts column."""
          if not self.with_counts:
              return [" # ", "Column", "Dtype"]
          return [" # ", "Column", "Non-Null Count", "Dtype"]

      def add_columns_summary_line(self) -> None:
          """Append the line stating the total number of columns."""
          total = self.col_count
          self._lines.append(f"Data columns (total {total} columns):")

      def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
          """Yield ``(line number, column name, dtype)`` rows."""
          numbers = self._gen_line_numbers()
          names = self._gen_columns()
          dtypes = self._gen_dtypes()
          yield from zip(numbers, names, dtypes)

      def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
          """Yield ``(line number, column name, non-null count, dtype)`` rows."""
          numbers = self._gen_line_numbers()
          names = self._gen_columns()
          counts = self._gen_non_null_counts()
          dtypes = self._gen_dtypes()
          yield from zip(numbers, names, counts, dtypes)

      def _gen_line_numbers(self) -> Iterator[str]:
          """Yield the zero-based position of each entry of ``self.ids``, space-prefixed."""
          yield from (f" {position}" for position, _ in enumerate(self.ids))

      def _gen_columns(self) -> Iterator[str]:
          """Yield the ``pprint_thing`` rendering of each entry of ``self.ids``."""
          yield from map(pprint_thing, self.ids)
  class _SeriesTableBuilder(_TableBuilderAbstract):
      """
      Abstract builder for series info table.

      Parameters
      ----------
      info : SeriesInfo
          Instance of SeriesInfo.
      """

      def __init__(self, *, info: SeriesInfo) -> None:
          self.info: SeriesInfo = info

      def get_lines(self) -> list[str]:
          """Reset the line buffer, let the subclass fill it, and return it."""
          self._lines = []
          self._fill_non_empty_info()
          return self._lines

      @property
      def data(self) -> Series:
          """Series."""
          info = self.info
          return info.data

      def add_memory_usage_line(self) -> None:
          """Add line containing memory usage."""
          usage = self.memory_usage_string
          self._lines.append(f"memory usage: {usage}")

      @abstractmethod
      def _fill_non_empty_info(self) -> None:
          """Add lines to the info table, pertaining to non-empty series."""
  class _SeriesTableBuilderNonVerbose(_SeriesTableBuilder):
      """
      Series info table builder for non-verbose output.
      """

      def _fill_non_empty_info(self) -> None:
          """Append the object type, index range and dtypes lines, then memory usage if enabled."""
          for add_section in (
              self.add_object_type_line,
              self.add_index_range_line,
              self.add_dtypes_line,
          ):
              add_section()
          # The memory usage line is optional and always comes last.
          if self.display_memory_usage:
              self.add_memory_usage_line()
  class _SeriesTableBuilderVerbose(_SeriesTableBuilder, _TableBuilderVerboseMixin):
      """
      Series info table builder for verbose output.

      Parameters
      ----------
      info : SeriesInfo
          Stored on the instance as ``self.info``.
      with_counts : bool
          When true, the table gets a "Non-Null Count" column before "Dtype".
      """

      def __init__(
          self,
          *,
          info: SeriesInfo,
          with_counts: bool,
      ) -> None:
          self.info = info
          self.with_counts = with_counts
          # Rows are materialised before the widths are requested; presumably
          # the width computation reads ``strrows`` -- keep this order.
          self.strrows: Sequence[Sequence[str]] = list(self._gen_rows())
          self.gross_column_widths: Sequence[int] = self._get_gross_column_widths()

      def _fill_non_empty_info(self) -> None:
          """Add lines to the info table, pertaining to non-empty series."""
          self.add_object_type_line()
          self.add_index_range_line()
          self.add_series_name_line()
          self.add_header_line()
          self.add_separator_line()
          self.add_body_lines()
          self.add_dtypes_line()
          # The memory usage line is optional and always comes last.
          if self.display_memory_usage:
              self.add_memory_usage_line()

      def add_series_name_line(self) -> None:
          """Add line containing the name of the series."""
          self._lines.append(f"Series name: {self.data.name}")

      @property
      def headers(self) -> Sequence[str]:
          """Headers names of the columns in verbose table."""
          if self.with_counts:
              return ["Non-Null Count", "Dtype"]
          return ["Dtype"]

      def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
          """
          Iterator with string representation of body data without counts.

          Each row is a 1-tuple ``(dtype,)``.
          """
          # A row must be a sequence of cells. Yielding the bare dtype string
          # would make the row itself a ``str``; code that zips a row against
          # the column widths would then iterate it character by character
          # and keep only the first character of the dtype.
          yield from zip(self._gen_dtypes())

      def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
          """
          Iterator with string representation of body data with counts.

          Each row is a 2-tuple ``(non-null count, dtype)``.
          """
          yield from zip(
              self._gen_non_null_counts(),
              self._gen_dtypes(),
          )
  906. def _get_dataframe_dtype_counts(df: DataFrame) -> Mapping[str, int]:
  907. """
  908. Create mapping between datatypes and their number of occurrences.
  909. """
  910. # groupby dtype.name to collect e.g. Categorical columns
  911. return df.dtypes.value_counts().groupby(lambda x: x.name).sum()