format.py 65 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058
  1. """
  2. Internal module for formatting output data in csv, html, xml,
  3. and latex files. This module also applies to display formatting.
  4. """
  5. from __future__ import annotations
  6. from collections.abc import (
  7. Generator,
  8. Hashable,
  9. Mapping,
  10. Sequence,
  11. )
  12. from contextlib import contextmanager
  13. from csv import QUOTE_NONE
  14. from decimal import Decimal
  15. from functools import partial
  16. from io import StringIO
  17. import math
  18. import re
  19. from shutil import get_terminal_size
  20. from typing import (
  21. TYPE_CHECKING,
  22. Any,
  23. Callable,
  24. Final,
  25. cast,
  26. )
  27. import numpy as np
  28. from pandas._config.config import (
  29. get_option,
  30. set_option,
  31. )
  32. from pandas._libs import lib
  33. from pandas._libs.missing import NA
  34. from pandas._libs.tslibs import (
  35. NaT,
  36. Timedelta,
  37. Timestamp,
  38. )
  39. from pandas._libs.tslibs.nattype import NaTType
  40. from pandas.core.dtypes.common import (
  41. is_complex_dtype,
  42. is_float,
  43. is_integer,
  44. is_list_like,
  45. is_numeric_dtype,
  46. is_scalar,
  47. )
  48. from pandas.core.dtypes.dtypes import (
  49. CategoricalDtype,
  50. DatetimeTZDtype,
  51. ExtensionDtype,
  52. )
  53. from pandas.core.dtypes.missing import (
  54. isna,
  55. notna,
  56. )
  57. from pandas.core.arrays import (
  58. Categorical,
  59. DatetimeArray,
  60. ExtensionArray,
  61. TimedeltaArray,
  62. )
  63. from pandas.core.arrays.string_ import StringDtype
  64. from pandas.core.base import PandasObject
  65. import pandas.core.common as com
  66. from pandas.core.indexes.api import (
  67. Index,
  68. MultiIndex,
  69. PeriodIndex,
  70. ensure_index,
  71. )
  72. from pandas.core.indexes.datetimes import DatetimeIndex
  73. from pandas.core.indexes.timedeltas import TimedeltaIndex
  74. from pandas.core.reshape.concat import concat
  75. from pandas.io.common import (
  76. check_parent_directory,
  77. stringify_path,
  78. )
  79. from pandas.io.formats import printing
  80. if TYPE_CHECKING:
  81. from pandas._typing import (
  82. ArrayLike,
  83. Axes,
  84. ColspaceArgType,
  85. ColspaceType,
  86. CompressionOptions,
  87. FilePath,
  88. FloatFormatType,
  89. FormattersType,
  90. IndexLabel,
  91. SequenceNotStr,
  92. StorageOptions,
  93. WriteBuffer,
  94. )
  95. from pandas import (
  96. DataFrame,
  97. Series,
  98. )
# Shared "Parameters" docstring fragment. It contains %-style named
# placeholders (``col_space_type``, ``col_space``, ``header_type``,
# ``header``) that the consumer is expected to fill in.
common_docstring: Final = """
Parameters
----------
buf : str, Path or StringIO-like, optional, default None
Buffer to write to. If None, the output is returned as a string.
columns : array-like, optional, default None
The subset of columns to write. Writes all columns by default.
col_space : %(col_space_type)s, optional
%(col_space)s.
header : %(header_type)s, optional
%(header)s.
index : bool, optional, default True
Whether to print index (row) labels.
na_rep : str, optional, default 'NaN'
String representation of ``NaN`` to use.
formatters : list, tuple or dict of one-param. functions, optional
Formatter functions to apply to columns' elements by position or
name.
The result of each function must be a unicode string.
List/tuple must be of length equal to the number of columns.
float_format : one-parameter function, optional, default None
Formatter function to apply to columns' elements if they are
floats. This function must return a unicode string and will be
applied only to the non-``NaN`` elements, with ``NaN`` being
handled by ``na_rep``.
sparsify : bool, optional, default True
Set to False for a DataFrame with a hierarchical index to print
every multiindex key at each row.
index_names : bool, optional, default True
Prints the names of the indexes.
justify : str, default None
How to justify the column labels. If None uses the option from
the print configuration (controlled by set_option), 'right' out
of the box. Valid values are
* left
* right
* center
* justify
* justify-all
* start
* end
* inherit
* match-parent
* initial
* unset.
max_rows : int, optional
Maximum number of rows to display in the console.
max_cols : int, optional
Maximum number of columns to display in the console.
show_dimensions : bool, default False
Display DataFrame dimensions (number of rows by number of columns).
decimal : str, default '.'
Character recognized as decimal separator, e.g. ',' in Europe.
"""

# The ``justify`` values listed in ``common_docstring`` above, as a tuple.
# Keep the two lists in sync when editing either one.
VALID_JUSTIFY_PARAMETERS = (
    "left",
    "right",
    "center",
    "justify",
    "justify-all",
    "start",
    "end",
    "inherit",
    "match-parent",
    "initial",
    "unset",
)

# Shared "Returns" docstring fragment; DataFrameFormatter appends it right
# after ``common_docstring`` when building its class docstring.
return_docstring: Final = """
Returns
-------
str or None
If buf is None, returns the result as a string. Otherwise returns
None.
"""
  173. class SeriesFormatter:
  174. """
  175. Implement the main logic of Series.to_string, which underlies
  176. Series.__repr__.
  177. """
  178. def __init__(
  179. self,
  180. series: Series,
  181. *,
  182. length: bool | str = True,
  183. header: bool = True,
  184. index: bool = True,
  185. na_rep: str = "NaN",
  186. name: bool = False,
  187. float_format: str | None = None,
  188. dtype: bool = True,
  189. max_rows: int | None = None,
  190. min_rows: int | None = None,
  191. ) -> None:
  192. self.series = series
  193. self.buf = StringIO()
  194. self.name = name
  195. self.na_rep = na_rep
  196. self.header = header
  197. self.length = length
  198. self.index = index
  199. self.max_rows = max_rows
  200. self.min_rows = min_rows
  201. if float_format is None:
  202. float_format = get_option("display.float_format")
  203. self.float_format = float_format
  204. self.dtype = dtype
  205. self.adj = printing.get_adjustment()
  206. self._chk_truncate()
  207. def _chk_truncate(self) -> None:
  208. self.tr_row_num: int | None
  209. min_rows = self.min_rows
  210. max_rows = self.max_rows
  211. # truncation determined by max_rows, actual truncated number of rows
  212. # used below by min_rows
  213. is_truncated_vertically = max_rows and (len(self.series) > max_rows)
  214. series = self.series
  215. if is_truncated_vertically:
  216. max_rows = cast(int, max_rows)
  217. if min_rows:
  218. # if min_rows is set (not None or 0), set max_rows to minimum
  219. # of both
  220. max_rows = min(min_rows, max_rows)
  221. if max_rows == 1:
  222. row_num = max_rows
  223. series = series.iloc[:max_rows]
  224. else:
  225. row_num = max_rows // 2
  226. series = concat((series.iloc[:row_num], series.iloc[-row_num:]))
  227. self.tr_row_num = row_num
  228. else:
  229. self.tr_row_num = None
  230. self.tr_series = series
  231. self.is_truncated_vertically = is_truncated_vertically
  232. def _get_footer(self) -> str:
  233. name = self.series.name
  234. footer = ""
  235. index = self.series.index
  236. if (
  237. isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex))
  238. and index.freq is not None
  239. ):
  240. footer += f"Freq: {index.freqstr}"
  241. if self.name is not False and name is not None:
  242. if footer:
  243. footer += ", "
  244. series_name = printing.pprint_thing(name, escape_chars=("\t", "\r", "\n"))
  245. footer += f"Name: {series_name}"
  246. if self.length is True or (
  247. self.length == "truncate" and self.is_truncated_vertically
  248. ):
  249. if footer:
  250. footer += ", "
  251. footer += f"Length: {len(self.series)}"
  252. if self.dtype is not False and self.dtype is not None:
  253. dtype_name = getattr(self.tr_series.dtype, "name", None)
  254. if dtype_name:
  255. if footer:
  256. footer += ", "
  257. footer += f"dtype: {printing.pprint_thing(dtype_name)}"
  258. # level infos are added to the end and in a new line, like it is done
  259. # for Categoricals
  260. if isinstance(self.tr_series.dtype, CategoricalDtype):
  261. level_info = self.tr_series._values._get_repr_footer()
  262. if footer:
  263. footer += "\n"
  264. footer += level_info
  265. return str(footer)
  266. def _get_formatted_values(self) -> list[str]:
  267. return format_array(
  268. self.tr_series._values,
  269. None,
  270. float_format=self.float_format,
  271. na_rep=self.na_rep,
  272. leading_space=self.index,
  273. )
  274. def to_string(self) -> str:
  275. series = self.tr_series
  276. footer = self._get_footer()
  277. if len(series) == 0:
  278. return f"{type(self.series).__name__}([], {footer})"
  279. index = series.index
  280. have_header = _has_names(index)
  281. if isinstance(index, MultiIndex):
  282. fmt_index = index._format_multi(include_names=True, sparsify=None)
  283. adj = printing.get_adjustment()
  284. fmt_index = adj.adjoin(2, *fmt_index).split("\n")
  285. else:
  286. fmt_index = index._format_flat(include_name=True)
  287. fmt_values = self._get_formatted_values()
  288. if self.is_truncated_vertically:
  289. n_header_rows = 0
  290. row_num = self.tr_row_num
  291. row_num = cast(int, row_num)
  292. width = self.adj.len(fmt_values[row_num - 1])
  293. if width > 3:
  294. dot_str = "..."
  295. else:
  296. dot_str = ".."
  297. # Series uses mode=center because it has single value columns
  298. # DataFrame uses mode=left
  299. dot_str = self.adj.justify([dot_str], width, mode="center")[0]
  300. fmt_values.insert(row_num + n_header_rows, dot_str)
  301. fmt_index.insert(row_num + 1, "")
  302. if self.index:
  303. result = self.adj.adjoin(3, *[fmt_index[1:], fmt_values])
  304. else:
  305. result = self.adj.adjoin(3, fmt_values)
  306. if self.header and have_header:
  307. result = fmt_index[0] + "\n" + result
  308. if footer:
  309. result += "\n" + footer
  310. return str("".join(result))
  311. def get_dataframe_repr_params() -> dict[str, Any]:
  312. """Get the parameters used to repr(dataFrame) calls using DataFrame.to_string.
  313. Supplying these parameters to DataFrame.to_string is equivalent to calling
  314. ``repr(DataFrame)``. This is useful if you want to adjust the repr output.
  315. .. versionadded:: 1.4.0
  316. Example
  317. -------
  318. >>> import pandas as pd
  319. >>>
  320. >>> df = pd.DataFrame([[1, 2], [3, 4]])
  321. >>> repr_params = pd.io.formats.format.get_dataframe_repr_params()
  322. >>> repr(df) == df.to_string(**repr_params)
  323. True
  324. """
  325. from pandas.io.formats import console
  326. if get_option("display.expand_frame_repr"):
  327. line_width, _ = console.get_console_size()
  328. else:
  329. line_width = None
  330. return {
  331. "max_rows": get_option("display.max_rows"),
  332. "min_rows": get_option("display.min_rows"),
  333. "max_cols": get_option("display.max_columns"),
  334. "max_colwidth": get_option("display.max_colwidth"),
  335. "show_dimensions": get_option("display.show_dimensions"),
  336. "line_width": line_width,
  337. }
  338. def get_series_repr_params() -> dict[str, Any]:
  339. """Get the parameters used to repr(Series) calls using Series.to_string.
  340. Supplying these parameters to Series.to_string is equivalent to calling
  341. ``repr(series)``. This is useful if you want to adjust the series repr output.
  342. .. versionadded:: 1.4.0
  343. Example
  344. -------
  345. >>> import pandas as pd
  346. >>>
  347. >>> ser = pd.Series([1, 2, 3, 4])
  348. >>> repr_params = pd.io.formats.format.get_series_repr_params()
  349. >>> repr(ser) == ser.to_string(**repr_params)
  350. True
  351. """
  352. width, height = get_terminal_size()
  353. max_rows_opt = get_option("display.max_rows")
  354. max_rows = height if max_rows_opt == 0 else max_rows_opt
  355. min_rows = height if max_rows_opt == 0 else get_option("display.min_rows")
  356. return {
  357. "name": True,
  358. "dtype": True,
  359. "min_rows": min_rows,
  360. "max_rows": max_rows,
  361. "length": get_option("display.show_dimensions"),
  362. }
  363. class DataFrameFormatter:
  364. """
  365. Class for processing dataframe formatting options and data.
  366. Used by DataFrame.to_string, which backs DataFrame.__repr__.
  367. """
  368. __doc__ = __doc__ if __doc__ else ""
  369. __doc__ += common_docstring + return_docstring
    def __init__(
        self,
        frame: DataFrame,
        columns: Axes | None = None,
        col_space: ColspaceArgType | None = None,
        header: bool | SequenceNotStr[str] = True,
        index: bool = True,
        na_rep: str = "NaN",
        formatters: FormattersType | None = None,
        justify: str | None = None,
        float_format: FloatFormatType | None = None,
        sparsify: bool | None = None,
        index_names: bool = True,
        max_rows: int | None = None,
        min_rows: int | None = None,
        max_cols: int | None = None,
        show_dimensions: bool | str = False,
        decimal: str = ".",
        bold_rows: bool = False,
        escape: bool = True,
    ) -> None:
        # Store the formatting options, resolve option-backed defaults and
        # pre-compute the truncated frame. The statement order matters; see
        # the notes below.
        self.frame = frame
        # May rebind ``self.frame`` to the requested column subset, so it
        # has to run before anything that reads ``self.frame.columns``.
        self.columns = self._initialize_columns(columns)
        self.col_space = self._initialize_colspace(col_space)
        self.header = header
        self.index = index
        self.na_rep = na_rep
        self.formatters = self._initialize_formatters(formatters)
        # None falls back to the ``display.colheader_justify`` option.
        self.justify = self._initialize_justify(justify)
        self.float_format = float_format
        # None falls back to the ``display.multi_sparse`` option.
        self.sparsify = self._initialize_sparsify(sparsify)
        self.show_index_names = index_names
        self.decimal = decimal
        self.bold_rows = bold_rows
        self.escape = escape
        self.max_rows = max_rows
        self.min_rows = min_rows
        self.max_cols = max_cols
        self.show_dimensions = show_dimensions

        # The fitted limits read max_rows/min_rows/max_cols, header and
        # show_dimensions, so those attributes must already be set.
        self.max_cols_fitted = self._calc_max_cols_fitted()
        self.max_rows_fitted = self._calc_max_rows_fitted()

        # Start from the full frame; truncate() slices ``tr_frame`` (and may
        # trim a list of formatters) based on the fitted limits above.
        self.tr_frame = self.frame
        self.truncate()
        self.adj = printing.get_adjustment()
  414. def get_strcols(self) -> list[list[str]]:
  415. """
  416. Render a DataFrame to a list of columns (as lists of strings).
  417. """
  418. strcols = self._get_strcols_without_index()
  419. if self.index:
  420. str_index = self._get_formatted_index(self.tr_frame)
  421. strcols.insert(0, str_index)
  422. return strcols
  423. @property
  424. def should_show_dimensions(self) -> bool:
  425. return self.show_dimensions is True or (
  426. self.show_dimensions == "truncate" and self.is_truncated
  427. )
  428. @property
  429. def is_truncated(self) -> bool:
  430. return bool(self.is_truncated_horizontally or self.is_truncated_vertically)
  431. @property
  432. def is_truncated_horizontally(self) -> bool:
  433. return bool(self.max_cols_fitted and (len(self.columns) > self.max_cols_fitted))
  434. @property
  435. def is_truncated_vertically(self) -> bool:
  436. return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted))
  437. @property
  438. def dimensions_info(self) -> str:
  439. return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]"
  440. @property
  441. def has_index_names(self) -> bool:
  442. return _has_names(self.frame.index)
  443. @property
  444. def has_column_names(self) -> bool:
  445. return _has_names(self.frame.columns)
  446. @property
  447. def show_row_idx_names(self) -> bool:
  448. return all((self.has_index_names, self.index, self.show_index_names))
  449. @property
  450. def show_col_idx_names(self) -> bool:
  451. return all((self.has_column_names, self.show_index_names, self.header))
  452. @property
  453. def max_rows_displayed(self) -> int:
  454. return min(self.max_rows or len(self.frame), len(self.frame))
  455. def _initialize_sparsify(self, sparsify: bool | None) -> bool:
  456. if sparsify is None:
  457. return get_option("display.multi_sparse")
  458. return sparsify
  459. def _initialize_formatters(
  460. self, formatters: FormattersType | None
  461. ) -> FormattersType:
  462. if formatters is None:
  463. return {}
  464. elif len(self.frame.columns) == len(formatters) or isinstance(formatters, dict):
  465. return formatters
  466. else:
  467. raise ValueError(
  468. f"Formatters length({len(formatters)}) should match "
  469. f"DataFrame number of columns({len(self.frame.columns)})"
  470. )
  471. def _initialize_justify(self, justify: str | None) -> str:
  472. if justify is None:
  473. return get_option("display.colheader_justify")
  474. else:
  475. return justify
  476. def _initialize_columns(self, columns: Axes | None) -> Index:
  477. if columns is not None:
  478. cols = ensure_index(columns)
  479. self.frame = self.frame[cols]
  480. return cols
  481. else:
  482. return self.frame.columns
  483. def _initialize_colspace(self, col_space: ColspaceArgType | None) -> ColspaceType:
  484. result: ColspaceType
  485. if col_space is None:
  486. result = {}
  487. elif isinstance(col_space, (int, str)):
  488. result = {"": col_space}
  489. result.update({column: col_space for column in self.frame.columns})
  490. elif isinstance(col_space, Mapping):
  491. for column in col_space.keys():
  492. if column not in self.frame.columns and column != "":
  493. raise ValueError(
  494. f"Col_space is defined for an unknown column: {column}"
  495. )
  496. result = col_space
  497. else:
  498. if len(self.frame.columns) != len(col_space):
  499. raise ValueError(
  500. f"Col_space length({len(col_space)}) should match "
  501. f"DataFrame number of columns({len(self.frame.columns)})"
  502. )
  503. result = dict(zip(self.frame.columns, col_space))
  504. return result
  505. def _calc_max_cols_fitted(self) -> int | None:
  506. """Number of columns fitting the screen."""
  507. if not self._is_in_terminal():
  508. return self.max_cols
  509. width, _ = get_terminal_size()
  510. if self._is_screen_narrow(width):
  511. return width
  512. else:
  513. return self.max_cols
  514. def _calc_max_rows_fitted(self) -> int | None:
  515. """Number of rows with data fitting the screen."""
  516. max_rows: int | None
  517. if self._is_in_terminal():
  518. _, height = get_terminal_size()
  519. if self.max_rows == 0:
  520. # rows available to fill with actual data
  521. return height - self._get_number_of_auxiliary_rows()
  522. if self._is_screen_short(height):
  523. max_rows = height
  524. else:
  525. max_rows = self.max_rows
  526. else:
  527. max_rows = self.max_rows
  528. return self._adjust_max_rows(max_rows)
  529. def _adjust_max_rows(self, max_rows: int | None) -> int | None:
  530. """Adjust max_rows using display logic.
  531. See description here:
  532. https://pandas.pydata.org/docs/dev/user_guide/options.html#frequently-used-options
  533. GH #37359
  534. """
  535. if max_rows:
  536. if (len(self.frame) > max_rows) and self.min_rows:
  537. # if truncated, set max_rows showed to min_rows
  538. max_rows = min(self.min_rows, max_rows)
  539. return max_rows
  540. def _is_in_terminal(self) -> bool:
  541. """Check if the output is to be shown in terminal."""
  542. return bool(self.max_cols == 0 or self.max_rows == 0)
  543. def _is_screen_narrow(self, max_width) -> bool:
  544. return bool(self.max_cols == 0 and len(self.frame.columns) > max_width)
  545. def _is_screen_short(self, max_height) -> bool:
  546. return bool(self.max_rows == 0 and len(self.frame) > max_height)
  547. def _get_number_of_auxiliary_rows(self) -> int:
  548. """Get number of rows occupied by prompt, dots and dimension info."""
  549. dot_row = 1
  550. prompt_row = 1
  551. num_rows = dot_row + prompt_row
  552. if self.show_dimensions:
  553. num_rows += len(self.dimensions_info.splitlines())
  554. if self.header:
  555. num_rows += 1
  556. return num_rows
  557. def truncate(self) -> None:
  558. """
  559. Check whether the frame should be truncated. If so, slice the frame up.
  560. """
  561. if self.is_truncated_horizontally:
  562. self._truncate_horizontally()
  563. if self.is_truncated_vertically:
  564. self._truncate_vertically()
  565. def _truncate_horizontally(self) -> None:
  566. """Remove columns, which are not to be displayed and adjust formatters.
  567. Attributes affected:
  568. - tr_frame
  569. - formatters
  570. - tr_col_num
  571. """
  572. assert self.max_cols_fitted is not None
  573. col_num = self.max_cols_fitted // 2
  574. if col_num >= 1:
  575. left = self.tr_frame.iloc[:, :col_num]
  576. right = self.tr_frame.iloc[:, -col_num:]
  577. self.tr_frame = concat((left, right), axis=1)
  578. # truncate formatter
  579. if isinstance(self.formatters, (list, tuple)):
  580. self.formatters = [
  581. *self.formatters[:col_num],
  582. *self.formatters[-col_num:],
  583. ]
  584. else:
  585. col_num = cast(int, self.max_cols)
  586. self.tr_frame = self.tr_frame.iloc[:, :col_num]
  587. self.tr_col_num = col_num
  588. def _truncate_vertically(self) -> None:
  589. """Remove rows, which are not to be displayed.
  590. Attributes affected:
  591. - tr_frame
  592. - tr_row_num
  593. """
  594. assert self.max_rows_fitted is not None
  595. row_num = self.max_rows_fitted // 2
  596. if row_num >= 1:
  597. _len = len(self.tr_frame)
  598. _slice = np.hstack([np.arange(row_num), np.arange(_len - row_num, _len)])
  599. self.tr_frame = self.tr_frame.iloc[_slice]
  600. else:
  601. row_num = cast(int, self.max_rows)
  602. self.tr_frame = self.tr_frame.iloc[:row_num, :]
  603. self.tr_row_num = row_num
  604. def _get_strcols_without_index(self) -> list[list[str]]:
  605. strcols: list[list[str]] = []
  606. if not is_list_like(self.header) and not self.header:
  607. for i, c in enumerate(self.tr_frame):
  608. fmt_values = self.format_col(i)
  609. fmt_values = _make_fixed_width(
  610. strings=fmt_values,
  611. justify=self.justify,
  612. minimum=int(self.col_space.get(c, 0)),
  613. adj=self.adj,
  614. )
  615. strcols.append(fmt_values)
  616. return strcols
  617. if is_list_like(self.header):
  618. # cast here since can't be bool if is_list_like
  619. self.header = cast(list[str], self.header)
  620. if len(self.header) != len(self.columns):
  621. raise ValueError(
  622. f"Writing {len(self.columns)} cols "
  623. f"but got {len(self.header)} aliases"
  624. )
  625. str_columns = [[label] for label in self.header]
  626. else:
  627. str_columns = self._get_formatted_column_labels(self.tr_frame)
  628. if self.show_row_idx_names:
  629. for x in str_columns:
  630. x.append("")
  631. for i, c in enumerate(self.tr_frame):
  632. cheader = str_columns[i]
  633. header_colwidth = max(
  634. int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader)
  635. )
  636. fmt_values = self.format_col(i)
  637. fmt_values = _make_fixed_width(
  638. fmt_values, self.justify, minimum=header_colwidth, adj=self.adj
  639. )
  640. max_len = max(*(self.adj.len(x) for x in fmt_values), header_colwidth)
  641. cheader = self.adj.justify(cheader, max_len, mode=self.justify)
  642. strcols.append(cheader + fmt_values)
  643. return strcols
  644. def format_col(self, i: int) -> list[str]:
  645. frame = self.tr_frame
  646. formatter = self._get_formatter(i)
  647. return format_array(
  648. frame.iloc[:, i]._values,
  649. formatter,
  650. float_format=self.float_format,
  651. na_rep=self.na_rep,
  652. space=self.col_space.get(frame.columns[i]),
  653. decimal=self.decimal,
  654. leading_space=self.index,
  655. )
  656. def _get_formatter(self, i: str | int) -> Callable | None:
  657. if isinstance(self.formatters, (list, tuple)):
  658. if is_integer(i):
  659. i = cast(int, i)
  660. return self.formatters[i]
  661. else:
  662. return None
  663. else:
  664. if is_integer(i) and i not in self.columns:
  665. i = self.columns[i]
  666. return self.formatters.get(i, None)
  667. def _get_formatted_column_labels(self, frame: DataFrame) -> list[list[str]]:
  668. from pandas.core.indexes.multi import sparsify_labels
  669. columns = frame.columns
  670. if isinstance(columns, MultiIndex):
  671. fmt_columns = columns._format_multi(sparsify=False, include_names=False)
  672. fmt_columns = list(zip(*fmt_columns))
  673. dtypes = self.frame.dtypes._values
  674. # if we have a Float level, they don't use leading space at all
  675. restrict_formatting = any(level.is_floating for level in columns.levels)
  676. need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
  677. def space_format(x, y):
  678. if (
  679. y not in self.formatters
  680. and need_leadsp[x]
  681. and not restrict_formatting
  682. ):
  683. return " " + y
  684. return y
  685. str_columns_tuple = list(
  686. zip(*([space_format(x, y) for y in x] for x in fmt_columns))
  687. )
  688. if self.sparsify and len(str_columns_tuple):
  689. str_columns_tuple = sparsify_labels(str_columns_tuple)
  690. str_columns = [list(x) for x in zip(*str_columns_tuple)]
  691. else:
  692. fmt_columns = columns._format_flat(include_name=False)
  693. dtypes = self.frame.dtypes
  694. need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
  695. str_columns = [
  696. [" " + x if not self._get_formatter(i) and need_leadsp[x] else x]
  697. for i, x in enumerate(fmt_columns)
  698. ]
  699. # self.str_columns = str_columns
  700. return str_columns
  701. def _get_formatted_index(self, frame: DataFrame) -> list[str]:
  702. # Note: this is only used by to_string() and to_latex(), not by
  703. # to_html(). so safe to cast col_space here.
  704. col_space = {k: cast(int, v) for k, v in self.col_space.items()}
  705. index = frame.index
  706. columns = frame.columns
  707. fmt = self._get_formatter("__index__")
  708. if isinstance(index, MultiIndex):
  709. fmt_index = index._format_multi(
  710. sparsify=self.sparsify,
  711. include_names=self.show_row_idx_names,
  712. formatter=fmt,
  713. )
  714. else:
  715. fmt_index = [
  716. index._format_flat(include_name=self.show_row_idx_names, formatter=fmt)
  717. ]
  718. fmt_index = [
  719. tuple(
  720. _make_fixed_width(
  721. list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj
  722. )
  723. )
  724. for x in fmt_index
  725. ]
  726. adjoined = self.adj.adjoin(1, *fmt_index).split("\n")
  727. # empty space for columns
  728. if self.show_col_idx_names:
  729. col_header = [str(x) for x in self._get_column_name_list()]
  730. else:
  731. col_header = [""] * columns.nlevels
  732. if self.header:
  733. return col_header + adjoined
  734. else:
  735. return adjoined
  736. def _get_column_name_list(self) -> list[Hashable]:
  737. names: list[Hashable] = []
  738. columns = self.frame.columns
  739. if isinstance(columns, MultiIndex):
  740. names.extend("" if name is None else name for name in columns.names)
  741. else:
  742. names.append("" if columns.name is None else columns.name)
  743. return names
  744. class DataFrameRenderer:
  745. """Class for creating dataframe output in multiple formats.
  746. Called in pandas.core.generic.NDFrame:
  747. - to_csv
  748. - to_latex
  749. Called in pandas.core.frame.DataFrame:
  750. - to_html
  751. - to_string
  752. Parameters
  753. ----------
  754. fmt : DataFrameFormatter
  755. Formatter with the formatting options.
  756. """
  757. def __init__(self, fmt: DataFrameFormatter) -> None:
  758. self.fmt = fmt
  759. def to_html(
  760. self,
  761. buf: FilePath | WriteBuffer[str] | None = None,
  762. encoding: str | None = None,
  763. classes: str | list | tuple | None = None,
  764. notebook: bool = False,
  765. border: int | bool | None = None,
  766. table_id: str | None = None,
  767. render_links: bool = False,
  768. ) -> str | None:
  769. """
  770. Render a DataFrame to a html table.
  771. Parameters
  772. ----------
  773. buf : str, path object, file-like object, or None, default None
  774. String, path object (implementing ``os.PathLike[str]``), or file-like
  775. object implementing a string ``write()`` function. If None, the result is
  776. returned as a string.
  777. encoding : str, default “utf-8”
  778. Set character encoding.
  779. classes : str or list-like
  780. classes to include in the `class` attribute of the opening
  781. ``<table>`` tag, in addition to the default "dataframe".
  782. notebook : {True, False}, optional, default False
  783. Whether the generated HTML is for IPython Notebook.
  784. border : int
  785. A ``border=border`` attribute is included in the opening
  786. ``<table>`` tag. Default ``pd.options.display.html.border``.
  787. table_id : str, optional
  788. A css id is included in the opening `<table>` tag if specified.
  789. render_links : bool, default False
  790. Convert URLs to HTML links.
  791. """
  792. from pandas.io.formats.html import (
  793. HTMLFormatter,
  794. NotebookFormatter,
  795. )
  796. Klass = NotebookFormatter if notebook else HTMLFormatter
  797. html_formatter = Klass(
  798. self.fmt,
  799. classes=classes,
  800. border=border,
  801. table_id=table_id,
  802. render_links=render_links,
  803. )
  804. string = html_formatter.to_string()
  805. return save_to_buffer(string, buf=buf, encoding=encoding)
  806. def to_string(
  807. self,
  808. buf: FilePath | WriteBuffer[str] | None = None,
  809. encoding: str | None = None,
  810. line_width: int | None = None,
  811. ) -> str | None:
  812. """
  813. Render a DataFrame to a console-friendly tabular output.
  814. Parameters
  815. ----------
  816. buf : str, path object, file-like object, or None, default None
  817. String, path object (implementing ``os.PathLike[str]``), or file-like
  818. object implementing a string ``write()`` function. If None, the result is
  819. returned as a string.
  820. encoding: str, default “utf-8”
  821. Set character encoding.
  822. line_width : int, optional
  823. Width to wrap a line in characters.
  824. """
  825. from pandas.io.formats.string import StringFormatter
  826. string_formatter = StringFormatter(self.fmt, line_width=line_width)
  827. string = string_formatter.to_string()
  828. return save_to_buffer(string, buf=buf, encoding=encoding)
  829. def to_csv(
  830. self,
  831. path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
  832. encoding: str | None = None,
  833. sep: str = ",",
  834. columns: Sequence[Hashable] | None = None,
  835. index_label: IndexLabel | None = None,
  836. mode: str = "w",
  837. compression: CompressionOptions = "infer",
  838. quoting: int | None = None,
  839. quotechar: str = '"',
  840. lineterminator: str | None = None,
  841. chunksize: int | None = None,
  842. date_format: str | None = None,
  843. doublequote: bool = True,
  844. escapechar: str | None = None,
  845. errors: str = "strict",
  846. storage_options: StorageOptions | None = None,
  847. ) -> str | None:
  848. """
  849. Render dataframe as comma-separated file.
  850. """
  851. from pandas.io.formats.csvs import CSVFormatter
  852. if path_or_buf is None:
  853. created_buffer = True
  854. path_or_buf = StringIO()
  855. else:
  856. created_buffer = False
  857. csv_formatter = CSVFormatter(
  858. path_or_buf=path_or_buf,
  859. lineterminator=lineterminator,
  860. sep=sep,
  861. encoding=encoding,
  862. errors=errors,
  863. compression=compression,
  864. quoting=quoting,
  865. cols=columns,
  866. index_label=index_label,
  867. mode=mode,
  868. chunksize=chunksize,
  869. quotechar=quotechar,
  870. date_format=date_format,
  871. doublequote=doublequote,
  872. escapechar=escapechar,
  873. storage_options=storage_options,
  874. formatter=self.fmt,
  875. )
  876. csv_formatter.save()
  877. if created_buffer:
  878. assert isinstance(path_or_buf, StringIO)
  879. content = path_or_buf.getvalue()
  880. path_or_buf.close()
  881. return content
  882. return None
  883. def save_to_buffer(
  884. string: str,
  885. buf: FilePath | WriteBuffer[str] | None = None,
  886. encoding: str | None = None,
  887. ) -> str | None:
  888. """
  889. Perform serialization. Write to buf or return as string if buf is None.
  890. """
  891. with _get_buffer(buf, encoding=encoding) as fd:
  892. fd.write(string)
  893. if buf is None:
  894. # error: "WriteBuffer[str]" has no attribute "getvalue"
  895. return fd.getvalue() # type: ignore[attr-defined]
  896. return None
  897. @contextmanager
  898. def _get_buffer(
  899. buf: FilePath | WriteBuffer[str] | None, encoding: str | None = None
  900. ) -> Generator[WriteBuffer[str], None, None] | Generator[StringIO, None, None]:
  901. """
  902. Context manager to open, yield and close buffer for filenames or Path-like
  903. objects, otherwise yield buf unchanged.
  904. """
  905. if buf is not None:
  906. buf = stringify_path(buf)
  907. else:
  908. buf = StringIO()
  909. if encoding is None:
  910. encoding = "utf-8"
  911. elif not isinstance(buf, str):
  912. raise ValueError("buf is not a file name and encoding is specified.")
  913. if hasattr(buf, "write"):
  914. # Incompatible types in "yield" (actual type "Union[str, WriteBuffer[str],
  915. # StringIO]", expected type "Union[WriteBuffer[str], StringIO]")
  916. yield buf # type: ignore[misc]
  917. elif isinstance(buf, str):
  918. check_parent_directory(str(buf))
  919. with open(buf, "w", encoding=encoding, newline="") as f:
  920. # GH#30034 open instead of codecs.open prevents a file leak
  921. # if we have an invalid encoding argument.
  922. # newline="" is needed to roundtrip correctly on
  923. # windows test_to_latex_filename
  924. yield f
  925. else:
  926. raise TypeError("buf is not a file name and it has no write method")
  927. # ----------------------------------------------------------------------
  928. # Array formatters
  929. def format_array(
  930. values: ArrayLike,
  931. formatter: Callable | None,
  932. float_format: FloatFormatType | None = None,
  933. na_rep: str = "NaN",
  934. digits: int | None = None,
  935. space: str | int | None = None,
  936. justify: str = "right",
  937. decimal: str = ".",
  938. leading_space: bool | None = True,
  939. quoting: int | None = None,
  940. fallback_formatter: Callable | None = None,
  941. ) -> list[str]:
  942. """
  943. Format an array for printing.
  944. Parameters
  945. ----------
  946. values : np.ndarray or ExtensionArray
  947. formatter
  948. float_format
  949. na_rep
  950. digits
  951. space
  952. justify
  953. decimal
  954. leading_space : bool, optional, default True
  955. Whether the array should be formatted with a leading space.
  956. When an array as a column of a Series or DataFrame, we do want
  957. the leading space to pad between columns.
  958. When formatting an Index subclass
  959. (e.g. IntervalIndex._get_values_for_csv), we don't want the
  960. leading space since it should be left-aligned.
  961. fallback_formatter
  962. Returns
  963. -------
  964. List[str]
  965. """
  966. fmt_klass: type[_GenericArrayFormatter]
  967. if lib.is_np_dtype(values.dtype, "M"):
  968. fmt_klass = _Datetime64Formatter
  969. values = cast(DatetimeArray, values)
  970. elif isinstance(values.dtype, DatetimeTZDtype):
  971. fmt_klass = _Datetime64TZFormatter
  972. values = cast(DatetimeArray, values)
  973. elif lib.is_np_dtype(values.dtype, "m"):
  974. fmt_klass = _Timedelta64Formatter
  975. values = cast(TimedeltaArray, values)
  976. elif isinstance(values.dtype, ExtensionDtype):
  977. fmt_klass = _ExtensionArrayFormatter
  978. elif lib.is_np_dtype(values.dtype, "fc"):
  979. fmt_klass = FloatArrayFormatter
  980. elif lib.is_np_dtype(values.dtype, "iu"):
  981. fmt_klass = _IntArrayFormatter
  982. else:
  983. fmt_klass = _GenericArrayFormatter
  984. if space is None:
  985. space = 12
  986. if float_format is None:
  987. float_format = get_option("display.float_format")
  988. if digits is None:
  989. digits = get_option("display.precision")
  990. fmt_obj = fmt_klass(
  991. values,
  992. digits=digits,
  993. na_rep=na_rep,
  994. float_format=float_format,
  995. formatter=formatter,
  996. space=space,
  997. justify=justify,
  998. decimal=decimal,
  999. leading_space=leading_space,
  1000. quoting=quoting,
  1001. fallback_formatter=fallback_formatter,
  1002. )
  1003. return fmt_obj.get_result()
class _GenericArrayFormatter:
    """
    Base class turning the elements of an array into display strings.

    Subclasses override ``_format_strings``; ``get_result`` pads the
    produced strings to a common width.
    """

    def __init__(
        self,
        values: ArrayLike,
        digits: int = 7,
        formatter: Callable | None = None,
        na_rep: str = "NaN",
        space: str | int = 12,
        float_format: FloatFormatType | None = None,
        justify: str = "right",
        decimal: str = ".",
        quoting: int | None = None,
        fixed_width: bool = True,
        leading_space: bool | None = True,
        fallback_formatter: Callable | None = None,
    ) -> None:
        """Store the values and formatting options on the instance unchanged."""
        self.values = values
        self.digits = digits
        self.na_rep = na_rep
        self.space = space
        self.formatter = formatter
        self.float_format = float_format
        self.justify = justify
        self.decimal = decimal
        self.quoting = quoting
        self.fixed_width = fixed_width
        self.leading_space = leading_space
        self.fallback_formatter = fallback_formatter

    def get_result(self) -> list[str]:
        """Format the values and justify them with ``_make_fixed_width``."""
        fmt_values = self._format_strings()
        return _make_fixed_width(fmt_values, self.justify)

    def _format_strings(self) -> list[str]:
        """
        Format each element of ``self.values`` individually.

        Returns
        -------
        list of str

        Raises
        ------
        TypeError
            If ``self.values`` is not a ``np.ndarray``.
        """
        # Resolve the float formatter: instance setting, then the display
        # option, then a precision-based default that trims trailing zeros.
        if self.float_format is None:
            float_format = get_option("display.float_format")
            if float_format is None:
                precision = get_option("display.precision")
                float_format = lambda x: _trim_zeros_single_float(
                    f"{x: .{precision:d}f}"
                )
        else:
            float_format = self.float_format

        # Resolve the element formatter: explicit formatter, then the
        # fallback formatter, then pprint_thing.
        if self.formatter is not None:
            formatter = self.formatter
        elif self.fallback_formatter is not None:
            formatter = self.fallback_formatter
        else:
            quote_strings = self.quoting is not None and self.quoting != QUOTE_NONE
            formatter = partial(
                printing.pprint_thing,
                escape_chars=("\t", "\r", "\n"),
                quote_strings=quote_strings,
            )

        def _format(x):
            # Missing scalars keep a type-specific spelling; everything else
            # missing is rendered as na_rep.
            if self.na_rep is not None and is_scalar(x) and isna(x):
                if x is None:
                    return "None"
                elif x is NA:
                    return str(NA)
                elif lib.is_float(x) and np.isinf(x):
                    # TODO(3.0): this will be unreachable when use_inf_as_na
                    # deprecation is enforced
                    return str(x)
                elif x is NaT or isinstance(x, (np.datetime64, np.timedelta64)):
                    return "NaT"
                return self.na_rep
            elif isinstance(x, PandasObject):
                return str(x)
            elif isinstance(x, StringDtype) and x.na_value is NA:
                return repr(x)
            else:
                # object dtype
                return str(formatter(x))

        vals = self.values
        if not isinstance(vals, np.ndarray):
            raise TypeError(
                "ExtensionArray formatting should use _ExtensionArrayFormatter"
            )

        # An entry counts as float only if it is a float and contains no
        # missing value.
        inferred = lib.map_infer(vals, is_float)
        is_float_type = (
            inferred
            # vals may have 2 or more dimensions
            & np.all(notna(vals), axis=tuple(range(1, len(vals.shape))))
        )
        leading_space = self.leading_space
        if leading_space is None:
            # unspecified: use a leading space only if some entry is a float
            leading_space = is_float_type.any()

        fmt_values = []
        for i, v in enumerate(vals):
            if (not is_float_type[i] or self.formatter is not None) and leading_space:
                fmt_values.append(f" {_format(v)}")
            elif is_float_type[i]:
                fmt_values.append(float_format(v))
            else:
                if leading_space is False:
                    # False specifically, so that the default is
                    # to include a space if we get here.
                    tpl = "{v}"
                else:
                    tpl = " {v}"
                fmt_values.append(tpl.format(v=_format(v)))

        return fmt_values
class FloatArrayFormatter(_GenericArrayFormatter):
    """Formatter for float and complex arrays (see ``format_array``)."""

    def __init__(self, *args, **kwargs) -> None:
        """Forward all arguments to the base class, then normalize float_format."""
        super().__init__(*args, **kwargs)

        # float_format is expected to be a string
        # formatter should be used to pass a function
        if self.float_format is not None and self.formatter is None:
            # GH21625, GH22270
            self.fixed_width = False
            if callable(self.float_format):
                # a callable float_format is used as the element formatter
                self.formatter = self.float_format
                self.float_format = None

    def _value_formatter(
        self,
        float_format: FloatFormatType | None = None,
        threshold: float | None = None,
    ) -> Callable:
        """
        Returns a function to be applied on each value to format it

        Parameters
        ----------
        float_format : optional
            Overrides ``self.float_format`` when given.
        threshold : float, optional
            Non-missing values whose absolute value does not exceed it are
            formatted as ``0.0``.
        """
        # the float_format parameter supersedes self.float_format
        if float_format is None:
            float_format = self.float_format

        # we are going to compose different functions, to first convert to
        # a string, then replace the decimal symbol, and finally chop according
        # to the threshold

        # when there is no float_format, we use str instead of '%g'
        # because str(0.0) = '0.0' while '%g' % 0.0 = '0'
        if float_format:

            def base_formatter(v):
                assert float_format is not None  # for mypy
                # error: "str" not callable
                # error: Unexpected keyword argument "value" for "__call__" of
                # "EngFormatter"
                return (
                    float_format(value=v)  # type: ignore[operator,call-arg]
                    if notna(v)
                    else self.na_rep
                )

        else:

            def base_formatter(v):
                return str(v) if notna(v) else self.na_rep

        if self.decimal != ".":

            def decimal_formatter(v):
                # only the first "." is replaced by the decimal symbol
                return base_formatter(v).replace(".", self.decimal, 1)

        else:
            decimal_formatter = base_formatter

        if threshold is None:
            return decimal_formatter

        def formatter(value):
            if notna(value):
                if abs(value) > threshold:
                    return decimal_formatter(value)
                else:
                    return decimal_formatter(0.0)
            else:
                return self.na_rep

        return formatter

    def get_result_as_array(self) -> np.ndarray:
        """
        Returns the float values converted into strings using
        the parameters given at initialisation, as a numpy array
        """

        def format_with_na_rep(values: ArrayLike, formatter: Callable, na_rep: str):
            # apply ``formatter`` elementwise, substituting na_rep for
            # missing entries; the input shape is preserved
            mask = isna(values)
            formatted = np.array(
                [
                    formatter(val) if not m else na_rep
                    for val, m in zip(values.ravel(), mask.ravel())
                ]
            ).reshape(values.shape)
            return formatted

        def format_complex_with_na_rep(
            values: ArrayLike, formatter: Callable, na_rep: str
        ):
            # like format_with_na_rep, but the real and imaginary parts are
            # checked for missing values separately
            real_values = np.real(values).ravel()  # type: ignore[arg-type]
            imag_values = np.imag(values).ravel()  # type: ignore[arg-type]
            real_mask, imag_mask = isna(real_values), isna(imag_values)
            formatted_lst = []
            for val, real_val, imag_val, re_isna, im_isna in zip(
                values.ravel(),
                real_values,
                imag_values,
                real_mask,
                imag_mask,
            ):
                if not re_isna and not im_isna:
                    formatted_lst.append(formatter(val))
                elif not re_isna:  # xxx+nanj
                    formatted_lst.append(f"{formatter(real_val)}+{na_rep}j")
                elif not im_isna:  # nan[+/-]xxxj
                    # The imaginary part may either start with a "-" or a space
                    imag_formatted = formatter(imag_val).strip()
                    if imag_formatted.startswith("-"):
                        formatted_lst.append(f"{na_rep}{imag_formatted}j")
                    else:
                        formatted_lst.append(f"{na_rep}+{imag_formatted}j")
                else:  # nan+nanj
                    formatted_lst.append(f"{na_rep}+{na_rep}j")
            return np.array(formatted_lst).reshape(values.shape)

        # an explicit formatter bypasses all float-specific handling below
        if self.formatter is not None:
            return format_with_na_rep(self.values, self.formatter, self.na_rep)

        if self.fixed_width:
            threshold = get_option("display.chop_threshold")
        else:
            threshold = None

        # if we have a fixed_width, we'll need to try different float_format
        def format_values_with(float_format):
            formatter = self._value_formatter(float_format, threshold)

            # default formatter leaves a space to the left when formatting
            # floats, must be consistent for left-justifying NaNs (GH #25061)
            na_rep = " " + self.na_rep if self.justify == "left" else self.na_rep

            # different formatting strategies for complex and non-complex data
            # need to distinguish complex and float NaNs (GH #53762)
            values = self.values
            is_complex = is_complex_dtype(values)

            # separate the wheat from the chaff
            if is_complex:
                values = format_complex_with_na_rep(values, formatter, na_rep)
            else:
                values = format_with_na_rep(values, formatter, na_rep)

            if self.fixed_width:
                if is_complex:
                    result = _trim_zeros_complex(values, self.decimal)
                else:
                    result = _trim_zeros_float(values, self.decimal)
                return np.asarray(result, dtype="object")

            return values

        # There is a special default string when we are fixed-width
        # The default is otherwise to use str instead of a formatting string
        float_format: FloatFormatType | None
        if self.float_format is None:
            if self.fixed_width:
                if self.leading_space is True:
                    fmt_str = "{value: .{digits:d}f}"
                else:
                    fmt_str = "{value:.{digits:d}f}"
                float_format = partial(fmt_str.format, digits=self.digits)
            else:
                float_format = self.float_format
        else:
            # a non-None float_format is applied with the % operator
            float_format = lambda value: self.float_format % value

        formatted_values = format_values_with(float_format)

        if not self.fixed_width:
            return formatted_values

        # we need to convert to engineering format if some values are too small
        # and would appear as 0, or if some values are too big and take too
        # much space

        if len(formatted_values) > 0:
            maxlen = max(len(x) for x in formatted_values)
            too_long = maxlen > self.digits + 6
        else:
            too_long = False

        abs_vals = np.abs(self.values)
        # this is pretty arbitrary for now
        # large values: more than 8 characters including decimal symbol
        # and first digit, hence > 1e6
        has_large_values = (abs_vals > 1e6).any()
        has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()

        if has_small_values or (too_long and has_large_values):
            # second pass: same values, exponent notation
            if self.leading_space is True:
                fmt_str = "{value: .{digits:d}e}"
            else:
                fmt_str = "{value:.{digits:d}e}"
            float_format = partial(fmt_str.format, digits=self.digits)
            formatted_values = format_values_with(float_format)

        return formatted_values

    def _format_strings(self) -> list[str]:
        """Return ``get_result_as_array()`` converted to a list."""
        return list(self.get_result_as_array())
  1271. class _IntArrayFormatter(_GenericArrayFormatter):
  1272. def _format_strings(self) -> list[str]:
  1273. if self.leading_space is False:
  1274. formatter_str = lambda x: f"{x:d}".format(x=x)
  1275. else:
  1276. formatter_str = lambda x: f"{x: d}".format(x=x)
  1277. formatter = self.formatter or formatter_str
  1278. fmt_values = [formatter(x) for x in self.values]
  1279. return fmt_values
  1280. class _Datetime64Formatter(_GenericArrayFormatter):
  1281. values: DatetimeArray
  1282. def __init__(
  1283. self,
  1284. values: DatetimeArray,
  1285. nat_rep: str = "NaT",
  1286. date_format: None = None,
  1287. **kwargs,
  1288. ) -> None:
  1289. super().__init__(values, **kwargs)
  1290. self.nat_rep = nat_rep
  1291. self.date_format = date_format
  1292. def _format_strings(self) -> list[str]:
  1293. """we by definition have DO NOT have a TZ"""
  1294. values = self.values
  1295. if self.formatter is not None:
  1296. return [self.formatter(x) for x in values]
  1297. fmt_values = values._format_native_types(
  1298. na_rep=self.nat_rep, date_format=self.date_format
  1299. )
  1300. return fmt_values.tolist()
  1301. class _ExtensionArrayFormatter(_GenericArrayFormatter):
  1302. values: ExtensionArray
  1303. def _format_strings(self) -> list[str]:
  1304. values = self.values
  1305. formatter = self.formatter
  1306. fallback_formatter = None
  1307. if formatter is None:
  1308. fallback_formatter = values._formatter(boxed=True)
  1309. if isinstance(values, Categorical):
  1310. # Categorical is special for now, so that we can preserve tzinfo
  1311. array = values._internal_get_values()
  1312. else:
  1313. array = np.asarray(values, dtype=object)
  1314. fmt_values = format_array(
  1315. array,
  1316. formatter,
  1317. float_format=self.float_format,
  1318. na_rep=self.na_rep,
  1319. digits=self.digits,
  1320. space=self.space,
  1321. justify=self.justify,
  1322. decimal=self.decimal,
  1323. leading_space=self.leading_space,
  1324. quoting=self.quoting,
  1325. fallback_formatter=fallback_formatter,
  1326. )
  1327. return fmt_values
  1328. def format_percentiles(
  1329. percentiles: (np.ndarray | Sequence[float]),
  1330. ) -> list[str]:
  1331. """
  1332. Outputs rounded and formatted percentiles.
  1333. Parameters
  1334. ----------
  1335. percentiles : list-like, containing floats from interval [0,1]
  1336. Returns
  1337. -------
  1338. formatted : list of strings
  1339. Notes
  1340. -----
  1341. Rounding precision is chosen so that: (1) if any two elements of
  1342. ``percentiles`` differ, they remain different after rounding
  1343. (2) no entry is *rounded* to 0% or 100%.
  1344. Any non-integer is always rounded to at least 1 decimal place.
  1345. Examples
  1346. --------
  1347. Keeps all entries different after rounding:
  1348. >>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999])
  1349. ['1.999%', '2.001%', '50%', '66.667%', '99.99%']
  1350. No element is rounded to 0% or 100% (unless already equal to it).
  1351. Duplicates are allowed:
  1352. >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
  1353. ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']
  1354. """
  1355. percentiles = np.asarray(percentiles)
  1356. # It checks for np.nan as well
  1357. if (
  1358. not is_numeric_dtype(percentiles)
  1359. or not np.all(percentiles >= 0)
  1360. or not np.all(percentiles <= 1)
  1361. ):
  1362. raise ValueError("percentiles should all be in the interval [0,1]")
  1363. percentiles = 100 * percentiles
  1364. prec = get_precision(percentiles)
  1365. percentiles_round_type = percentiles.round(prec).astype(int)
  1366. int_idx = np.isclose(percentiles_round_type, percentiles)
  1367. if np.all(int_idx):
  1368. out = percentiles_round_type.astype(str)
  1369. return [i + "%" for i in out]
  1370. unique_pcts = np.unique(percentiles)
  1371. prec = get_precision(unique_pcts)
  1372. out = np.empty_like(percentiles, dtype=object)
  1373. out[int_idx] = percentiles[int_idx].round().astype(int).astype(str)
  1374. out[~int_idx] = percentiles[~int_idx].round(prec).astype(str)
  1375. return [i + "%" for i in out]
  1376. def get_precision(array: np.ndarray | Sequence[float]) -> int:
  1377. to_begin = array[0] if array[0] > 0 else None
  1378. to_end = 100 - array[-1] if array[-1] < 100 else None
  1379. diff = np.ediff1d(array, to_begin=to_begin, to_end=to_end)
  1380. diff = abs(diff)
  1381. prec = -np.floor(np.log10(np.min(diff))).astype(int)
  1382. prec = max(1, prec)
  1383. return prec
  1384. def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str:
  1385. if x is NaT:
  1386. return nat_rep
  1387. # Timestamp.__str__ falls back to datetime.datetime.__str__ = isoformat(sep=' ')
  1388. # so it already uses string formatting rather than strftime (faster).
  1389. return str(x)
  1390. def _format_datetime64_dateonly(
  1391. x: NaTType | Timestamp,
  1392. nat_rep: str = "NaT",
  1393. date_format: str | None = None,
  1394. ) -> str:
  1395. if isinstance(x, NaTType):
  1396. return nat_rep
  1397. if date_format:
  1398. return x.strftime(date_format)
  1399. else:
  1400. # Timestamp._date_repr relies on string formatting (faster than strftime)
  1401. return x._date_repr
  1402. def get_format_datetime64(
  1403. is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None
  1404. ) -> Callable:
  1405. """Return a formatter callable taking a datetime64 as input and providing
  1406. a string as output"""
  1407. if is_dates_only:
  1408. return lambda x: _format_datetime64_dateonly(
  1409. x, nat_rep=nat_rep, date_format=date_format
  1410. )
  1411. else:
  1412. return lambda x: _format_datetime64(x, nat_rep=nat_rep)
  1413. class _Datetime64TZFormatter(_Datetime64Formatter):
  1414. values: DatetimeArray
  1415. def _format_strings(self) -> list[str]:
  1416. """we by definition have a TZ"""
  1417. ido = self.values._is_dates_only
  1418. values = self.values.astype(object)
  1419. formatter = self.formatter or get_format_datetime64(
  1420. ido, date_format=self.date_format
  1421. )
  1422. fmt_values = [formatter(x) for x in values]
  1423. return fmt_values
  1424. class _Timedelta64Formatter(_GenericArrayFormatter):
  1425. values: TimedeltaArray
  1426. def __init__(
  1427. self,
  1428. values: TimedeltaArray,
  1429. nat_rep: str = "NaT",
  1430. **kwargs,
  1431. ) -> None:
  1432. # TODO: nat_rep is never passed, na_rep is.
  1433. super().__init__(values, **kwargs)
  1434. self.nat_rep = nat_rep
  1435. def _format_strings(self) -> list[str]:
  1436. formatter = self.formatter or get_format_timedelta64(
  1437. self.values, nat_rep=self.nat_rep, box=False
  1438. )
  1439. return [formatter(x) for x in self.values]
  1440. def get_format_timedelta64(
  1441. values: TimedeltaArray,
  1442. nat_rep: str | float = "NaT",
  1443. box: bool = False,
  1444. ) -> Callable:
  1445. """
  1446. Return a formatter function for a range of timedeltas.
  1447. These will all have the same format argument
  1448. If box, then show the return in quotes
  1449. """
  1450. even_days = values._is_dates_only
  1451. if even_days:
  1452. format = None
  1453. else:
  1454. format = "long"
  1455. def _formatter(x):
  1456. if x is None or (is_scalar(x) and isna(x)):
  1457. return nat_rep
  1458. if not isinstance(x, Timedelta):
  1459. x = Timedelta(x)
  1460. # Timedelta._repr_base uses string formatting (faster than strftime)
  1461. result = x._repr_base(format=format)
  1462. if box:
  1463. result = f"'{result}'"
  1464. return result
  1465. return _formatter
  1466. def _make_fixed_width(
  1467. strings: list[str],
  1468. justify: str = "right",
  1469. minimum: int | None = None,
  1470. adj: printing._TextAdjustment | None = None,
  1471. ) -> list[str]:
  1472. if len(strings) == 0 or justify == "all":
  1473. return strings
  1474. if adj is None:
  1475. adjustment = printing.get_adjustment()
  1476. else:
  1477. adjustment = adj
  1478. max_len = max(adjustment.len(x) for x in strings)
  1479. if minimum is not None:
  1480. max_len = max(minimum, max_len)
  1481. conf_max = get_option("display.max_colwidth")
  1482. if conf_max is not None and max_len > conf_max:
  1483. max_len = conf_max
  1484. def just(x: str) -> str:
  1485. if conf_max is not None:
  1486. if (conf_max > 3) & (adjustment.len(x) > max_len):
  1487. x = x[: max_len - 3] + "..."
  1488. return x
  1489. strings = [just(x) for x in strings]
  1490. result = adjustment.justify(strings, max_len, mode=justify)
  1491. return result
  1492. def _trim_zeros_complex(str_complexes: ArrayLike, decimal: str = ".") -> list[str]:
  1493. """
  1494. Separates the real and imaginary parts from the complex number, and
  1495. executes the _trim_zeros_float method on each of those.
  1496. """
  1497. real_part, imag_part = [], []
  1498. for x in str_complexes:
  1499. # Complex numbers are represented as "(-)xxx(+/-)xxxj"
  1500. # The split will give [{"", "-"}, "xxx", "+/-", "xxx", "j", ""]
  1501. # Therefore, the imaginary part is the 4th and 3rd last elements,
  1502. # and the real part is everything before the imaginary part
  1503. trimmed = re.split(r"([j+-])", x)
  1504. real_part.append("".join(trimmed[:-4]))
  1505. imag_part.append("".join(trimmed[-4:-2]))
  1506. # We want to align the lengths of the real and imaginary parts of each complex
  1507. # number, as well as the lengths the real (resp. complex) parts of all numbers
  1508. # in the array
  1509. n = len(str_complexes)
  1510. padded_parts = _trim_zeros_float(real_part + imag_part, decimal)
  1511. if len(padded_parts) == 0:
  1512. return []
  1513. padded_length = max(len(part) for part in padded_parts) - 1
  1514. padded = [
  1515. real_pt # real part, possibly NaN
  1516. + imag_pt[0] # +/-
  1517. + f"{imag_pt[1:]:>{padded_length}}" # complex part (no sign), possibly nan
  1518. + "j"
  1519. for real_pt, imag_pt in zip(padded_parts[:n], padded_parts[n:])
  1520. ]
  1521. return padded
  1522. def _trim_zeros_single_float(str_float: str) -> str:
  1523. """
  1524. Trims trailing zeros after a decimal point,
  1525. leaving just one if necessary.
  1526. """
  1527. str_float = str_float.rstrip("0")
  1528. if str_float.endswith("."):
  1529. str_float += "0"
  1530. return str_float
  1531. def _trim_zeros_float(
  1532. str_floats: ArrayLike | list[str], decimal: str = "."
  1533. ) -> list[str]:
  1534. """
  1535. Trims the maximum number of trailing zeros equally from
  1536. all numbers containing decimals, leaving just one if
  1537. necessary.
  1538. """
  1539. trimmed = str_floats
  1540. number_regex = re.compile(rf"^\s*[\+-]?[0-9]+\{decimal}[0-9]*$")
  1541. def is_number_with_decimal(x) -> bool:
  1542. return re.match(number_regex, x) is not None
  1543. def should_trim(values: ArrayLike | list[str]) -> bool:
  1544. """
  1545. Determine if an array of strings should be trimmed.
  1546. Returns True if all numbers containing decimals (defined by the
  1547. above regular expression) within the array end in a zero, otherwise
  1548. returns False.
  1549. """
  1550. numbers = [x for x in values if is_number_with_decimal(x)]
  1551. return len(numbers) > 0 and all(x.endswith("0") for x in numbers)
  1552. while should_trim(trimmed):
  1553. trimmed = [x[:-1] if is_number_with_decimal(x) else x for x in trimmed]
  1554. # leave one 0 after the decimal points if need be.
  1555. result = [
  1556. x + "0" if is_number_with_decimal(x) and x.endswith(decimal) else x
  1557. for x in trimmed
  1558. ]
  1559. return result
  1560. def _has_names(index: Index) -> bool:
  1561. if isinstance(index, MultiIndex):
  1562. return com.any_not_none(*index.names)
  1563. else:
  1564. return index.name is not None
  1565. class EngFormatter:
  1566. """
  1567. Formats float values according to engineering format.
  1568. Based on matplotlib.ticker.EngFormatter
  1569. """
  1570. # The SI engineering prefixes
  1571. ENG_PREFIXES = {
  1572. -24: "y",
  1573. -21: "z",
  1574. -18: "a",
  1575. -15: "f",
  1576. -12: "p",
  1577. -9: "n",
  1578. -6: "u",
  1579. -3: "m",
  1580. 0: "",
  1581. 3: "k",
  1582. 6: "M",
  1583. 9: "G",
  1584. 12: "T",
  1585. 15: "P",
  1586. 18: "E",
  1587. 21: "Z",
  1588. 24: "Y",
  1589. }
  1590. def __init__(
  1591. self, accuracy: int | None = None, use_eng_prefix: bool = False
  1592. ) -> None:
  1593. self.accuracy = accuracy
  1594. self.use_eng_prefix = use_eng_prefix
  1595. def __call__(self, num: float) -> str:
  1596. """
  1597. Formats a number in engineering notation, appending a letter
  1598. representing the power of 1000 of the original number. Some examples:
  1599. >>> format_eng = EngFormatter(accuracy=0, use_eng_prefix=True)
  1600. >>> format_eng(0)
  1601. ' 0'
  1602. >>> format_eng = EngFormatter(accuracy=1, use_eng_prefix=True)
  1603. >>> format_eng(1_000_000)
  1604. ' 1.0M'
  1605. >>> format_eng = EngFormatter(accuracy=2, use_eng_prefix=False)
  1606. >>> format_eng("-1e-6")
  1607. '-1.00E-06'
  1608. @param num: the value to represent
  1609. @type num: either a numeric value or a string that can be converted to
  1610. a numeric value (as per decimal.Decimal constructor)
  1611. @return: engineering formatted string
  1612. """
  1613. dnum = Decimal(str(num))
  1614. if Decimal.is_nan(dnum):
  1615. return "NaN"
  1616. if Decimal.is_infinite(dnum):
  1617. return "inf"
  1618. sign = 1
  1619. if dnum < 0: # pragma: no cover
  1620. sign = -1
  1621. dnum = -dnum
  1622. if dnum != 0:
  1623. pow10 = Decimal(int(math.floor(dnum.log10() / 3) * 3))
  1624. else:
  1625. pow10 = Decimal(0)
  1626. pow10 = pow10.min(max(self.ENG_PREFIXES.keys()))
  1627. pow10 = pow10.max(min(self.ENG_PREFIXES.keys()))
  1628. int_pow10 = int(pow10)
  1629. if self.use_eng_prefix:
  1630. prefix = self.ENG_PREFIXES[int_pow10]
  1631. elif int_pow10 < 0:
  1632. prefix = f"E-{-int_pow10:02d}"
  1633. else:
  1634. prefix = f"E+{int_pow10:02d}"
  1635. mant = sign * dnum / (10**pow10)
  1636. if self.accuracy is None: # pragma: no cover
  1637. format_str = "{mant: g}{prefix}"
  1638. else:
  1639. format_str = f"{{mant: .{self.accuracy:d}f}}{{prefix}}"
  1640. formatted = format_str.format(mant=mant, prefix=prefix)
  1641. return formatted
  1642. def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> None:
  1643. """
  1644. Format float representation in DataFrame with SI notation.
  1645. Parameters
  1646. ----------
  1647. accuracy : int, default 3
  1648. Number of decimal digits after the floating point.
  1649. use_eng_prefix : bool, default False
  1650. Whether to represent a value with SI prefixes.
  1651. Returns
  1652. -------
  1653. None
  1654. Examples
  1655. --------
  1656. >>> df = pd.DataFrame([1e-9, 1e-3, 1, 1e3, 1e6])
  1657. >>> df
  1658. 0
  1659. 0 1.000000e-09
  1660. 1 1.000000e-03
  1661. 2 1.000000e+00
  1662. 3 1.000000e+03
  1663. 4 1.000000e+06
  1664. >>> pd.set_eng_float_format(accuracy=1)
  1665. >>> df
  1666. 0
  1667. 0 1.0E-09
  1668. 1 1.0E-03
  1669. 2 1.0E+00
  1670. 3 1.0E+03
  1671. 4 1.0E+06
  1672. >>> pd.set_eng_float_format(use_eng_prefix=True)
  1673. >>> df
  1674. 0
  1675. 0 1.000n
  1676. 1 1.000m
  1677. 2 1.000
  1678. 3 1.000k
  1679. 4 1.000M
  1680. >>> pd.set_eng_float_format(accuracy=1, use_eng_prefix=True)
  1681. >>> df
  1682. 0
  1683. 0 1.0n
  1684. 1 1.0m
  1685. 2 1.0
  1686. 3 1.0k
  1687. 4 1.0M
  1688. >>> pd.set_option("display.float_format", None) # unset option
  1689. """
  1690. set_option("display.float_format", EngFormatter(accuracy, use_eng_prefix))
  1691. def get_level_lengths(
  1692. levels: Any, sentinel: bool | object | str = ""
  1693. ) -> list[dict[int, int]]:
  1694. """
  1695. For each index in each level the function returns lengths of indexes.
  1696. Parameters
  1697. ----------
  1698. levels : list of lists
  1699. List of values on for level.
  1700. sentinel : string, optional
  1701. Value which states that no new index starts on there.
  1702. Returns
  1703. -------
  1704. Returns list of maps. For each level returns map of indexes (key is index
  1705. in row and value is length of index).
  1706. """
  1707. if len(levels) == 0:
  1708. return []
  1709. control = [True] * len(levels[0])
  1710. result = []
  1711. for level in levels:
  1712. last_index = 0
  1713. lengths = {}
  1714. for i, key in enumerate(level):
  1715. if control[i] and key == sentinel:
  1716. pass
  1717. else:
  1718. control[i] = False
  1719. lengths[last_index] = i - last_index
  1720. last_index = i
  1721. lengths[last_index] = len(level) - last_index
  1722. result.append(lengths)
  1723. return result
  1724. def buffer_put_lines(buf: WriteBuffer[str], lines: list[str]) -> None:
  1725. """
  1726. Appends lines to a buffer.
  1727. Parameters
  1728. ----------
  1729. buf
  1730. The buffer to write to
  1731. lines
  1732. The lines to append.
  1733. """
  1734. if any(isinstance(x, str) for x in lines):
  1735. lines = [str(x) for x in lines]
  1736. buf.write("\n".join(lines))