html.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646
  1. """
  2. Module for formatting output data in HTML.
  3. """
  4. from __future__ import annotations
  5. from textwrap import dedent
  6. from typing import (
  7. TYPE_CHECKING,
  8. Any,
  9. Final,
  10. cast,
  11. )
  12. from pandas._config import get_option
  13. from pandas._libs import lib
  14. from pandas import (
  15. MultiIndex,
  16. option_context,
  17. )
  18. from pandas.io.common import is_url
  19. from pandas.io.formats.format import (
  20. DataFrameFormatter,
  21. get_level_lengths,
  22. )
  23. from pandas.io.formats.printing import pprint_thing
  24. if TYPE_CHECKING:
  25. from collections.abc import (
  26. Hashable,
  27. Iterable,
  28. Mapping,
  29. )
  30. class HTMLFormatter:
  31. """
  32. Internal class for formatting output data in html.
  33. This class is intended for shared functionality between
  34. DataFrame.to_html() and DataFrame._repr_html_().
  35. Any logic in common with other output formatting methods
  36. should ideally be inherited from classes in format.py
  37. and this class responsible for only producing html markup.
  38. """
  39. indent_delta: Final = 2
  40. def __init__(
  41. self,
  42. formatter: DataFrameFormatter,
  43. classes: str | list[str] | tuple[str, ...] | None = None,
  44. border: int | bool | None = None,
  45. table_id: str | None = None,
  46. render_links: bool = False,
  47. ) -> None:
  48. self.fmt = formatter
  49. self.classes = classes
  50. self.frame = self.fmt.frame
  51. self.columns = self.fmt.tr_frame.columns
  52. self.elements: list[str] = []
  53. self.bold_rows = self.fmt.bold_rows
  54. self.escape = self.fmt.escape
  55. self.show_dimensions = self.fmt.show_dimensions
  56. if border is None or border is True:
  57. border = cast(int, get_option("display.html.border"))
  58. elif not border:
  59. border = None
  60. self.border = border
  61. self.table_id = table_id
  62. self.render_links = render_links
  63. self.col_space = {}
  64. is_multi_index = isinstance(self.columns, MultiIndex)
  65. for column, value in self.fmt.col_space.items():
  66. col_space_value = f"{value}px" if isinstance(value, int) else value
  67. self.col_space[column] = col_space_value
  68. # GH 53885: Handling case where column is index
  69. # Flatten the data in the multi index and add in the map
  70. if is_multi_index and isinstance(column, tuple):
  71. for column_index in column:
  72. self.col_space[str(column_index)] = col_space_value
  73. def to_string(self) -> str:
  74. lines = self.render()
  75. if any(isinstance(x, str) for x in lines):
  76. lines = [str(x) for x in lines]
  77. return "\n".join(lines)
  78. def render(self) -> list[str]:
  79. self._write_table()
  80. if self.should_show_dimensions:
  81. by = chr(215) # × # noqa: RUF003
  82. self.write(
  83. f"<p>{len(self.frame)} rows {by} {len(self.frame.columns)} columns</p>"
  84. )
  85. return self.elements
  86. @property
  87. def should_show_dimensions(self) -> bool:
  88. return self.fmt.should_show_dimensions
  89. @property
  90. def show_row_idx_names(self) -> bool:
  91. return self.fmt.show_row_idx_names
  92. @property
  93. def show_col_idx_names(self) -> bool:
  94. return self.fmt.show_col_idx_names
  95. @property
  96. def row_levels(self) -> int:
  97. if self.fmt.index:
  98. # showing (row) index
  99. return self.frame.index.nlevels
  100. elif self.show_col_idx_names:
  101. # see gh-22579
  102. # Column misalignment also occurs for
  103. # a standard index when the columns index is named.
  104. # If the row index is not displayed a column of
  105. # blank cells need to be included before the DataFrame values.
  106. return 1
  107. # not showing (row) index
  108. return 0
  109. def _get_columns_formatted_values(self) -> Iterable:
  110. return self.columns
  111. @property
  112. def is_truncated(self) -> bool:
  113. return self.fmt.is_truncated
  114. @property
  115. def ncols(self) -> int:
  116. return len(self.fmt.tr_frame.columns)
  117. def write(self, s: Any, indent: int = 0) -> None:
  118. rs = pprint_thing(s)
  119. self.elements.append(" " * indent + rs)
  120. def write_th(
  121. self, s: Any, header: bool = False, indent: int = 0, tags: str | None = None
  122. ) -> None:
  123. """
  124. Method for writing a formatted <th> cell.
  125. If col_space is set on the formatter then that is used for
  126. the value of min-width.
  127. Parameters
  128. ----------
  129. s : object
  130. The data to be written inside the cell.
  131. header : bool, default False
  132. Set to True if the <th> is for use inside <thead>. This will
  133. cause min-width to be set if there is one.
  134. indent : int, default 0
  135. The indentation level of the cell.
  136. tags : str, default None
  137. Tags to include in the cell.
  138. Returns
  139. -------
  140. A written <th> cell.
  141. """
  142. col_space = self.col_space.get(s, None)
  143. if header and col_space is not None:
  144. tags = tags or ""
  145. tags += f'style="min-width: {col_space};"'
  146. self._write_cell(s, kind="th", indent=indent, tags=tags)
  147. def write_td(self, s: Any, indent: int = 0, tags: str | None = None) -> None:
  148. self._write_cell(s, kind="td", indent=indent, tags=tags)
  149. def _write_cell(
  150. self, s: Any, kind: str = "td", indent: int = 0, tags: str | None = None
  151. ) -> None:
  152. if tags is not None:
  153. start_tag = f"<{kind} {tags}>"
  154. else:
  155. start_tag = f"<{kind}>"
  156. if self.escape:
  157. # escape & first to prevent double escaping of &
  158. esc = {"&": r"&amp;", "<": r"&lt;", ">": r"&gt;"}
  159. else:
  160. esc = {}
  161. rs = pprint_thing(s, escape_chars=esc).strip()
  162. if self.render_links and is_url(rs):
  163. rs_unescaped = pprint_thing(s, escape_chars={}).strip()
  164. start_tag += f'<a href="{rs_unescaped}" target="_blank">'
  165. end_a = "</a>"
  166. else:
  167. end_a = ""
  168. self.write(f"{start_tag}{rs}{end_a}</{kind}>", indent)
  169. def write_tr(
  170. self,
  171. line: Iterable,
  172. indent: int = 0,
  173. indent_delta: int = 0,
  174. header: bool = False,
  175. align: str | None = None,
  176. tags: dict[int, str] | None = None,
  177. nindex_levels: int = 0,
  178. ) -> None:
  179. if tags is None:
  180. tags = {}
  181. if align is None:
  182. self.write("<tr>", indent)
  183. else:
  184. self.write(f'<tr style="text-align: {align};">', indent)
  185. indent += indent_delta
  186. for i, s in enumerate(line):
  187. val_tag = tags.get(i, None)
  188. if header or (self.bold_rows and i < nindex_levels):
  189. self.write_th(s, indent=indent, header=header, tags=val_tag)
  190. else:
  191. self.write_td(s, indent, tags=val_tag)
  192. indent -= indent_delta
  193. self.write("</tr>", indent)
  194. def _write_table(self, indent: int = 0) -> None:
  195. _classes = ["dataframe"] # Default class.
  196. use_mathjax = get_option("display.html.use_mathjax")
  197. if not use_mathjax:
  198. _classes.append("tex2jax_ignore")
  199. if self.classes is not None:
  200. if isinstance(self.classes, str):
  201. self.classes = self.classes.split()
  202. if not isinstance(self.classes, (list, tuple)):
  203. raise TypeError(
  204. "classes must be a string, list, "
  205. f"or tuple, not {type(self.classes)}"
  206. )
  207. _classes.extend(self.classes)
  208. if self.table_id is None:
  209. id_section = ""
  210. else:
  211. id_section = f' id="{self.table_id}"'
  212. if self.border is None:
  213. border_attr = ""
  214. else:
  215. border_attr = f' border="{self.border}"'
  216. self.write(
  217. f'<table{border_attr} class="{" ".join(_classes)}"{id_section}>',
  218. indent,
  219. )
  220. if self.fmt.header or self.show_row_idx_names:
  221. self._write_header(indent + self.indent_delta)
  222. self._write_body(indent + self.indent_delta)
  223. self.write("</table>", indent)
  224. def _write_col_header(self, indent: int) -> None:
  225. row: list[Hashable]
  226. is_truncated_horizontally = self.fmt.is_truncated_horizontally
  227. if isinstance(self.columns, MultiIndex):
  228. template = 'colspan="{span:d}" halign="left"'
  229. sentinel: lib.NoDefault | bool
  230. if self.fmt.sparsify:
  231. # GH3547
  232. sentinel = lib.no_default
  233. else:
  234. sentinel = False
  235. levels = self.columns._format_multi(sparsify=sentinel, include_names=False)
  236. level_lengths = get_level_lengths(levels, sentinel)
  237. inner_lvl = len(level_lengths) - 1
  238. for lnum, (records, values) in enumerate(zip(level_lengths, levels)):
  239. if is_truncated_horizontally:
  240. # modify the header lines
  241. ins_col = self.fmt.tr_col_num
  242. if self.fmt.sparsify:
  243. recs_new = {}
  244. # Increment tags after ... col.
  245. for tag, span in list(records.items()):
  246. if tag >= ins_col:
  247. recs_new[tag + 1] = span
  248. elif tag + span > ins_col:
  249. recs_new[tag] = span + 1
  250. if lnum == inner_lvl:
  251. values = (
  252. values[:ins_col] + ("...",) + values[ins_col:]
  253. )
  254. else:
  255. # sparse col headers do not receive a ...
  256. values = (
  257. values[:ins_col]
  258. + (values[ins_col - 1],)
  259. + values[ins_col:]
  260. )
  261. else:
  262. recs_new[tag] = span
  263. # if ins_col lies between tags, all col headers
  264. # get ...
  265. if tag + span == ins_col:
  266. recs_new[ins_col] = 1
  267. values = values[:ins_col] + ("...",) + values[ins_col:]
  268. records = recs_new
  269. inner_lvl = len(level_lengths) - 1
  270. if lnum == inner_lvl:
  271. records[ins_col] = 1
  272. else:
  273. recs_new = {}
  274. for tag, span in list(records.items()):
  275. if tag >= ins_col:
  276. recs_new[tag + 1] = span
  277. else:
  278. recs_new[tag] = span
  279. recs_new[ins_col] = 1
  280. records = recs_new
  281. values = values[:ins_col] + ["..."] + values[ins_col:]
  282. # see gh-22579
  283. # Column Offset Bug with to_html(index=False) with
  284. # MultiIndex Columns and Index.
  285. # Initially fill row with blank cells before column names.
  286. # TODO: Refactor to remove code duplication with code
  287. # block below for standard columns index.
  288. row = [""] * (self.row_levels - 1)
  289. if self.fmt.index or self.show_col_idx_names:
  290. # see gh-22747
  291. # If to_html(index_names=False) do not show columns
  292. # index names.
  293. # TODO: Refactor to use _get_column_name_list from
  294. # DataFrameFormatter class and create a
  295. # _get_formatted_column_labels function for code
  296. # parity with DataFrameFormatter class.
  297. if self.fmt.show_index_names:
  298. name = self.columns.names[lnum]
  299. row.append(pprint_thing(name or ""))
  300. else:
  301. row.append("")
  302. tags = {}
  303. j = len(row)
  304. for i, v in enumerate(values):
  305. if i in records:
  306. if records[i] > 1:
  307. tags[j] = template.format(span=records[i])
  308. else:
  309. continue
  310. j += 1
  311. row.append(v)
  312. self.write_tr(row, indent, self.indent_delta, tags=tags, header=True)
  313. else:
  314. # see gh-22579
  315. # Column misalignment also occurs for
  316. # a standard index when the columns index is named.
  317. # Initially fill row with blank cells before column names.
  318. # TODO: Refactor to remove code duplication with code block
  319. # above for columns MultiIndex.
  320. row = [""] * (self.row_levels - 1)
  321. if self.fmt.index or self.show_col_idx_names:
  322. # see gh-22747
  323. # If to_html(index_names=False) do not show columns
  324. # index names.
  325. # TODO: Refactor to use _get_column_name_list from
  326. # DataFrameFormatter class.
  327. if self.fmt.show_index_names:
  328. row.append(self.columns.name or "")
  329. else:
  330. row.append("")
  331. row.extend(self._get_columns_formatted_values())
  332. align = self.fmt.justify
  333. if is_truncated_horizontally:
  334. ins_col = self.row_levels + self.fmt.tr_col_num
  335. row.insert(ins_col, "...")
  336. self.write_tr(row, indent, self.indent_delta, header=True, align=align)
  337. def _write_row_header(self, indent: int) -> None:
  338. is_truncated_horizontally = self.fmt.is_truncated_horizontally
  339. row = [x if x is not None else "" for x in self.frame.index.names] + [""] * (
  340. self.ncols + (1 if is_truncated_horizontally else 0)
  341. )
  342. self.write_tr(row, indent, self.indent_delta, header=True)
  343. def _write_header(self, indent: int) -> None:
  344. self.write("<thead>", indent)
  345. if self.fmt.header:
  346. self._write_col_header(indent + self.indent_delta)
  347. if self.show_row_idx_names:
  348. self._write_row_header(indent + self.indent_delta)
  349. self.write("</thead>", indent)
  350. def _get_formatted_values(self) -> dict[int, list[str]]:
  351. with option_context("display.max_colwidth", None):
  352. fmt_values = {i: self.fmt.format_col(i) for i in range(self.ncols)}
  353. return fmt_values
  354. def _write_body(self, indent: int) -> None:
  355. self.write("<tbody>", indent)
  356. fmt_values = self._get_formatted_values()
  357. # write values
  358. if self.fmt.index and isinstance(self.frame.index, MultiIndex):
  359. self._write_hierarchical_rows(fmt_values, indent + self.indent_delta)
  360. else:
  361. self._write_regular_rows(fmt_values, indent + self.indent_delta)
  362. self.write("</tbody>", indent)
  363. def _write_regular_rows(
  364. self, fmt_values: Mapping[int, list[str]], indent: int
  365. ) -> None:
  366. is_truncated_horizontally = self.fmt.is_truncated_horizontally
  367. is_truncated_vertically = self.fmt.is_truncated_vertically
  368. nrows = len(self.fmt.tr_frame)
  369. if self.fmt.index:
  370. fmt = self.fmt._get_formatter("__index__")
  371. if fmt is not None:
  372. index_values = self.fmt.tr_frame.index.map(fmt)
  373. else:
  374. # only reached with non-Multi index
  375. index_values = self.fmt.tr_frame.index._format_flat(include_name=False)
  376. row: list[str] = []
  377. for i in range(nrows):
  378. if is_truncated_vertically and i == (self.fmt.tr_row_num):
  379. str_sep_row = ["..."] * len(row)
  380. self.write_tr(
  381. str_sep_row,
  382. indent,
  383. self.indent_delta,
  384. tags=None,
  385. nindex_levels=self.row_levels,
  386. )
  387. row = []
  388. if self.fmt.index:
  389. row.append(index_values[i])
  390. # see gh-22579
  391. # Column misalignment also occurs for
  392. # a standard index when the columns index is named.
  393. # Add blank cell before data cells.
  394. elif self.show_col_idx_names:
  395. row.append("")
  396. row.extend(fmt_values[j][i] for j in range(self.ncols))
  397. if is_truncated_horizontally:
  398. dot_col_ix = self.fmt.tr_col_num + self.row_levels
  399. row.insert(dot_col_ix, "...")
  400. self.write_tr(
  401. row, indent, self.indent_delta, tags=None, nindex_levels=self.row_levels
  402. )
  403. def _write_hierarchical_rows(
  404. self, fmt_values: Mapping[int, list[str]], indent: int
  405. ) -> None:
  406. template = 'rowspan="{span}" valign="top"'
  407. is_truncated_horizontally = self.fmt.is_truncated_horizontally
  408. is_truncated_vertically = self.fmt.is_truncated_vertically
  409. frame = self.fmt.tr_frame
  410. nrows = len(frame)
  411. assert isinstance(frame.index, MultiIndex)
  412. idx_values = frame.index._format_multi(sparsify=False, include_names=False)
  413. idx_values = list(zip(*idx_values))
  414. if self.fmt.sparsify:
  415. # GH3547
  416. sentinel = lib.no_default
  417. levels = frame.index._format_multi(sparsify=sentinel, include_names=False)
  418. level_lengths = get_level_lengths(levels, sentinel)
  419. inner_lvl = len(level_lengths) - 1
  420. if is_truncated_vertically:
  421. # Insert ... row and adjust idx_values and
  422. # level_lengths to take this into account.
  423. ins_row = self.fmt.tr_row_num
  424. inserted = False
  425. for lnum, records in enumerate(level_lengths):
  426. rec_new = {}
  427. for tag, span in list(records.items()):
  428. if tag >= ins_row:
  429. rec_new[tag + 1] = span
  430. elif tag + span > ins_row:
  431. rec_new[tag] = span + 1
  432. # GH 14882 - Make sure insertion done once
  433. if not inserted:
  434. dot_row = list(idx_values[ins_row - 1])
  435. dot_row[-1] = "..."
  436. idx_values.insert(ins_row, tuple(dot_row))
  437. inserted = True
  438. else:
  439. dot_row = list(idx_values[ins_row])
  440. dot_row[inner_lvl - lnum] = "..."
  441. idx_values[ins_row] = tuple(dot_row)
  442. else:
  443. rec_new[tag] = span
  444. # If ins_row lies between tags, all cols idx cols
  445. # receive ...
  446. if tag + span == ins_row:
  447. rec_new[ins_row] = 1
  448. if lnum == 0:
  449. idx_values.insert(
  450. ins_row, tuple(["..."] * len(level_lengths))
  451. )
  452. # GH 14882 - Place ... in correct level
  453. elif inserted:
  454. dot_row = list(idx_values[ins_row])
  455. dot_row[inner_lvl - lnum] = "..."
  456. idx_values[ins_row] = tuple(dot_row)
  457. level_lengths[lnum] = rec_new
  458. level_lengths[inner_lvl][ins_row] = 1
  459. for ix_col in fmt_values:
  460. fmt_values[ix_col].insert(ins_row, "...")
  461. nrows += 1
  462. for i in range(nrows):
  463. row = []
  464. tags = {}
  465. sparse_offset = 0
  466. j = 0
  467. for records, v in zip(level_lengths, idx_values[i]):
  468. if i in records:
  469. if records[i] > 1:
  470. tags[j] = template.format(span=records[i])
  471. else:
  472. sparse_offset += 1
  473. continue
  474. j += 1
  475. row.append(v)
  476. row.extend(fmt_values[j][i] for j in range(self.ncols))
  477. if is_truncated_horizontally:
  478. row.insert(
  479. self.row_levels - sparse_offset + self.fmt.tr_col_num, "..."
  480. )
  481. self.write_tr(
  482. row,
  483. indent,
  484. self.indent_delta,
  485. tags=tags,
  486. nindex_levels=len(levels) - sparse_offset,
  487. )
  488. else:
  489. row = []
  490. for i in range(len(frame)):
  491. if is_truncated_vertically and i == (self.fmt.tr_row_num):
  492. str_sep_row = ["..."] * len(row)
  493. self.write_tr(
  494. str_sep_row,
  495. indent,
  496. self.indent_delta,
  497. tags=None,
  498. nindex_levels=self.row_levels,
  499. )
  500. idx_values = list(
  501. zip(*frame.index._format_multi(sparsify=False, include_names=False))
  502. )
  503. row = []
  504. row.extend(idx_values[i])
  505. row.extend(fmt_values[j][i] for j in range(self.ncols))
  506. if is_truncated_horizontally:
  507. row.insert(self.row_levels + self.fmt.tr_col_num, "...")
  508. self.write_tr(
  509. row,
  510. indent,
  511. self.indent_delta,
  512. tags=None,
  513. nindex_levels=frame.index.nlevels,
  514. )
  515. class NotebookFormatter(HTMLFormatter):
  516. """
  517. Internal class for formatting output data in html for display in Jupyter
  518. Notebooks. This class is intended for functionality specific to
  519. DataFrame._repr_html_() and DataFrame.to_html(notebook=True)
  520. """
  521. def _get_formatted_values(self) -> dict[int, list[str]]:
  522. return {i: self.fmt.format_col(i) for i in range(self.ncols)}
  523. def _get_columns_formatted_values(self) -> list[str]:
  524. # only reached with non-Multi Index
  525. return self.columns._format_flat(include_name=False)
  526. def write_style(self) -> None:
  527. # We use the "scoped" attribute here so that the desired
  528. # style properties for the data frame are not then applied
  529. # throughout the entire notebook.
  530. template_first = """\
  531. <style scoped>"""
  532. template_last = """\
  533. </style>"""
  534. template_select = """\
  535. .dataframe %s {
  536. %s: %s;
  537. }"""
  538. element_props = [
  539. ("tbody tr th:only-of-type", "vertical-align", "middle"),
  540. ("tbody tr th", "vertical-align", "top"),
  541. ]
  542. if isinstance(self.columns, MultiIndex):
  543. element_props.append(("thead tr th", "text-align", "left"))
  544. if self.show_row_idx_names:
  545. element_props.append(
  546. ("thead tr:last-of-type th", "text-align", "right")
  547. )
  548. else:
  549. element_props.append(("thead th", "text-align", "right"))
  550. template_mid = "\n\n".join(template_select % t for t in element_props)
  551. template = dedent(f"{template_first}\n{template_mid}\n{template_last}")
  552. self.write(template)
  553. def render(self) -> list[str]:
  554. self.write("<div>")
  555. self.write_style()
  556. super().render()
  557. self.write("</div>")
  558. return self.elements