textwrap.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391
  1. """
  2. Sequence-aware text wrapping functions.
  3. This module provides functions for wrapping text that may contain terminal escape sequences, with
  4. proper handling of Unicode grapheme clusters and character display widths.
  5. """
  6. from __future__ import annotations
  7. # std imports
  8. import textwrap
  9. from typing import TYPE_CHECKING
  10. # local
  11. from .wcwidth import width as _width
  12. from .wcwidth import iter_sequences
  13. from .grapheme import iter_graphemes
  14. from .escape_sequences import ZERO_WIDTH_PATTERN
  15. if TYPE_CHECKING: # pragma: no cover
  16. from typing import Any, Literal
  17. class SequenceTextWrapper(textwrap.TextWrapper):
  18. """
  19. Sequence-aware text wrapper extending :class:`textwrap.TextWrapper`.
  20. This wrapper properly handles terminal escape sequences and Unicode grapheme clusters when
  21. calculating text width for wrapping.
  22. This implementation is based on the SequenceTextWrapper from the 'blessed' library, with
  23. contributions from Avram Lubkin and grayjk.
  24. The key difference from the blessed implementation is the addition of grapheme cluster support
  25. via :func:`~.iter_graphemes`, providing width calculation for ZWJ emoji sequences, VS-16 emojis
  26. and variations, regional indicator flags, and combining characters.
  27. """
  28. def __init__(self, width: int = 70, *,
  29. control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
  30. tabsize: int = 8,
  31. ambiguous_width: int = 1,
  32. **kwargs: Any) -> None:
  33. """
  34. Initialize the wrapper.
  35. :param width: Maximum line width in display cells.
  36. :param control_codes: How to handle control sequences (see :func:`~.width`).
  37. :param tabsize: Tab stop width for tab expansion.
  38. :param ambiguous_width: Width to use for East Asian Ambiguous (A) characters.
  39. :param kwargs: Additional arguments passed to :class:`textwrap.TextWrapper`.
  40. """
  41. super().__init__(width=width, **kwargs)
  42. self.control_codes = control_codes
  43. self.tabsize = tabsize
  44. self.ambiguous_width = ambiguous_width
  45. def _width(self, text: str) -> int:
  46. """Measure text width accounting for sequences."""
  47. return _width(text, control_codes=self.control_codes, tabsize=self.tabsize,
  48. ambiguous_width=self.ambiguous_width)
  49. def _strip_sequences(self, text: str) -> str:
  50. """Strip all terminal sequences from text."""
  51. result = []
  52. for segment, is_seq in iter_sequences(text):
  53. if not is_seq:
  54. result.append(segment)
  55. return ''.join(result)
  56. def _extract_sequences(self, text: str) -> str:
  57. """Extract only terminal sequences from text."""
  58. result = []
  59. for segment, is_seq in iter_sequences(text):
  60. if is_seq:
  61. result.append(segment)
  62. return ''.join(result)
  63. def _split(self, text: str) -> list[str]: # pylint: disable=too-many-locals
  64. """
  65. Sequence-aware variant of :meth:`textwrap.TextWrapper._split`.
  66. This method ensures that terminal escape sequences don't interfere with the text splitting
  67. logic, particularly for hyphen-based word breaking. It builds a position mapping from
  68. stripped text to original text, calls the parent's _split on stripped text, then maps chunks
  69. back.
  70. """
  71. # pylint: disable=too-many-locals,too-many-branches
  72. # Build a mapping from stripped text positions to original text positions.
  73. # We track where each character ENDS so that sequences between characters
  74. # attach to the following text (not preceding text). This ensures sequences
  75. # aren't lost when whitespace is dropped.
  76. #
  77. # char_end[i] = position in original text right after the i-th stripped char
  78. char_end: list[int] = []
  79. stripped_text = ''
  80. original_pos = 0
  81. for segment, is_seq in iter_sequences(text):
  82. if not is_seq:
  83. for char in segment:
  84. original_pos += 1
  85. char_end.append(original_pos)
  86. stripped_text += char
  87. else:
  88. # Escape sequences advance position but don't add to stripped text
  89. original_pos += len(segment)
  90. # Add sentinel for final position
  91. char_end.append(original_pos)
  92. # Use parent's _split on the stripped text
  93. # pylint: disable-next=protected-access
  94. stripped_chunks = textwrap.TextWrapper._split(self, stripped_text)
  95. # Handle text that contains only sequences (no visible characters).
  96. # Return the sequences as a single chunk to preserve them.
  97. if not stripped_chunks and text:
  98. return [text]
  99. # Map the chunks back to the original text with sequences
  100. result: list[str] = []
  101. stripped_pos = 0
  102. num_chunks = len(stripped_chunks)
  103. for idx, chunk in enumerate(stripped_chunks):
  104. chunk_len = len(chunk)
  105. # Start is where previous character ended (or 0 for first chunk)
  106. start_orig = 0 if stripped_pos == 0 else char_end[stripped_pos - 1]
  107. # End is where next character starts. For last chunk, use sentinel
  108. # to include any trailing sequences.
  109. if idx == num_chunks - 1:
  110. end_orig = char_end[-1] # sentinel includes trailing sequences
  111. else:
  112. end_orig = char_end[stripped_pos + chunk_len - 1]
  113. # Extract the corresponding portion from the original text
  114. result.append(text[start_orig:end_orig])
  115. stripped_pos += chunk_len
  116. return result
  117. def _wrap_chunks(self, chunks: list[str]) -> list[str]: # pylint: disable=too-many-branches
  118. """
  119. Wrap chunks into lines using sequence-aware width.
  120. Override TextWrapper._wrap_chunks to use _width instead of len. Follows stdlib's algorithm:
  121. greedily fill lines, handle long words.
  122. """
  123. # pylint: disable=too-many-branches
  124. if not chunks:
  125. return []
  126. lines: list[str] = []
  127. is_first_line = True
  128. # Arrange in reverse order so items can be efficiently popped
  129. chunks = list(reversed(chunks))
  130. while chunks:
  131. current_line: list[str] = []
  132. current_width = 0
  133. # Get the indent and available width for current line
  134. indent = self.initial_indent if is_first_line else self.subsequent_indent
  135. line_width = self.width - self._width(indent)
  136. # Drop leading whitespace (except at very start)
  137. # When dropping, transfer any sequences to the next chunk.
  138. # Only drop if there's actual whitespace text, not if it's only sequences.
  139. stripped = self._strip_sequences(chunks[-1])
  140. if self.drop_whitespace and lines and stripped and not stripped.strip():
  141. sequences = self._extract_sequences(chunks[-1])
  142. del chunks[-1]
  143. if sequences and chunks:
  144. chunks[-1] = sequences + chunks[-1]
  145. # Greedily add chunks that fit
  146. while chunks:
  147. chunk = chunks[-1]
  148. chunk_width = self._width(chunk)
  149. if current_width + chunk_width <= line_width:
  150. current_line.append(chunks.pop())
  151. current_width += chunk_width
  152. else:
  153. break
  154. # Handle chunk that's too long for any line
  155. if chunks and self._width(chunks[-1]) > line_width:
  156. self._handle_long_word(
  157. chunks, current_line, current_width, line_width
  158. )
  159. current_width = self._width(''.join(current_line))
  160. # Remove any empty chunks left by _handle_long_word
  161. while chunks and not chunks[-1]:
  162. del chunks[-1]
  163. # Drop trailing whitespace
  164. # When dropping, transfer any sequences to the previous chunk.
  165. # Only drop if there's actual whitespace text, not if it's only sequences.
  166. stripped_last = self._strip_sequences(current_line[-1]) if current_line else ''
  167. if (self.drop_whitespace and current_line and
  168. stripped_last and not stripped_last.strip()):
  169. sequences = self._extract_sequences(current_line[-1])
  170. current_width -= self._width(current_line[-1])
  171. del current_line[-1]
  172. if sequences and current_line:
  173. current_line[-1] = current_line[-1] + sequences
  174. if current_line:
  175. line_content = ''.join(current_line)
  176. # Strip trailing whitespace when drop_whitespace is enabled
  177. # (matches CPython #140627 fix behavior)
  178. if self.drop_whitespace:
  179. line_content = line_content.rstrip()
  180. lines.append(indent + line_content)
  181. is_first_line = False
  182. return lines
  183. def _handle_long_word(self, reversed_chunks: list[str],
  184. cur_line: list[str], cur_len: int,
  185. width: int) -> None:
  186. """
  187. Sequence-aware :meth:`textwrap.TextWrapper._handle_long_word`.
  188. This method ensures that word boundaries are not broken mid-sequence, and respects grapheme
  189. cluster boundaries when breaking long words.
  190. """
  191. if width < 1:
  192. space_left = 1
  193. else:
  194. space_left = width - cur_len
  195. if self.break_long_words:
  196. chunk = reversed_chunks[-1]
  197. break_at_hyphen = False
  198. hyphen_end = 0
  199. # Handle break_on_hyphens: find last hyphen within space_left
  200. if self.break_on_hyphens:
  201. # Strip sequences to find hyphen in logical text
  202. stripped = self._strip_sequences(chunk)
  203. if len(stripped) > space_left:
  204. # Find last hyphen in the portion that fits
  205. hyphen_pos = stripped.rfind('-', 0, space_left)
  206. if hyphen_pos > 0 and any(c != '-' for c in stripped[:hyphen_pos]):
  207. # Map back to original position including sequences
  208. hyphen_end = self._map_stripped_pos_to_original(chunk, hyphen_pos + 1)
  209. break_at_hyphen = True
  210. # Break at grapheme boundaries to avoid splitting multi-codepoint characters
  211. if break_at_hyphen:
  212. actual_end = hyphen_end
  213. else:
  214. actual_end = self._find_break_position(chunk, space_left)
  215. # If no progress possible (e.g., wide char exceeds line width),
  216. # force at least one grapheme to avoid infinite loop.
  217. # Only force when cur_line is empty; if line has content,
  218. # appending nothing is safe and the line will be committed.
  219. if actual_end == 0 and not cur_line:
  220. actual_end = self._find_first_grapheme_end(chunk)
  221. cur_line.append(chunk[:actual_end])
  222. reversed_chunks[-1] = chunk[actual_end:]
  223. elif not cur_line:
  224. cur_line.append(reversed_chunks.pop())
  225. def _map_stripped_pos_to_original(self, text: str, stripped_pos: int) -> int:
  226. """Map a position in stripped text back to original text position."""
  227. stripped_idx = 0
  228. original_idx = 0
  229. for segment, is_seq in iter_sequences(text):
  230. if is_seq:
  231. original_idx += len(segment)
  232. elif stripped_idx + len(segment) > stripped_pos:
  233. # Position is within this segment
  234. return original_idx + (stripped_pos - stripped_idx)
  235. else:
  236. stripped_idx += len(segment)
  237. original_idx += len(segment)
  238. # Caller guarantees stripped_pos < total stripped chars, so we always
  239. # return from within the loop. This line satisfies the type checker.
  240. return original_idx # pragma: no cover
  241. def _find_break_position(self, text: str, max_width: int) -> int:
  242. """Find string index in text that fits within max_width cells."""
  243. idx = 0
  244. width_so_far = 0
  245. while idx < len(text):
  246. char = text[idx]
  247. # Skip escape sequences (they don't add width)
  248. if char == '\x1b':
  249. match = ZERO_WIDTH_PATTERN.match(text, idx)
  250. if match:
  251. idx = match.end()
  252. continue
  253. # Get grapheme
  254. grapheme = next(iter_graphemes(text[idx:]))
  255. grapheme_width = self._width(grapheme)
  256. if width_so_far + grapheme_width > max_width:
  257. return idx # Found break point
  258. width_so_far += grapheme_width
  259. idx += len(grapheme)
  260. # Caller guarantees chunk_width > max_width, so a grapheme always
  261. # exceeds and we return from within the loop. Type checker requires this.
  262. return idx # pragma: no cover
  263. def _find_first_grapheme_end(self, text: str) -> int:
  264. """Find the end position of the first grapheme."""
  265. return len(next(iter_graphemes(text)))
  266. def wrap(text: str, width: int = 70, *,
  267. control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
  268. tabsize: int = 8,
  269. ambiguous_width: int = 1,
  270. initial_indent: str = '',
  271. subsequent_indent: str = '',
  272. break_long_words: bool = True,
  273. break_on_hyphens: bool = True) -> list[str]:
  274. r"""
  275. Wrap text to fit within given width, returning a list of wrapped lines.
  276. Like :func:`textwrap.wrap`, but measures width in display cells rather than
  277. characters, correctly handling wide characters, combining marks, and terminal
  278. escape sequences.
  279. :param text: Text to wrap, may contain terminal sequences.
  280. :param width: Maximum line width in display cells.
  281. :param control_codes: How to handle terminal sequences (see :func:`~.width`).
  282. :param tabsize: Tab stop width for tab expansion.
  283. :param ambiguous_width: Width to use for East Asian Ambiguous (A)
  284. characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
  285. :param initial_indent: String prepended to first line.
  286. :param subsequent_indent: String prepended to subsequent lines.
  287. :param break_long_words: If True, break words longer than width.
  288. :param break_on_hyphens: If True, allow breaking at hyphens.
  289. :returns: List of wrapped lines without trailing newlines.
  290. Like :func:`textwrap.wrap`, newlines in the input text are treated as
  291. whitespace and collapsed. To preserve paragraph breaks, wrap each
  292. paragraph separately::
  293. >>> text = 'First line.\\nSecond line.'
  294. >>> wrap(text, 40) # newline collapsed to space
  295. ['First line. Second line.']
  296. >>> [line for para in text.split('\\n')
  297. ... for line in (wrap(para, 40) if para else [''])]
  298. ['First line.', 'Second line.']
  299. .. seealso::
  300. :func:`textwrap.wrap`, :class:`textwrap.TextWrapper`
  301. Standard library text wrapping (character-based).
  302. :class:`.SequenceTextWrapper`
  303. Class interface for advanced wrapping options.
  304. .. versionadded:: 0.3.0
  305. Example::
  306. >>> from wcwidth import wrap
  307. >>> wrap('hello world', 5)
  308. ['hello', 'world']
  309. >>> wrap('中文字符', 4) # CJK characters (2 cells each)
  310. ['中文', '字符']
  311. """
  312. wrapper = SequenceTextWrapper(
  313. width=width,
  314. control_codes=control_codes,
  315. tabsize=tabsize,
  316. ambiguous_width=ambiguous_width,
  317. initial_indent=initial_indent,
  318. subsequent_indent=subsequent_indent,
  319. break_long_words=break_long_words,
  320. break_on_hyphens=break_on_hyphens,
  321. )
  322. return wrapper.wrap(text)