grapheme.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. """
  2. Grapheme cluster segmentation following Unicode Standard Annex #29.
  3. This module provides pure-Python implementation of the grapheme cluster boundary algorithm as
  4. defined in UAX #29: Unicode Text Segmentation.
  5. https://www.unicode.org/reports/tr29/
  6. """
  7. from __future__ import annotations
  8. # std imports
  9. from enum import IntEnum
  10. from functools import lru_cache
  11. from typing import TYPE_CHECKING, NamedTuple
  12. # local
  13. from .bisearch import bisearch as _bisearch
  14. from .table_grapheme import (GRAPHEME_L,
  15. GRAPHEME_T,
  16. GRAPHEME_V,
  17. GRAPHEME_LV,
  18. INCB_EXTEND,
  19. INCB_LINKER,
  20. GRAPHEME_LVT,
  21. INCB_CONSONANT,
  22. GRAPHEME_EXTEND,
  23. GRAPHEME_CONTROL,
  24. GRAPHEME_PREPEND,
  25. GRAPHEME_SPACINGMARK,
  26. EXTENDED_PICTOGRAPHIC,
  27. GRAPHEME_REGIONAL_INDICATOR)
  28. if TYPE_CHECKING: # pragma: no cover
  29. # std imports
  30. from collections.abc import Iterator
  31. class GCB(IntEnum):
  32. """Grapheme Cluster Break property values."""
  33. OTHER = 0
  34. CR = 1
  35. LF = 2
  36. CONTROL = 3
  37. EXTEND = 4
  38. ZWJ = 5
  39. REGIONAL_INDICATOR = 6
  40. PREPEND = 7
  41. SPACING_MARK = 8
  42. L = 9
  43. V = 10
  44. T = 11
  45. LV = 12
  46. LVT = 13
  47. @lru_cache(maxsize=1000)
  48. def _grapheme_cluster_break(ucs: int) -> GCB:
  49. # pylint: disable=too-many-branches,too-complex
  50. """Return the Grapheme_Cluster_Break property for a codepoint."""
  51. # Single codepoint matches
  52. if ucs == 0x000d:
  53. return GCB.CR
  54. if ucs == 0x000a:
  55. return GCB.LF
  56. if ucs == 0x200d:
  57. return GCB.ZWJ
  58. # Matching by codepoint ranges, requiring binary search
  59. if _bisearch(ucs, GRAPHEME_CONTROL):
  60. return GCB.CONTROL
  61. if _bisearch(ucs, GRAPHEME_EXTEND):
  62. return GCB.EXTEND
  63. if _bisearch(ucs, GRAPHEME_REGIONAL_INDICATOR):
  64. return GCB.REGIONAL_INDICATOR
  65. if _bisearch(ucs, GRAPHEME_PREPEND):
  66. return GCB.PREPEND
  67. if _bisearch(ucs, GRAPHEME_SPACINGMARK):
  68. return GCB.SPACING_MARK
  69. if _bisearch(ucs, GRAPHEME_L):
  70. return GCB.L
  71. if _bisearch(ucs, GRAPHEME_V):
  72. return GCB.V
  73. if _bisearch(ucs, GRAPHEME_T):
  74. return GCB.T
  75. if _bisearch(ucs, GRAPHEME_LV):
  76. return GCB.LV
  77. if _bisearch(ucs, GRAPHEME_LVT):
  78. return GCB.LVT
  79. return GCB.OTHER
  80. @lru_cache(maxsize=512)
  81. def _is_extended_pictographic(ucs: int) -> bool:
  82. """Check if codepoint has Extended_Pictographic property."""
  83. return bool(_bisearch(ucs, EXTENDED_PICTOGRAPHIC))
  84. @lru_cache(maxsize=128)
  85. def _is_incb_linker(ucs: int) -> bool:
  86. """Check if codepoint has InCB=Linker property."""
  87. return bool(_bisearch(ucs, INCB_LINKER))
  88. @lru_cache(maxsize=256)
  89. def _is_incb_consonant(ucs: int) -> bool:
  90. """Check if codepoint has InCB=Consonant property."""
  91. return bool(_bisearch(ucs, INCB_CONSONANT))
  92. @lru_cache(maxsize=256)
  93. def _is_incb_extend(ucs: int) -> bool:
  94. """Check if codepoint has InCB=Extend property."""
  95. return bool(_bisearch(ucs, INCB_EXTEND))
  96. class BreakResult(NamedTuple):
  97. """Result of grapheme cluster break decision."""
  98. should_break: bool
  99. ri_count: int
  100. @lru_cache(maxsize=196) # 14 GCB values × 14 = 196 max combinations
  101. def _simple_break_check(prev_gcb: GCB, curr_gcb: GCB) -> BreakResult | None:
  102. """
  103. Check simple GCB-pair-based break rules (cacheable).
  104. Returns BreakResult for rules that can be determined from GCB properties alone, or None if
  105. complex lookback rules (GB9c, GB11) need to be checked.
  106. """
  107. # GB3: CR x LF
  108. if prev_gcb == GCB.CR and curr_gcb == GCB.LF:
  109. return BreakResult(should_break=False, ri_count=0)
  110. # GB4: (Control|CR|LF) ÷
  111. if prev_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):
  112. return BreakResult(should_break=True, ri_count=0)
  113. # GB5: ÷ (Control|CR|LF)
  114. if curr_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):
  115. return BreakResult(should_break=True, ri_count=0)
  116. # GB6: L x (L|V|LV|LVT)
  117. if prev_gcb == GCB.L and curr_gcb in (GCB.L, GCB.V, GCB.LV, GCB.LVT):
  118. return BreakResult(should_break=False, ri_count=0)
  119. # GB7: (LV|V) x (V|T)
  120. if prev_gcb in (GCB.LV, GCB.V) and curr_gcb in (GCB.V, GCB.T):
  121. return BreakResult(should_break=False, ri_count=0)
  122. # GB8: (LVT|T) x T
  123. if prev_gcb in (GCB.LVT, GCB.T) and curr_gcb == GCB.T:
  124. return BreakResult(should_break=False, ri_count=0)
  125. # GB9: x (Extend|ZWJ) - but ZWJ needs GB11 check, so only handle Extend here
  126. if curr_gcb == GCB.EXTEND:
  127. return BreakResult(should_break=False, ri_count=0)
  128. # GB9a: x SpacingMark
  129. if curr_gcb == GCB.SPACING_MARK:
  130. return BreakResult(should_break=False, ri_count=0)
  131. # GB9b: Prepend x
  132. if prev_gcb == GCB.PREPEND:
  133. return BreakResult(should_break=False, ri_count=0)
  134. # GB9c and GB11 need lookback - return None to signal complex check needed
  135. # GB12/13 (RI pairs) need ri_count state - also handled in main function
  136. return None
  137. def _should_break(
  138. prev_gcb: GCB,
  139. curr_gcb: GCB,
  140. text: str,
  141. curr_idx: int,
  142. ri_count: int,
  143. ) -> BreakResult:
  144. # pylint: disable=too-many-branches,too-complex
  145. """
  146. Determine if there should be a grapheme cluster break between prev and curr.
  147. Implements UAX #29 grapheme cluster boundary rules.
  148. """
  149. # Try cached simple rules first
  150. result = _simple_break_check(prev_gcb, curr_gcb)
  151. if result is not None:
  152. return result
  153. # GB9: x ZWJ (not cached because GB11 needs lookback when prev is ZWJ)
  154. if curr_gcb == GCB.ZWJ:
  155. return BreakResult(should_break=False, ri_count=0)
  156. # GB9c: Indic conjunct cluster
  157. # \p{InCB=Consonant} [\p{InCB=Extend}\p{InCB=Linker}]* \p{InCB=Linker}
  158. # [\p{InCB=Extend}\p{InCB=Linker}]* x \p{InCB=Consonant}
  159. curr_ucs = ord(text[curr_idx])
  160. if _is_incb_consonant(curr_ucs):
  161. has_linker = False
  162. i = curr_idx - 1
  163. while i >= 0:
  164. prev_ucs = ord(text[i])
  165. if _is_incb_linker(prev_ucs):
  166. has_linker = True
  167. i -= 1
  168. elif _is_incb_extend(prev_ucs):
  169. i -= 1
  170. elif _is_incb_consonant(prev_ucs):
  171. if has_linker:
  172. return BreakResult(should_break=False, ri_count=0)
  173. break
  174. else:
  175. break
  176. # GB11: ExtPict Extend* ZWJ x ExtPict
  177. if prev_gcb == GCB.ZWJ and _is_extended_pictographic(curr_ucs):
  178. i = curr_idx - 2 # Skip the ZWJ at curr_idx - 1
  179. while i >= 0:
  180. prev_ucs = ord(text[i])
  181. prev_prop = _grapheme_cluster_break(prev_ucs)
  182. if prev_prop == GCB.EXTEND:
  183. i -= 1
  184. elif _is_extended_pictographic(prev_ucs):
  185. return BreakResult(should_break=False, ri_count=0)
  186. else:
  187. break
  188. # GB12/GB13: RI x RI (pair matching)
  189. if prev_gcb == GCB.REGIONAL_INDICATOR and curr_gcb == GCB.REGIONAL_INDICATOR:
  190. if ri_count % 2 == 1:
  191. return BreakResult(should_break=False, ri_count=ri_count + 1)
  192. return BreakResult(should_break=True, ri_count=1)
  193. # GB999: Any ÷ Any
  194. ri_count = 1 if curr_gcb == GCB.REGIONAL_INDICATOR else 0
  195. return BreakResult(should_break=True, ri_count=ri_count)
  196. def iter_graphemes(
  197. unistr: str,
  198. start: int = 0,
  199. end: int | None = None,
  200. ) -> Iterator[str]:
  201. r"""
  202. Iterate over grapheme clusters in a Unicode string.
  203. Grapheme clusters are "user-perceived characters" - what a user would
  204. consider a single character, which may consist of multiple Unicode
  205. codepoints (e.g., a base character with combining marks, emoji sequences).
  206. :param unistr: The Unicode string to segment.
  207. :param start: Starting index (default 0).
  208. :param end: Ending index (default len(unistr)).
  209. :yields: Grapheme cluster substrings.
  210. Example::
  211. >>> list(iter_graphemes('cafe\\u0301'))
  212. ['c', 'a', 'f', 'e\\u0301']
  213. >>> list(iter_graphemes('\\U0001F468\\u200D\\U0001F469\\u200D\\U0001F467'))
  214. ['o', 'k', '\\U0001F468\\u200D\\U0001F469\\u200D\\U0001F467']
  215. >>> list(iter_graphemes('\\U0001F1FA\\U0001F1F8'))
  216. ['o', 'k', '\\U0001F1FA\\U0001F1F8']
  217. .. versionadded:: 0.3.0
  218. """
  219. if not unistr:
  220. return
  221. length = len(unistr)
  222. if end is None:
  223. end = length
  224. if start >= end or start >= length:
  225. return
  226. end = min(end, length)
  227. # Track state for grapheme cluster boundaries
  228. cluster_start = start
  229. ri_count = 0
  230. # Get GCB for first character
  231. prev_gcb = _grapheme_cluster_break(ord(unistr[start]))
  232. # Handle Regional Indicator count initialization
  233. if prev_gcb == GCB.REGIONAL_INDICATOR:
  234. ri_count = 1
  235. for idx in range(start + 1, end):
  236. curr_gcb = _grapheme_cluster_break(ord(unistr[idx]))
  237. result = _should_break(prev_gcb, curr_gcb, unistr, idx, ri_count)
  238. ri_count = result.ri_count
  239. if result.should_break:
  240. yield unistr[cluster_start:idx]
  241. cluster_start = idx
  242. prev_gcb = curr_gcb
  243. # Yield the final cluster
  244. yield unistr[cluster_start:end]