# wcwidth.py
  1. """
  2. This is a python implementation of wcwidth() and wcswidth().
  3. https://github.com/jquast/wcwidth
  4. from Markus Kuhn's C code, retrieved from:
  5. http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
  6. This is an implementation of wcwidth() and wcswidth() (defined in
  7. IEEE Std 1003.1-2001) for Unicode.
  8. http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
  9. http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
  10. In fixed-width output devices, Latin characters all occupy a single
  11. "cell" position of equal width, whereas ideographic CJK characters
  12. occupy two such cells. Interoperability between terminal-line
  13. applications and (teletype-style) character terminals using the
  14. UTF-8 encoding requires agreement on which character should advance
  15. the cursor by how many cell positions. No established formal
  16. standards exist at present on which Unicode character shall occupy
  17. how many cell positions on character terminals. These routines are
  18. a first attempt at defining such behavior based on simple rules
  19. applied to data provided by the Unicode Consortium.
  20. For some graphical characters, the Unicode standard explicitly
  21. defines a character-cell width via the definition of the East Asian
  22. FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
  23. In all these cases, there is no ambiguity about which width a
  24. terminal shall use. For characters in the East Asian Ambiguous (A)
  25. class, the width choice depends purely on a preference of backward
  26. compatibility with either historic CJK or Western practice.
  27. Choosing single-width for these characters is easy to justify as
  28. the appropriate long-term solution, as the CJK practice of
  29. displaying these characters as double-width comes from historic
  30. implementation simplicity (8-bit encoded characters were displayed
  31. single-width and 16-bit ones double-width, even for Greek,
  32. Cyrillic, etc.) and not any typographic considerations.
  33. Much less clear is the choice of width for the Not East Asian
  34. (Neutral) class. Existing practice does not dictate a width for any
  35. of these characters. It would nevertheless make sense
  36. typographically to allocate two character cells to characters such
  37. as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
  38. represented adequately with a single-width glyph. The following
  39. routines at present merely assign a single-cell width to all
  40. neutral characters, in the interest of simplicity. This is not
  41. entirely satisfactory and should be reconsidered before
  42. establishing a formal standard in this area. At the moment, the
  43. decision which Not East Asian (Neutral) characters should be
  44. represented by double-width glyphs cannot yet be answered by
  45. applying a simple rule from the Unicode database content. Setting
  46. up a proper standard for the behavior of UTF-8 character terminals
  47. will require a careful analysis not only of each Unicode character,
  48. but also of each presentation form, something the author of these
  49. routines has so far avoided doing.
  50. http://www.unicode.org/unicode/reports/tr11/
  51. Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
  52. """
  53. # std imports
  54. import os
  55. import warnings
  56. from functools import lru_cache
  57. # local
  58. from .table_vs16 import VS16_NARROW_TO_WIDE
  59. from .table_wide import WIDE_EASTASIAN
  60. from .table_zero import ZERO_WIDTH
  61. from .unicode_versions import list_versions
  62. def _bisearch(ucs, table):
  63. """
  64. Auxiliary function for binary search in interval table.
  65. :arg int ucs: Ordinal value of unicode character.
  66. :arg list table: List of starting and ending ranges of ordinal values,
  67. in form of ``[(start, end), ...]``.
  68. :rtype: int
  69. :returns: 1 if ordinal value ucs is found within lookup table, else 0.
  70. """
  71. lbound = 0
  72. ubound = len(table) - 1
  73. if ucs < table[0][0] or ucs > table[ubound][1]:
  74. return 0
  75. while ubound >= lbound:
  76. mid = (lbound + ubound) // 2
  77. if ucs > table[mid][1]:
  78. lbound = mid + 1
  79. elif ucs < table[mid][0]:
  80. ubound = mid - 1
  81. else:
  82. return 1
  83. return 0
  84. @lru_cache(maxsize=1000)
  85. def wcwidth(wc, unicode_version='auto'):
  86. r"""
  87. Given one Unicode character, return its printable length on a terminal.
  88. :param str wc: A single Unicode character.
  89. :param str unicode_version: A Unicode version number, such as
  90. ``'6.0.0'``. A list of version levels suported by wcwidth
  91. is returned by :func:`list_versions`.
  92. Any version string may be specified without error -- the nearest
  93. matching version is selected. When ``latest`` (default), the
  94. highest Unicode version level is used.
  95. :return: The width, in cells, necessary to display the character of
  96. Unicode string character, ``wc``. Returns 0 if the ``wc`` argument has
  97. no printable effect on a terminal (such as NUL '\0'), -1 if ``wc`` is
  98. not printable, or has an indeterminate effect on the terminal, such as
  99. a control character. Otherwise, the number of column positions the
  100. character occupies on a graphic terminal (1 or 2) is returned.
  101. :rtype: int
  102. See :ref:`Specification` for details of cell measurement.
  103. """
  104. ucs = ord(wc) if wc else 0
  105. # small optimization: early return of 1 for printable ASCII, this provides
  106. # approximately 40% performance improvement for mostly-ascii documents, with
  107. # less than 1% impact to others.
  108. if 32 <= ucs < 0x7f:
  109. return 1
  110. # C0/C1 control characters are -1 for compatibility with POSIX-like calls
  111. if ucs and ucs < 32 or 0x07F <= ucs < 0x0A0:
  112. return -1
  113. _unicode_version = _wcmatch_version(unicode_version)
  114. # Zero width
  115. if _bisearch(ucs, ZERO_WIDTH[_unicode_version]):
  116. return 0
  117. # 1 or 2 width
  118. return 1 + _bisearch(ucs, WIDE_EASTASIAN[_unicode_version])
  119. def wcswidth(pwcs, n=None, unicode_version='auto'):
  120. """
  121. Given a unicode string, return its printable length on a terminal.
  122. :param str pwcs: Measure width of given unicode string.
  123. :param int n: When ``n`` is None (default), return the length of the entire
  124. string, otherwise only the first ``n`` characters are measured. This
  125. argument exists only for compatibility with the C POSIX function
  126. signature. It is suggested instead to use python's string slicing
  127. capability, ``wcswidth(pwcs[:n])``
  128. :param str unicode_version: An explicit definition of the unicode version
  129. level to use for determination, may be ``auto`` (default), which uses
  130. the Environment Variable, ``UNICODE_VERSION`` if defined, or the latest
  131. available unicode version, otherwise.
  132. :rtype: int
  133. :returns: The width, in cells, needed to display the first ``n`` characters
  134. of the unicode string ``pwcs``. Returns ``-1`` for C0 and C1 control
  135. characters!
  136. See :ref:`Specification` for details of cell measurement.
  137. """
  138. # this 'n' argument is a holdover for POSIX function
  139. _unicode_version = None
  140. end = len(pwcs) if n is None else n
  141. width = 0
  142. idx = 0
  143. last_measured_char = None
  144. while idx < end:
  145. char = pwcs[idx]
  146. if char == '\u200D':
  147. # Zero Width Joiner, do not measure this or next character
  148. idx += 2
  149. continue
  150. if char == '\uFE0F' and last_measured_char:
  151. # on variation selector 16 (VS16) following another character,
  152. # conditionally add '1' to the measured width if that character is
  153. # known to be converted from narrow to wide by the VS16 character.
  154. if _unicode_version is None:
  155. _unicode_version = _wcversion_value(_wcmatch_version(unicode_version))
  156. if _unicode_version >= (9, 0, 0):
  157. width += _bisearch(ord(last_measured_char), VS16_NARROW_TO_WIDE["9.0.0"])
  158. last_measured_char = None
  159. idx += 1
  160. continue
  161. # measure character at current index
  162. wcw = wcwidth(char, unicode_version)
  163. if wcw < 0:
  164. # early return -1 on C0 and C1 control characters
  165. return wcw
  166. if wcw > 0:
  167. # track last character measured to contain a cell, so that
  168. # subsequent VS-16 modifiers may be understood
  169. last_measured_char = char
  170. width += wcw
  171. idx += 1
  172. return width
  173. @lru_cache(maxsize=128)
  174. def _wcversion_value(ver_string):
  175. """
  176. Integer-mapped value of given dotted version string.
  177. :param str ver_string: Unicode version string, of form ``n.n.n``.
  178. :rtype: tuple(int)
  179. :returns: tuple of digit tuples, ``tuple(int, [...])``.
  180. """
  181. retval = tuple(map(int, (ver_string.split('.'))))
  182. return retval
  183. @lru_cache(maxsize=8)
  184. def _wcmatch_version(given_version):
  185. """
  186. Return nearest matching supported Unicode version level.
  187. If an exact match is not determined, the nearest lowest version level is
  188. returned after a warning is emitted. For example, given supported levels
  189. ``4.1.0`` and ``5.0.0``, and a version string of ``4.9.9``, then ``4.1.0``
  190. is selected and returned:
  191. >>> _wcmatch_version('4.9.9')
  192. '4.1.0'
  193. >>> _wcmatch_version('8.0')
  194. '8.0.0'
  195. >>> _wcmatch_version('1')
  196. '4.1.0'
  197. :param str given_version: given version for compare, may be ``auto``
  198. (default), to select Unicode Version from Environment Variable,
  199. ``UNICODE_VERSION``. If the environment variable is not set, then the
  200. latest is used.
  201. :rtype: str
  202. :returns: unicode string.
  203. """
  204. # Design note: the choice to return the same type that is given certainly
  205. # complicates it for python 2 str-type, but allows us to define an api that
  206. # uses 'string-type' for unicode version level definitions, so all of our
  207. # example code works with all versions of python.
  208. #
  209. # That, along with the string-to-numeric and comparisons of earliest,
  210. # latest, matching, or nearest, greatly complicates this function.
  211. # Performance is somewhat curbed by memoization.
  212. unicode_versions = list_versions()
  213. latest_version = unicode_versions[-1]
  214. if given_version == 'auto':
  215. given_version = os.environ.get(
  216. 'UNICODE_VERSION',
  217. 'latest')
  218. if given_version == 'latest':
  219. # default match, when given as 'latest', use the most latest unicode
  220. # version specification level supported.
  221. return latest_version
  222. if given_version in unicode_versions:
  223. # exact match, downstream has specified an explicit matching version
  224. # matching any value of list_versions().
  225. return given_version
  226. # The user's version is not supported by ours. We return the newest unicode
  227. # version level that we support below their given value.
  228. try:
  229. cmp_given = _wcversion_value(given_version)
  230. except ValueError:
  231. # submitted value raises ValueError in int(), warn and use latest.
  232. warnings.warn("UNICODE_VERSION value, {given_version!r}, is invalid. "
  233. "Value should be in form of `integer[.]+', the latest "
  234. "supported unicode version {latest_version!r} has been "
  235. "inferred.".format(given_version=given_version,
  236. latest_version=latest_version))
  237. return latest_version
  238. # given version is less than any available version, return earliest
  239. # version.
  240. earliest_version = unicode_versions[0]
  241. cmp_earliest_version = _wcversion_value(earliest_version)
  242. if cmp_given <= cmp_earliest_version:
  243. # this probably isn't what you wanted, the oldest wcwidth.c you will
  244. # find in the wild is likely version 5 or 6, which we both support,
  245. # but it's better than not saying anything at all.
  246. warnings.warn("UNICODE_VERSION value, {given_version!r}, is lower "
  247. "than any available unicode version. Returning lowest "
  248. "version level, {earliest_version!r}".format(
  249. given_version=given_version,
  250. earliest_version=earliest_version))
  251. return earliest_version
  252. # create list of versions which are less than our equal to given version,
  253. # and return the tail value, which is the highest level we may support,
  254. # or the latest value we support, when completely unmatched or higher
  255. # than any supported version.
  256. #
  257. # function will never complete, always returns.
  258. for idx, unicode_version in enumerate(unicode_versions):
  259. # look ahead to next value
  260. try:
  261. cmp_next_version = _wcversion_value(unicode_versions[idx + 1])
  262. except IndexError:
  263. # at end of list, return latest version
  264. return latest_version
  265. # Maybe our given version has less parts, as in tuple(8, 0), than the
  266. # next compare version tuple(8, 0, 0). Test for an exact match by
  267. # comparison of only the leading dotted piece(s): (8, 0) == (8, 0).
  268. if cmp_given == cmp_next_version[:len(cmp_given)]:
  269. return unicode_versions[idx + 1]
  270. # Or, if any next value is greater than our given support level
  271. # version, return the current value in index. Even though it must
  272. # be less than the given value, it's our closest possible match. That
  273. # is, 4.1 is returned for given 4.9.9, where 4.1 and 5.0 are available.
  274. if cmp_next_version > cmp_given:
  275. return unicode_version
  276. assert False, ("Code path unreachable", given_version, unicode_versions) # pragma: no cover