strings.py 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813
  1. """
  2. This module contains a set of functions for vectorized string
  3. operations.
  4. """
  5. import functools
  6. import sys
  7. import numpy as np
  8. from numpy import (
  9. add,
  10. equal,
  11. greater,
  12. greater_equal,
  13. less,
  14. less_equal,
  15. multiply as _multiply_ufunc,
  16. not_equal,
  17. )
  18. from numpy._core.multiarray import _vec_string
  19. from numpy._core.overrides import array_function_dispatch, set_module
  20. from numpy._core.umath import (
  21. _center,
  22. _expandtabs,
  23. _expandtabs_length,
  24. _ljust,
  25. _lstrip_chars,
  26. _lstrip_whitespace,
  27. _partition,
  28. _partition_index,
  29. _replace,
  30. _rjust,
  31. _rpartition,
  32. _rpartition_index,
  33. _rstrip_chars,
  34. _rstrip_whitespace,
  35. _slice,
  36. _strip_chars,
  37. _strip_whitespace,
  38. _zfill,
  39. count as _count_ufunc,
  40. endswith as _endswith_ufunc,
  41. find as _find_ufunc,
  42. index as _index_ufunc,
  43. isalnum,
  44. isalpha,
  45. isdecimal,
  46. isdigit,
  47. islower,
  48. isnumeric,
  49. isspace,
  50. istitle,
  51. isupper,
  52. rfind as _rfind_ufunc,
  53. rindex as _rindex_ufunc,
  54. startswith as _startswith_ufunc,
  55. str_len,
  56. )
  57. def _override___module__():
  58. for ufunc in [
  59. isalnum, isalpha, isdecimal, isdigit, islower, isnumeric, isspace,
  60. istitle, isupper, str_len,
  61. ]:
  62. ufunc.__module__ = "numpy.strings"
  63. ufunc.__qualname__ = ufunc.__name__
# Apply the ``__module__``/``__qualname__`` overrides once, at import time.
_override___module__()


__all__ = [
    # UFuncs
    "equal", "not_equal", "less", "less_equal", "greater", "greater_equal",
    "add", "multiply", "isalpha", "isdigit", "isspace", "isalnum", "islower",
    "isupper", "istitle", "isdecimal", "isnumeric", "str_len", "find",
    "rfind", "index", "rindex", "count", "startswith", "endswith", "lstrip",
    "rstrip", "strip", "replace", "expandtabs", "center", "ljust", "rjust",
    "zfill", "partition", "rpartition", "slice",

    # _vec_string - Will gradually become ufuncs as well
    "upper", "lower", "swapcase", "capitalize", "title",

    # _vec_string - Will probably not become ufuncs
    "mod", "decode", "encode", "translate",

    # Removed from namespace until behavior has been crystallized
    # "join", "split", "rsplit", "splitlines",
]


# Largest int64 value. The search functions below substitute it for
# ``end=None`` so that "no upper bound" reaches the ufuncs as an integer.
MAX = np.iinfo(np.int64).max

# Shadow the imported decorator factory with a partial that pre-binds
# ``module='numpy.strings'``; every ``@array_function_dispatch`` use below
# this line goes through the partial.
array_function_dispatch = functools.partial(
    array_function_dispatch, module='numpy.strings')
  83. def _get_num_chars(a):
  84. """
  85. Helper function that returns the number of characters per field in
  86. a string or unicode array. This is to abstract out the fact that
  87. for a unicode array this is itemsize / 4.
  88. """
  89. if issubclass(a.dtype.type, np.str_):
  90. return a.itemsize // 4
  91. return a.itemsize
  92. def _to_bytes_or_str_array(result, output_dtype_like):
  93. """
  94. Helper function to cast a result back into an array
  95. with the appropriate dtype if an object array must be used
  96. as an intermediary.
  97. """
  98. output_dtype_like = np.asarray(output_dtype_like)
  99. if result.size == 0:
  100. # Calling asarray & tolist in an empty array would result
  101. # in losing shape information
  102. return result.astype(output_dtype_like.dtype)
  103. ret = np.asarray(result.tolist())
  104. if isinstance(output_dtype_like.dtype, np.dtypes.StringDType):
  105. return ret.astype(type(output_dtype_like.dtype))
  106. return ret.astype(type(output_dtype_like.dtype)(_get_num_chars(ret)))
  107. def _clean_args(*args):
  108. """
  109. Helper function for delegating arguments to Python string
  110. functions.
  111. Many of the Python string operations that have optional arguments
  112. do not use 'None' to indicate a default value. In these cases,
  113. we need to remove all None arguments, and those following them.
  114. """
  115. newargs = []
  116. for chk in args:
  117. if chk is None:
  118. break
  119. newargs.append(chk)
  120. return newargs
  121. def _multiply_dispatcher(a, i):
  122. return (a,)
  123. @set_module("numpy.strings")
  124. @array_function_dispatch(_multiply_dispatcher)
  125. def multiply(a, i):
  126. """
  127. Return (a * i), that is string multiple concatenation,
  128. element-wise.
  129. Values in ``i`` of less than 0 are treated as 0 (which yields an
  130. empty string).
  131. Parameters
  132. ----------
  133. a : array_like, with ``StringDType``, ``bytes_`` or ``str_`` dtype
  134. i : array_like, with any integer dtype
  135. Returns
  136. -------
  137. out : ndarray
  138. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  139. depending on input types
  140. Examples
  141. --------
  142. >>> import numpy as np
  143. >>> a = np.array(["a", "b", "c"])
  144. >>> np.strings.multiply(a, 3)
  145. array(['aaa', 'bbb', 'ccc'], dtype='<U3')
  146. >>> i = np.array([1, 2, 3])
  147. >>> np.strings.multiply(a, i)
  148. array(['a', 'bb', 'ccc'], dtype='<U3')
  149. >>> np.strings.multiply(np.array(['a']), i)
  150. array(['a', 'aa', 'aaa'], dtype='<U3')
  151. >>> a = np.array(['a', 'b', 'c', 'd', 'e', 'f']).reshape((2, 3))
  152. >>> np.strings.multiply(a, 3)
  153. array([['aaa', 'bbb', 'ccc'],
  154. ['ddd', 'eee', 'fff']], dtype='<U3')
  155. >>> np.strings.multiply(a, i)
  156. array([['a', 'bb', 'ccc'],
  157. ['d', 'ee', 'fff']], dtype='<U3')
  158. """
  159. a = np.asanyarray(a)
  160. i = np.asanyarray(i)
  161. if not np.issubdtype(i.dtype, np.integer):
  162. raise TypeError(f"unsupported type {i.dtype} for operand 'i'")
  163. i = np.maximum(i, 0)
  164. # delegate to stringdtype loops that also do overflow checking
  165. if a.dtype.char == "T":
  166. return a * i
  167. a_len = str_len(a)
  168. # Ensure we can do a_len * i without overflow.
  169. if np.any(a_len > sys.maxsize / np.maximum(i, 1)):
  170. raise OverflowError("Overflow encountered in string multiply")
  171. buffersizes = a_len * i
  172. out_dtype = f"{a.dtype.char}{buffersizes.max()}"
  173. out = np.empty_like(a, shape=buffersizes.shape, dtype=out_dtype)
  174. return _multiply_ufunc(a, i, out=out)
  175. def _mod_dispatcher(a, values):
  176. return (a, values)
  177. @set_module("numpy.strings")
  178. @array_function_dispatch(_mod_dispatcher)
  179. def mod(a, values):
  180. """
  181. Return (a % i), that is pre-Python 2.6 string formatting
  182. (interpolation), element-wise for a pair of array_likes of str
  183. or unicode.
  184. Parameters
  185. ----------
  186. a : array_like, with `np.bytes_` or `np.str_` dtype
  187. values : array_like of values
  188. These values will be element-wise interpolated into the string.
  189. Returns
  190. -------
  191. out : ndarray
  192. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  193. depending on input types
  194. Examples
  195. --------
  196. >>> import numpy as np
  197. >>> a = np.array(["NumPy is a %s library"])
  198. >>> np.strings.mod(a, values=["Python"])
  199. array(['NumPy is a Python library'], dtype='<U25')
  200. >>> a = np.array([b'%d bytes', b'%d bits'])
  201. >>> values = np.array([8, 64])
  202. >>> np.strings.mod(a, values)
  203. array([b'8 bytes', b'64 bits'], dtype='|S7')
  204. """
  205. return _to_bytes_or_str_array(
  206. _vec_string(a, np.object_, '__mod__', (values,)), a)
  207. @set_module("numpy.strings")
  208. def find(a, sub, start=0, end=None):
  209. """
  210. For each element, return the lowest index in the string where
  211. substring ``sub`` is found, such that ``sub`` is contained in the
  212. range [``start``, ``end``).
  213. Parameters
  214. ----------
  215. a : array_like, with ``StringDType``, ``bytes_`` or ``str_`` dtype
  216. sub : array_like, with `np.bytes_` or `np.str_` dtype
  217. The substring to search for.
  218. start, end : array_like, with any integer dtype
  219. The range to look in, interpreted as in slice notation.
  220. Returns
  221. -------
  222. y : ndarray
  223. Output array of ints
  224. See Also
  225. --------
  226. str.find
  227. Examples
  228. --------
  229. >>> import numpy as np
  230. >>> a = np.array(["NumPy is a Python library"])
  231. >>> np.strings.find(a, "Python")
  232. array([11])
  233. """
  234. end = end if end is not None else MAX
  235. return _find_ufunc(a, sub, start, end)
  236. @set_module("numpy.strings")
  237. def rfind(a, sub, start=0, end=None):
  238. """
  239. For each element, return the highest index in the string where
  240. substring ``sub`` is found, such that ``sub`` is contained in the
  241. range [``start``, ``end``).
  242. Parameters
  243. ----------
  244. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  245. sub : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  246. The substring to search for.
  247. start, end : array_like, with any integer dtype
  248. The range to look in, interpreted as in slice notation.
  249. Returns
  250. -------
  251. y : ndarray
  252. Output array of ints
  253. See Also
  254. --------
  255. str.rfind
  256. Examples
  257. --------
  258. >>> import numpy as np
  259. >>> a = np.array(["Computer Science"])
  260. >>> np.strings.rfind(a, "Science", start=0, end=None)
  261. array([9])
  262. >>> np.strings.rfind(a, "Science", start=0, end=8)
  263. array([-1])
  264. >>> b = np.array(["Computer Science", "Science"])
  265. >>> np.strings.rfind(b, "Science", start=0, end=None)
  266. array([9, 0])
  267. """
  268. end = end if end is not None else MAX
  269. return _rfind_ufunc(a, sub, start, end)
  270. @set_module("numpy.strings")
  271. def index(a, sub, start=0, end=None):
  272. """
  273. Like `find`, but raises :exc:`ValueError` when the substring is not found.
  274. Parameters
  275. ----------
  276. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  277. sub : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  278. start, end : array_like, with any integer dtype, optional
  279. Returns
  280. -------
  281. out : ndarray
  282. Output array of ints.
  283. See Also
  284. --------
  285. find, str.index
  286. Examples
  287. --------
  288. >>> import numpy as np
  289. >>> a = np.array(["Computer Science"])
  290. >>> np.strings.index(a, "Science", start=0, end=None)
  291. array([9])
  292. """
  293. end = end if end is not None else MAX
  294. return _index_ufunc(a, sub, start, end)
  295. @set_module("numpy.strings")
  296. def rindex(a, sub, start=0, end=None):
  297. """
  298. Like `rfind`, but raises :exc:`ValueError` when the substring `sub` is
  299. not found.
  300. Parameters
  301. ----------
  302. a : array-like, with `np.bytes_` or `np.str_` dtype
  303. sub : array-like, with `np.bytes_` or `np.str_` dtype
  304. start, end : array-like, with any integer dtype, optional
  305. Returns
  306. -------
  307. out : ndarray
  308. Output array of ints.
  309. See Also
  310. --------
  311. rfind, str.rindex
  312. Examples
  313. --------
  314. >>> a = np.array(["Computer Science"])
  315. >>> np.strings.rindex(a, "Science", start=0, end=None)
  316. array([9])
  317. """
  318. end = end if end is not None else MAX
  319. return _rindex_ufunc(a, sub, start, end)
  320. @set_module("numpy.strings")
  321. def count(a, sub, start=0, end=None):
  322. """
  323. Returns an array with the number of non-overlapping occurrences of
  324. substring ``sub`` in the range [``start``, ``end``).
  325. Parameters
  326. ----------
  327. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  328. sub : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  329. The substring to search for.
  330. start, end : array_like, with any integer dtype
  331. The range to look in, interpreted as in slice notation.
  332. Returns
  333. -------
  334. y : ndarray
  335. Output array of ints
  336. See Also
  337. --------
  338. str.count
  339. Examples
  340. --------
  341. >>> import numpy as np
  342. >>> c = np.array(['aAaAaA', ' aA ', 'abBABba'])
  343. >>> c
  344. array(['aAaAaA', ' aA ', 'abBABba'], dtype='<U7')
  345. >>> np.strings.count(c, 'A')
  346. array([3, 1, 1])
  347. >>> np.strings.count(c, 'aA')
  348. array([3, 1, 0])
  349. >>> np.strings.count(c, 'A', start=1, end=4)
  350. array([2, 1, 1])
  351. >>> np.strings.count(c, 'A', start=1, end=3)
  352. array([1, 0, 0])
  353. """
  354. end = end if end is not None else MAX
  355. return _count_ufunc(a, sub, start, end)
  356. @set_module("numpy.strings")
  357. def startswith(a, prefix, start=0, end=None):
  358. """
  359. Returns a boolean array which is `True` where the string element
  360. in ``a`` starts with ``prefix``, otherwise `False`.
  361. Parameters
  362. ----------
  363. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  364. prefix : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  365. start, end : array_like, with any integer dtype
  366. With ``start``, test beginning at that position. With ``end``,
  367. stop comparing at that position.
  368. Returns
  369. -------
  370. out : ndarray
  371. Output array of bools
  372. See Also
  373. --------
  374. str.startswith
  375. Examples
  376. --------
  377. >>> import numpy as np
  378. >>> s = np.array(['foo', 'bar'])
  379. >>> s
  380. array(['foo', 'bar'], dtype='<U3')
  381. >>> np.strings.startswith(s, 'fo')
  382. array([True, False])
  383. >>> np.strings.startswith(s, 'o', start=1, end=2)
  384. array([True, False])
  385. """
  386. end = end if end is not None else MAX
  387. return _startswith_ufunc(a, prefix, start, end)
  388. @set_module("numpy.strings")
  389. def endswith(a, suffix, start=0, end=None):
  390. """
  391. Returns a boolean array which is `True` where the string element
  392. in ``a`` ends with ``suffix``, otherwise `False`.
  393. Parameters
  394. ----------
  395. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  396. suffix : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  397. start, end : array_like, with any integer dtype
  398. With ``start``, test beginning at that position. With ``end``,
  399. stop comparing at that position.
  400. Returns
  401. -------
  402. out : ndarray
  403. Output array of bools
  404. See Also
  405. --------
  406. str.endswith
  407. Examples
  408. --------
  409. >>> import numpy as np
  410. >>> s = np.array(['foo', 'bar'])
  411. >>> s
  412. array(['foo', 'bar'], dtype='<U3')
  413. >>> np.strings.endswith(s, 'ar')
  414. array([False, True])
  415. >>> np.strings.endswith(s, 'a', start=1, end=2)
  416. array([False, True])
  417. """
  418. end = end if end is not None else MAX
  419. return _endswith_ufunc(a, suffix, start, end)
  420. def _code_dispatcher(a, encoding=None, errors=None):
  421. return (a,)
  422. @set_module("numpy.strings")
  423. @array_function_dispatch(_code_dispatcher)
  424. def decode(a, encoding=None, errors=None):
  425. r"""
  426. Calls :meth:`bytes.decode` element-wise.
  427. The set of available codecs comes from the Python standard library,
  428. and may be extended at runtime. For more information, see the
  429. :mod:`codecs` module.
  430. Parameters
  431. ----------
  432. a : array_like, with ``bytes_`` dtype
  433. encoding : str, optional
  434. The name of an encoding
  435. errors : str, optional
  436. Specifies how to handle encoding errors
  437. Returns
  438. -------
  439. out : ndarray
  440. See Also
  441. --------
  442. :py:meth:`bytes.decode`
  443. Notes
  444. -----
  445. The type of the result will depend on the encoding specified.
  446. Examples
  447. --------
  448. >>> import numpy as np
  449. >>> c = np.array([b'\x81\xc1\x81\xc1\x81\xc1', b'@@\x81\xc1@@',
  450. ... b'\x81\x82\xc2\xc1\xc2\x82\x81'])
  451. >>> c
  452. array([b'\x81\xc1\x81\xc1\x81\xc1', b'@@\x81\xc1@@',
  453. b'\x81\x82\xc2\xc1\xc2\x82\x81'], dtype='|S7')
  454. >>> np.strings.decode(c, encoding='cp037')
  455. array(['aAaAaA', ' aA ', 'abBABba'], dtype='<U7')
  456. """
  457. return _to_bytes_or_str_array(
  458. _vec_string(a, np.object_, 'decode', _clean_args(encoding, errors)),
  459. np.str_(''))
  460. @set_module("numpy.strings")
  461. @array_function_dispatch(_code_dispatcher)
  462. def encode(a, encoding=None, errors=None):
  463. """
  464. Calls :meth:`str.encode` element-wise.
  465. The set of available codecs comes from the Python standard library,
  466. and may be extended at runtime. For more information, see the
  467. :mod:`codecs` module.
  468. Parameters
  469. ----------
  470. a : array_like, with ``StringDType`` or ``str_`` dtype
  471. encoding : str, optional
  472. The name of an encoding
  473. errors : str, optional
  474. Specifies how to handle encoding errors
  475. Returns
  476. -------
  477. out : ndarray
  478. See Also
  479. --------
  480. str.encode
  481. Notes
  482. -----
  483. The type of the result will depend on the encoding specified.
  484. Examples
  485. --------
  486. >>> import numpy as np
  487. >>> a = np.array(['aAaAaA', ' aA ', 'abBABba'])
  488. >>> np.strings.encode(a, encoding='cp037')
  489. array([b'\x81\xc1\x81\xc1\x81\xc1', b'@@\x81\xc1@@',
  490. b'\x81\x82\xc2\xc1\xc2\x82\x81'], dtype='|S7')
  491. """
  492. return _to_bytes_or_str_array(
  493. _vec_string(a, np.object_, 'encode', _clean_args(encoding, errors)),
  494. np.bytes_(b''))
  495. def _expandtabs_dispatcher(a, tabsize=None):
  496. return (a,)
  497. @set_module("numpy.strings")
  498. @array_function_dispatch(_expandtabs_dispatcher)
  499. def expandtabs(a, tabsize=8):
  500. """
  501. Return a copy of each string element where all tab characters are
  502. replaced by one or more spaces.
  503. Calls :meth:`str.expandtabs` element-wise.
  504. Return a copy of each string element where all tab characters are
  505. replaced by one or more spaces, depending on the current column
  506. and the given `tabsize`. The column number is reset to zero after
  507. each newline occurring in the string. This doesn't understand other
  508. non-printing characters or escape sequences.
  509. Parameters
  510. ----------
  511. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  512. Input array
  513. tabsize : int, optional
  514. Replace tabs with `tabsize` number of spaces. If not given defaults
  515. to 8 spaces.
  516. Returns
  517. -------
  518. out : ndarray
  519. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  520. depending on input type
  521. See Also
  522. --------
  523. str.expandtabs
  524. Examples
  525. --------
  526. >>> import numpy as np
  527. >>> a = np.array(['\t\tHello\tworld'])
  528. >>> np.strings.expandtabs(a, tabsize=4) # doctest: +SKIP
  529. array([' Hello world'], dtype='<U21') # doctest: +SKIP
  530. """
  531. a = np.asanyarray(a)
  532. tabsize = np.asanyarray(tabsize)
  533. if a.dtype.char == "T":
  534. return _expandtabs(a, tabsize)
  535. buffersizes = _expandtabs_length(a, tabsize)
  536. out_dtype = f"{a.dtype.char}{buffersizes.max()}"
  537. out = np.empty_like(a, shape=buffersizes.shape, dtype=out_dtype)
  538. return _expandtabs(a, tabsize, out=out)
  539. def _just_dispatcher(a, width, fillchar=None):
  540. return (a,)
  541. @set_module("numpy.strings")
  542. @array_function_dispatch(_just_dispatcher)
  543. def center(a, width, fillchar=' '):
  544. """
  545. Return a copy of `a` with its elements centered in a string of
  546. length `width`.
  547. Parameters
  548. ----------
  549. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  550. width : array_like, with any integer dtype
  551. The length of the resulting strings, unless ``width < str_len(a)``.
  552. fillchar : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  553. Optional padding character to use (default is space).
  554. Returns
  555. -------
  556. out : ndarray
  557. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  558. depending on input types
  559. See Also
  560. --------
  561. str.center
  562. Notes
  563. -----
  564. While it is possible for ``a`` and ``fillchar`` to have different dtypes,
  565. passing a non-ASCII character in ``fillchar`` when ``a`` is of dtype "S"
  566. is not allowed, and a ``ValueError`` is raised.
  567. Examples
  568. --------
  569. >>> import numpy as np
  570. >>> c = np.array(['a1b2','1b2a','b2a1','2a1b']); c
  571. array(['a1b2', '1b2a', 'b2a1', '2a1b'], dtype='<U4')
  572. >>> np.strings.center(c, width=9)
  573. array([' a1b2 ', ' 1b2a ', ' b2a1 ', ' 2a1b '], dtype='<U9')
  574. >>> np.strings.center(c, width=9, fillchar='*')
  575. array(['***a1b2**', '***1b2a**', '***b2a1**', '***2a1b**'], dtype='<U9')
  576. >>> np.strings.center(c, width=1)
  577. array(['a1b2', '1b2a', 'b2a1', '2a1b'], dtype='<U4')
  578. """
  579. width = np.asanyarray(width)
  580. if not np.issubdtype(width.dtype, np.integer):
  581. raise TypeError(f"unsupported type {width.dtype} for operand 'width'")
  582. a = np.asanyarray(a)
  583. fillchar = np.asanyarray(fillchar)
  584. if np.any(str_len(fillchar) != 1):
  585. raise TypeError(
  586. "The fill character must be exactly one character long")
  587. if np.result_type(a, fillchar).char == "T":
  588. return _center(a, width, fillchar)
  589. fillchar = fillchar.astype(a.dtype, copy=False)
  590. width = np.maximum(str_len(a), width)
  591. out_dtype = f"{a.dtype.char}{width.max()}"
  592. shape = np.broadcast_shapes(a.shape, width.shape, fillchar.shape)
  593. out = np.empty_like(a, shape=shape, dtype=out_dtype)
  594. return _center(a, width, fillchar, out=out)
  595. @set_module("numpy.strings")
  596. @array_function_dispatch(_just_dispatcher)
  597. def ljust(a, width, fillchar=' '):
  598. """
  599. Return an array with the elements of `a` left-justified in a
  600. string of length `width`.
  601. Parameters
  602. ----------
  603. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  604. width : array_like, with any integer dtype
  605. The length of the resulting strings, unless ``width < str_len(a)``.
  606. fillchar : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  607. Optional character to use for padding (default is space).
  608. Returns
  609. -------
  610. out : ndarray
  611. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  612. depending on input types
  613. See Also
  614. --------
  615. str.ljust
  616. Notes
  617. -----
  618. While it is possible for ``a`` and ``fillchar`` to have different dtypes,
  619. passing a non-ASCII character in ``fillchar`` when ``a`` is of dtype "S"
  620. is not allowed, and a ``ValueError`` is raised.
  621. Examples
  622. --------
  623. >>> import numpy as np
  624. >>> c = np.array(['aAaAaA', ' aA ', 'abBABba'])
  625. >>> np.strings.ljust(c, width=3)
  626. array(['aAaAaA', ' aA ', 'abBABba'], dtype='<U7')
  627. >>> np.strings.ljust(c, width=9)
  628. array(['aAaAaA ', ' aA ', 'abBABba '], dtype='<U9')
  629. """
  630. width = np.asanyarray(width)
  631. if not np.issubdtype(width.dtype, np.integer):
  632. raise TypeError(f"unsupported type {width.dtype} for operand 'width'")
  633. a = np.asanyarray(a)
  634. fillchar = np.asanyarray(fillchar)
  635. if np.any(str_len(fillchar) != 1):
  636. raise TypeError(
  637. "The fill character must be exactly one character long")
  638. if np.result_type(a, fillchar).char == "T":
  639. return _ljust(a, width, fillchar)
  640. fillchar = fillchar.astype(a.dtype, copy=False)
  641. width = np.maximum(str_len(a), width)
  642. shape = np.broadcast_shapes(a.shape, width.shape, fillchar.shape)
  643. out_dtype = f"{a.dtype.char}{width.max()}"
  644. out = np.empty_like(a, shape=shape, dtype=out_dtype)
  645. return _ljust(a, width, fillchar, out=out)
  646. @set_module("numpy.strings")
  647. @array_function_dispatch(_just_dispatcher)
  648. def rjust(a, width, fillchar=' '):
  649. """
  650. Return an array with the elements of `a` right-justified in a
  651. string of length `width`.
  652. Parameters
  653. ----------
  654. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  655. width : array_like, with any integer dtype
  656. The length of the resulting strings, unless ``width < str_len(a)``.
  657. fillchar : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  658. Optional padding character to use (default is space).
  659. Returns
  660. -------
  661. out : ndarray
  662. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  663. depending on input types
  664. See Also
  665. --------
  666. str.rjust
  667. Notes
  668. -----
  669. While it is possible for ``a`` and ``fillchar`` to have different dtypes,
  670. passing a non-ASCII character in ``fillchar`` when ``a`` is of dtype "S"
  671. is not allowed, and a ``ValueError`` is raised.
  672. Examples
  673. --------
  674. >>> import numpy as np
  675. >>> a = np.array(['aAaAaA', ' aA ', 'abBABba'])
  676. >>> np.strings.rjust(a, width=3)
  677. array(['aAaAaA', ' aA ', 'abBABba'], dtype='<U7')
  678. >>> np.strings.rjust(a, width=9)
  679. array([' aAaAaA', ' aA ', ' abBABba'], dtype='<U9')
  680. """
  681. width = np.asanyarray(width)
  682. if not np.issubdtype(width.dtype, np.integer):
  683. raise TypeError(f"unsupported type {width.dtype} for operand 'width'")
  684. a = np.asanyarray(a)
  685. fillchar = np.asanyarray(fillchar)
  686. if np.any(str_len(fillchar) != 1):
  687. raise TypeError(
  688. "The fill character must be exactly one character long")
  689. if np.result_type(a, fillchar).char == "T":
  690. return _rjust(a, width, fillchar)
  691. fillchar = fillchar.astype(a.dtype, copy=False)
  692. width = np.maximum(str_len(a), width)
  693. shape = np.broadcast_shapes(a.shape, width.shape, fillchar.shape)
  694. out_dtype = f"{a.dtype.char}{width.max()}"
  695. out = np.empty_like(a, shape=shape, dtype=out_dtype)
  696. return _rjust(a, width, fillchar, out=out)
  697. def _zfill_dispatcher(a, width):
  698. return (a,)
  699. @set_module("numpy.strings")
  700. @array_function_dispatch(_zfill_dispatcher)
  701. def zfill(a, width):
  702. """
  703. Return the numeric string left-filled with zeros. A leading
  704. sign prefix (``+``/``-``) is handled by inserting the padding
  705. after the sign character rather than before.
  706. Parameters
  707. ----------
  708. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  709. width : array_like, with any integer dtype
  710. Width of string to left-fill elements in `a`.
  711. Returns
  712. -------
  713. out : ndarray
  714. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  715. depending on input type
  716. See Also
  717. --------
  718. str.zfill
  719. Examples
  720. --------
  721. >>> import numpy as np
  722. >>> np.strings.zfill(['1', '-1', '+1'], 3)
  723. array(['001', '-01', '+01'], dtype='<U3')
  724. """
  725. width = np.asanyarray(width)
  726. if not np.issubdtype(width.dtype, np.integer):
  727. raise TypeError(f"unsupported type {width.dtype} for operand 'width'")
  728. a = np.asanyarray(a)
  729. if a.dtype.char == "T":
  730. return _zfill(a, width)
  731. width = np.maximum(str_len(a), width)
  732. shape = np.broadcast_shapes(a.shape, width.shape)
  733. out_dtype = f"{a.dtype.char}{width.max()}"
  734. out = np.empty_like(a, shape=shape, dtype=out_dtype)
  735. return _zfill(a, width, out=out)
  736. @set_module("numpy.strings")
  737. def lstrip(a, chars=None):
  738. """
  739. For each element in `a`, return a copy with the leading characters
  740. removed.
  741. Parameters
  742. ----------
  743. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  744. chars : scalar with the same dtype as ``a``, optional
  745. The ``chars`` argument is a string specifying the set of
  746. characters to be removed. If ``None``, the ``chars``
  747. argument defaults to removing whitespace. The ``chars`` argument
  748. is not a prefix or suffix; rather, all combinations of its
  749. values are stripped.
  750. Returns
  751. -------
  752. out : ndarray
  753. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  754. depending on input types
  755. See Also
  756. --------
  757. str.lstrip
  758. Examples
  759. --------
  760. >>> import numpy as np
  761. >>> c = np.array(['aAaAaA', ' aA ', 'abBABba'])
  762. >>> c
  763. array(['aAaAaA', ' aA ', 'abBABba'], dtype='<U7')
  764. # The 'a' variable is unstripped from c[1] because of leading whitespace.
  765. >>> np.strings.lstrip(c, 'a')
  766. array(['AaAaA', ' aA ', 'bBABba'], dtype='<U7')
  767. >>> np.strings.lstrip(c, 'A') # leaves c unchanged
  768. array(['aAaAaA', ' aA ', 'abBABba'], dtype='<U7')
  769. >>> (np.strings.lstrip(c, ' ') == np.strings.lstrip(c, '')).all()
  770. np.False_
  771. >>> (np.strings.lstrip(c, ' ') == np.strings.lstrip(c)).all()
  772. np.True_
  773. """
  774. if chars is None:
  775. return _lstrip_whitespace(a)
  776. return _lstrip_chars(a, chars)
  777. @set_module("numpy.strings")
  778. def rstrip(a, chars=None):
  779. """
  780. For each element in `a`, return a copy with the trailing characters
  781. removed.
  782. Parameters
  783. ----------
  784. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  785. chars : scalar with the same dtype as ``a``, optional
  786. The ``chars`` argument is a string specifying the set of
  787. characters to be removed. If ``None``, the ``chars``
  788. argument defaults to removing whitespace. The ``chars`` argument
  789. is not a prefix or suffix; rather, all combinations of its
  790. values are stripped.
  791. Returns
  792. -------
  793. out : ndarray
  794. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  795. depending on input types
  796. See Also
  797. --------
  798. str.rstrip
  799. Examples
  800. --------
  801. >>> import numpy as np
  802. >>> c = np.array(['aAaAaA', 'abBABba'])
  803. >>> c
  804. array(['aAaAaA', 'abBABba'], dtype='<U7')
  805. >>> np.strings.rstrip(c, 'a')
  806. array(['aAaAaA', 'abBABb'], dtype='<U7')
  807. >>> np.strings.rstrip(c, 'A')
  808. array(['aAaAa', 'abBABba'], dtype='<U7')
  809. """
  810. if chars is None:
  811. return _rstrip_whitespace(a)
  812. return _rstrip_chars(a, chars)
  813. @set_module("numpy.strings")
  814. def strip(a, chars=None):
  815. """
  816. For each element in `a`, return a copy with the leading and
  817. trailing characters removed.
  818. Parameters
  819. ----------
  820. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  821. chars : scalar with the same dtype as ``a``, optional
  822. The ``chars`` argument is a string specifying the set of
  823. characters to be removed. If ``None``, the ``chars``
  824. argument defaults to removing whitespace. The ``chars`` argument
  825. is not a prefix or suffix; rather, all combinations of its
  826. values are stripped.
  827. Returns
  828. -------
  829. out : ndarray
  830. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  831. depending on input types
  832. See Also
  833. --------
  834. str.strip
  835. Examples
  836. --------
  837. >>> import numpy as np
  838. >>> c = np.array(['aAaAaA', ' aA ', 'abBABba'])
  839. >>> c
  840. array(['aAaAaA', ' aA ', 'abBABba'], dtype='<U7')
  841. >>> np.strings.strip(c)
  842. array(['aAaAaA', 'aA', 'abBABba'], dtype='<U7')
  843. # 'a' unstripped from c[1] because of leading whitespace.
  844. >>> np.strings.strip(c, 'a')
  845. array(['AaAaA', ' aA ', 'bBABb'], dtype='<U7')
  846. # 'A' unstripped from c[1] because of trailing whitespace.
  847. >>> np.strings.strip(c, 'A')
  848. array(['aAaAa', ' aA ', 'abBABba'], dtype='<U7')
  849. """
  850. if chars is None:
  851. return _strip_whitespace(a)
  852. return _strip_chars(a, chars)
  853. def _unary_op_dispatcher(a):
  854. return (a,)
  855. @set_module("numpy.strings")
  856. @array_function_dispatch(_unary_op_dispatcher)
  857. def upper(a):
  858. """
  859. Return an array with the elements converted to uppercase.
  860. Calls :meth:`str.upper` element-wise.
  861. For 8-bit strings, this method is locale-dependent.
  862. Parameters
  863. ----------
  864. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  865. Input array.
  866. Returns
  867. -------
  868. out : ndarray
  869. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  870. depending on input types
  871. See Also
  872. --------
  873. str.upper
  874. Examples
  875. --------
  876. >>> import numpy as np
  877. >>> c = np.array(['a1b c', '1bca', 'bca1']); c
  878. array(['a1b c', '1bca', 'bca1'], dtype='<U5')
  879. >>> np.strings.upper(c)
  880. array(['A1B C', '1BCA', 'BCA1'], dtype='<U5')
  881. """
  882. a_arr = np.asarray(a)
  883. return _vec_string(a_arr, a_arr.dtype, 'upper')
  884. @set_module("numpy.strings")
  885. @array_function_dispatch(_unary_op_dispatcher)
  886. def lower(a):
  887. """
  888. Return an array with the elements converted to lowercase.
  889. Call :meth:`str.lower` element-wise.
  890. For 8-bit strings, this method is locale-dependent.
  891. Parameters
  892. ----------
  893. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  894. Input array.
  895. Returns
  896. -------
  897. out : ndarray
  898. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  899. depending on input types
  900. See Also
  901. --------
  902. str.lower
  903. Examples
  904. --------
  905. >>> import numpy as np
  906. >>> c = np.array(['A1B C', '1BCA', 'BCA1']); c
  907. array(['A1B C', '1BCA', 'BCA1'], dtype='<U5')
  908. >>> np.strings.lower(c)
  909. array(['a1b c', '1bca', 'bca1'], dtype='<U5')
  910. """
  911. a_arr = np.asarray(a)
  912. return _vec_string(a_arr, a_arr.dtype, 'lower')
  913. @set_module("numpy.strings")
  914. @array_function_dispatch(_unary_op_dispatcher)
  915. def swapcase(a):
  916. """
  917. Return element-wise a copy of the string with
  918. uppercase characters converted to lowercase and vice versa.
  919. Calls :meth:`str.swapcase` element-wise.
  920. For 8-bit strings, this method is locale-dependent.
  921. Parameters
  922. ----------
  923. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  924. Input array.
  925. Returns
  926. -------
  927. out : ndarray
  928. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  929. depending on input types
  930. See Also
  931. --------
  932. str.swapcase
  933. Examples
  934. --------
  935. >>> import numpy as np
  936. >>> c=np.array(['a1B c','1b Ca','b Ca1','cA1b'],'S5'); c
  937. array(['a1B c', '1b Ca', 'b Ca1', 'cA1b'],
  938. dtype='|S5')
  939. >>> np.strings.swapcase(c)
  940. array(['A1b C', '1B cA', 'B cA1', 'Ca1B'],
  941. dtype='|S5')
  942. """
  943. a_arr = np.asarray(a)
  944. return _vec_string(a_arr, a_arr.dtype, 'swapcase')
  945. @set_module("numpy.strings")
  946. @array_function_dispatch(_unary_op_dispatcher)
  947. def capitalize(a):
  948. """
  949. Return a copy of ``a`` with only the first character of each element
  950. capitalized.
  951. Calls :meth:`str.capitalize` element-wise.
  952. For byte strings, this method is locale-dependent.
  953. Parameters
  954. ----------
  955. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  956. Input array of strings to capitalize.
  957. Returns
  958. -------
  959. out : ndarray
  960. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  961. depending on input types
  962. See Also
  963. --------
  964. str.capitalize
  965. Examples
  966. --------
  967. >>> import numpy as np
  968. >>> c = np.array(['a1b2','1b2a','b2a1','2a1b'],'S4'); c
  969. array(['a1b2', '1b2a', 'b2a1', '2a1b'],
  970. dtype='|S4')
  971. >>> np.strings.capitalize(c)
  972. array(['A1b2', '1b2a', 'B2a1', '2a1b'],
  973. dtype='|S4')
  974. """
  975. a_arr = np.asarray(a)
  976. return _vec_string(a_arr, a_arr.dtype, 'capitalize')
  977. @set_module("numpy.strings")
  978. @array_function_dispatch(_unary_op_dispatcher)
  979. def title(a):
  980. """
  981. Return element-wise title cased version of string or unicode.
  982. Title case words start with uppercase characters, all remaining cased
  983. characters are lowercase.
  984. Calls :meth:`str.title` element-wise.
  985. For 8-bit strings, this method is locale-dependent.
  986. Parameters
  987. ----------
  988. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  989. Input array.
  990. Returns
  991. -------
  992. out : ndarray
  993. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  994. depending on input types
  995. See Also
  996. --------
  997. str.title
  998. Examples
  999. --------
  1000. >>> import numpy as np
  1001. >>> c=np.array(['a1b c','1b ca','b ca1','ca1b'],'S5'); c
  1002. array(['a1b c', '1b ca', 'b ca1', 'ca1b'],
  1003. dtype='|S5')
  1004. >>> np.strings.title(c)
  1005. array(['A1B C', '1B Ca', 'B Ca1', 'Ca1B'],
  1006. dtype='|S5')
  1007. """
  1008. a_arr = np.asarray(a)
  1009. return _vec_string(a_arr, a_arr.dtype, 'title')
  1010. def _replace_dispatcher(a, old, new, count=None):
  1011. return (a,)
  1012. @set_module("numpy.strings")
  1013. @array_function_dispatch(_replace_dispatcher)
  1014. def replace(a, old, new, count=-1):
  1015. """
  1016. For each element in ``a``, return a copy of the string with
  1017. occurrences of substring ``old`` replaced by ``new``.
  1018. Parameters
  1019. ----------
  1020. a : array_like, with ``bytes_`` or ``str_`` dtype
  1021. old, new : array_like, with ``bytes_`` or ``str_`` dtype
  1022. count : array_like, with ``int_`` dtype
  1023. If the optional argument ``count`` is given, only the first
  1024. ``count`` occurrences are replaced.
  1025. Returns
  1026. -------
  1027. out : ndarray
  1028. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  1029. depending on input types
  1030. See Also
  1031. --------
  1032. str.replace
  1033. Examples
  1034. --------
  1035. >>> import numpy as np
  1036. >>> a = np.array(["That is a mango", "Monkeys eat mangos"])
  1037. >>> np.strings.replace(a, 'mango', 'banana')
  1038. array(['That is a banana', 'Monkeys eat bananas'], dtype='<U19')
  1039. >>> a = np.array(["The dish is fresh", "This is it"])
  1040. >>> np.strings.replace(a, 'is', 'was')
  1041. array(['The dwash was fresh', 'Thwas was it'], dtype='<U19')
  1042. """
  1043. count = np.asanyarray(count)
  1044. if not np.issubdtype(count.dtype, np.integer):
  1045. raise TypeError(f"unsupported type {count.dtype} for operand 'count'")
  1046. arr = np.asanyarray(a)
  1047. old_dtype = getattr(old, 'dtype', None)
  1048. old = np.asanyarray(old)
  1049. new_dtype = getattr(new, 'dtype', None)
  1050. new = np.asanyarray(new)
  1051. if np.result_type(arr, old, new).char == "T":
  1052. return _replace(arr, old, new, count)
  1053. a_dt = arr.dtype
  1054. old = old.astype(old_dtype or a_dt, copy=False)
  1055. new = new.astype(new_dtype or a_dt, copy=False)
  1056. max_int64 = np.iinfo(np.int64).max
  1057. counts = _count_ufunc(arr, old, 0, max_int64)
  1058. counts = np.where(count < 0, counts, np.minimum(counts, count))
  1059. buffersizes = str_len(arr) + counts * (str_len(new) - str_len(old))
  1060. out_dtype = f"{arr.dtype.char}{buffersizes.max()}"
  1061. out = np.empty_like(arr, shape=buffersizes.shape, dtype=out_dtype)
  1062. return _replace(arr, old, new, counts, out=out)
  1063. def _join_dispatcher(sep, seq):
  1064. return (sep, seq)
  1065. @array_function_dispatch(_join_dispatcher)
  1066. def _join(sep, seq):
  1067. """
  1068. Return a string which is the concatenation of the strings in the
  1069. sequence `seq`.
  1070. Calls :meth:`str.join` element-wise.
  1071. Parameters
  1072. ----------
  1073. sep : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  1074. seq : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  1075. Returns
  1076. -------
  1077. out : ndarray
  1078. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  1079. depending on input types
  1080. See Also
  1081. --------
  1082. str.join
  1083. Examples
  1084. --------
  1085. >>> import numpy as np
  1086. >>> np.strings.join('-', 'osd') # doctest: +SKIP
  1087. array('o-s-d', dtype='<U5') # doctest: +SKIP
  1088. >>> np.strings.join(['-', '.'], ['ghc', 'osd']) # doctest: +SKIP
  1089. array(['g-h-c', 'o.s.d'], dtype='<U5') # doctest: +SKIP
  1090. """
  1091. return _to_bytes_or_str_array(
  1092. _vec_string(sep, np.object_, 'join', (seq,)), seq)
  1093. def _split_dispatcher(a, sep=None, maxsplit=None):
  1094. return (a,)
  1095. @array_function_dispatch(_split_dispatcher)
  1096. def _split(a, sep=None, maxsplit=None):
  1097. """
  1098. For each element in `a`, return a list of the words in the
  1099. string, using `sep` as the delimiter string.
  1100. Calls :meth:`str.split` element-wise.
  1101. Parameters
  1102. ----------
  1103. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  1104. sep : str or unicode, optional
  1105. If `sep` is not specified or None, any whitespace string is a
  1106. separator.
  1107. maxsplit : int, optional
  1108. If `maxsplit` is given, at most `maxsplit` splits are done.
  1109. Returns
  1110. -------
  1111. out : ndarray
  1112. Array of list objects
  1113. Examples
  1114. --------
  1115. >>> import numpy as np
  1116. >>> x = np.array("Numpy is nice!")
  1117. >>> np.strings.split(x, " ") # doctest: +SKIP
  1118. array(list(['Numpy', 'is', 'nice!']), dtype=object) # doctest: +SKIP
  1119. >>> np.strings.split(x, " ", 1) # doctest: +SKIP
  1120. array(list(['Numpy', 'is nice!']), dtype=object) # doctest: +SKIP
  1121. See Also
  1122. --------
  1123. str.split, rsplit
  1124. """
  1125. # This will return an array of lists of different sizes, so we
  1126. # leave it as an object array
  1127. return _vec_string(
  1128. a, np.object_, 'split', [sep] + _clean_args(maxsplit))
  1129. @array_function_dispatch(_split_dispatcher)
  1130. def _rsplit(a, sep=None, maxsplit=None):
  1131. """
  1132. For each element in `a`, return a list of the words in the
  1133. string, using `sep` as the delimiter string.
  1134. Calls :meth:`str.rsplit` element-wise.
  1135. Except for splitting from the right, `rsplit`
  1136. behaves like `split`.
  1137. Parameters
  1138. ----------
  1139. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  1140. sep : str or unicode, optional
  1141. If `sep` is not specified or None, any whitespace string
  1142. is a separator.
  1143. maxsplit : int, optional
  1144. If `maxsplit` is given, at most `maxsplit` splits are done,
  1145. the rightmost ones.
  1146. Returns
  1147. -------
  1148. out : ndarray
  1149. Array of list objects
  1150. See Also
  1151. --------
  1152. str.rsplit, split
  1153. Examples
  1154. --------
  1155. >>> import numpy as np
  1156. >>> a = np.array(['aAaAaA', 'abBABba'])
  1157. >>> np.strings.rsplit(a, 'A') # doctest: +SKIP
  1158. array([list(['a', 'a', 'a', '']), # doctest: +SKIP
  1159. list(['abB', 'Bba'])], dtype=object) # doctest: +SKIP
  1160. """
  1161. # This will return an array of lists of different sizes, so we
  1162. # leave it as an object array
  1163. return _vec_string(
  1164. a, np.object_, 'rsplit', [sep] + _clean_args(maxsplit))
  1165. def _splitlines_dispatcher(a, keepends=None):
  1166. return (a,)
  1167. @array_function_dispatch(_splitlines_dispatcher)
  1168. def _splitlines(a, keepends=None):
  1169. """
  1170. For each element in `a`, return a list of the lines in the
  1171. element, breaking at line boundaries.
  1172. Calls :meth:`str.splitlines` element-wise.
  1173. Parameters
  1174. ----------
  1175. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  1176. keepends : bool, optional
  1177. Line breaks are not included in the resulting list unless
  1178. keepends is given and true.
  1179. Returns
  1180. -------
  1181. out : ndarray
  1182. Array of list objects
  1183. See Also
  1184. --------
  1185. str.splitlines
  1186. Examples
  1187. --------
  1188. >>> np.char.splitlines("first line\\nsecond line")
  1189. array(list(['first line', 'second line']), dtype=object)
  1190. >>> a = np.array(["first\\nsecond", "third\\nfourth"])
  1191. >>> np.char.splitlines(a)
  1192. array([list(['first', 'second']), list(['third', 'fourth'])], dtype=object)
  1193. """
  1194. return _vec_string(
  1195. a, np.object_, 'splitlines', _clean_args(keepends))
  1196. def _partition_dispatcher(a, sep):
  1197. return (a,)
  1198. @set_module("numpy.strings")
  1199. @array_function_dispatch(_partition_dispatcher)
  1200. def partition(a, sep):
  1201. """
  1202. Partition each element in ``a`` around ``sep``.
  1203. For each element in ``a``, split the element at the first
  1204. occurrence of ``sep``, and return a 3-tuple containing the part
  1205. before the separator, the separator itself, and the part after
  1206. the separator. If the separator is not found, the first item of
  1207. the tuple will contain the whole string, and the second and third
  1208. ones will be the empty string.
  1209. Parameters
  1210. ----------
  1211. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  1212. Input array
  1213. sep : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  1214. Separator to split each string element in ``a``.
  1215. Returns
  1216. -------
  1217. out : 3-tuple:
  1218. - array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the
  1219. part before the separator
  1220. - array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the
  1221. separator
  1222. - array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the
  1223. part after the separator
  1224. See Also
  1225. --------
  1226. str.partition
  1227. Examples
  1228. --------
  1229. >>> import numpy as np
  1230. >>> x = np.array(["Numpy is nice!"])
  1231. >>> np.strings.partition(x, " ")
  1232. (array(['Numpy'], dtype='<U5'),
  1233. array([' '], dtype='<U1'),
  1234. array(['is nice!'], dtype='<U8'))
  1235. """
  1236. a = np.asanyarray(a)
  1237. sep = np.asanyarray(sep)
  1238. if np.result_type(a, sep).char == "T":
  1239. return _partition(a, sep)
  1240. sep = sep.astype(a.dtype, copy=False)
  1241. pos = _find_ufunc(a, sep, 0, MAX)
  1242. a_len = str_len(a)
  1243. sep_len = str_len(sep)
  1244. not_found = pos < 0
  1245. buffersizes1 = np.where(not_found, a_len, pos)
  1246. buffersizes3 = np.where(not_found, 0, a_len - pos - sep_len)
  1247. out_dtype = ",".join([f"{a.dtype.char}{n}" for n in (
  1248. buffersizes1.max(),
  1249. 1 if np.all(not_found) else sep_len.max(),
  1250. buffersizes3.max(),
  1251. )])
  1252. shape = np.broadcast_shapes(a.shape, sep.shape)
  1253. out = np.empty_like(a, shape=shape, dtype=out_dtype)
  1254. return _partition_index(a, sep, pos, out=(out["f0"], out["f1"], out["f2"]))
  1255. @set_module("numpy.strings")
  1256. @array_function_dispatch(_partition_dispatcher)
  1257. def rpartition(a, sep):
  1258. """
  1259. Partition (split) each element around the right-most separator.
  1260. For each element in ``a``, split the element at the last
  1261. occurrence of ``sep``, and return a 3-tuple containing the part
  1262. before the separator, the separator itself, and the part after
  1263. the separator. If the separator is not found, the third item of
  1264. the tuple will contain the whole string, and the first and second
  1265. ones will be the empty string.
  1266. Parameters
  1267. ----------
  1268. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  1269. Input array
  1270. sep : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  1271. Separator to split each string element in ``a``.
  1272. Returns
  1273. -------
  1274. out : 3-tuple:
  1275. - array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the
  1276. part before the separator
  1277. - array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the
  1278. separator
  1279. - array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the
  1280. part after the separator
  1281. See Also
  1282. --------
  1283. str.rpartition
  1284. Examples
  1285. --------
  1286. >>> import numpy as np
  1287. >>> a = np.array(['aAaAaA', ' aA ', 'abBABba'])
  1288. >>> np.strings.rpartition(a, 'A')
  1289. (array(['aAaAa', ' a', 'abB'], dtype='<U5'),
  1290. array(['A', 'A', 'A'], dtype='<U1'),
  1291. array(['', ' ', 'Bba'], dtype='<U3'))
  1292. """
  1293. a = np.asanyarray(a)
  1294. sep = np.asanyarray(sep)
  1295. if np.result_type(a, sep).char == "T":
  1296. return _rpartition(a, sep)
  1297. sep = sep.astype(a.dtype, copy=False)
  1298. pos = _rfind_ufunc(a, sep, 0, MAX)
  1299. a_len = str_len(a)
  1300. sep_len = str_len(sep)
  1301. not_found = pos < 0
  1302. buffersizes1 = np.where(not_found, 0, pos)
  1303. buffersizes3 = np.where(not_found, a_len, a_len - pos - sep_len)
  1304. out_dtype = ",".join([f"{a.dtype.char}{n}" for n in (
  1305. buffersizes1.max(),
  1306. 1 if np.all(not_found) else sep_len.max(),
  1307. buffersizes3.max(),
  1308. )])
  1309. shape = np.broadcast_shapes(a.shape, sep.shape)
  1310. out = np.empty_like(a, shape=shape, dtype=out_dtype)
  1311. return _rpartition_index(
  1312. a, sep, pos, out=(out["f0"], out["f1"], out["f2"]))
  1313. def _translate_dispatcher(a, table, deletechars=None):
  1314. return (a,)
  1315. @set_module("numpy.strings")
  1316. @array_function_dispatch(_translate_dispatcher)
  1317. def translate(a, table, deletechars=None):
  1318. """
  1319. For each element in `a`, return a copy of the string where all
  1320. characters occurring in the optional argument `deletechars` are
  1321. removed, and the remaining characters have been mapped through the
  1322. given translation table.
  1323. Calls :meth:`str.translate` element-wise.
  1324. Parameters
  1325. ----------
  1326. a : array-like, with `np.bytes_` or `np.str_` dtype
  1327. table : str of length 256
  1328. deletechars : str
  1329. Returns
  1330. -------
  1331. out : ndarray
  1332. Output array of str or unicode, depending on input type
  1333. See Also
  1334. --------
  1335. str.translate
  1336. Examples
  1337. --------
  1338. >>> import numpy as np
  1339. >>> a = np.array(['a1b c', '1bca', 'bca1'])
  1340. >>> table = a[0].maketrans('abc', '123')
  1341. >>> deletechars = ' '
  1342. >>> np.char.translate(a, table, deletechars)
  1343. array(['112 3', '1231', '2311'], dtype='<U5')
  1344. """
  1345. a_arr = np.asarray(a)
  1346. if issubclass(a_arr.dtype.type, np.str_):
  1347. return _vec_string(
  1348. a_arr, a_arr.dtype, 'translate', (table,))
  1349. else:
  1350. return _vec_string(
  1351. a_arr,
  1352. a_arr.dtype,
  1353. 'translate',
  1354. [table] + _clean_args(deletechars)
  1355. )
  1356. @set_module("numpy.strings")
  1357. def slice(a, start=None, stop=np._NoValue, step=None, /):
  1358. """
  1359. Slice the strings in `a` by slices specified by `start`, `stop`, `step`.
  1360. Like in the regular Python `slice` object, if only `start` is
  1361. specified then it is interpreted as the `stop`.
  1362. Parameters
  1363. ----------
  1364. a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
  1365. Input array
  1366. start : None, an integer or an array of integers
  1367. The start of the slice, broadcasted to `a`'s shape
  1368. stop : None, an integer or an array of integers
  1369. The end of the slice, broadcasted to `a`'s shape
  1370. step : None, an integer or an array of integers
  1371. The step for the slice, broadcasted to `a`'s shape
  1372. Returns
  1373. -------
  1374. out : ndarray
  1375. Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
  1376. depending on input type
  1377. Examples
  1378. --------
  1379. >>> import numpy as np
  1380. >>> a = np.array(['hello', 'world'])
  1381. >>> np.strings.slice(a, 2)
  1382. array(['he', 'wo'], dtype='<U5')
  1383. >>> np.strings.slice(a, 2, None)
  1384. array(['llo', 'rld'], dtype='<U5')
  1385. >>> np.strings.slice(a, 1, 5, 2)
  1386. array(['el', 'ol'], dtype='<U5')
  1387. One can specify different start/stop/step for different array entries:
  1388. >>> np.strings.slice(a, np.array([1, 2]), np.array([4, 5]))
  1389. array(['ell', 'rld'], dtype='<U5')
  1390. Negative slices have the same meaning as in regular Python:
  1391. >>> b = np.array(['hello world', 'γεια σου κόσμε', '你好世界', '👋 🌍'],
  1392. ... dtype=np.dtypes.StringDType())
  1393. >>> np.strings.slice(b, -2)
  1394. array(['hello wor', 'γεια σου κόσ', '你好', '👋'], dtype=StringDType())
  1395. >>> np.strings.slice(b, -2, None)
  1396. array(['ld', 'με', '世界', ' 🌍'], dtype=StringDType())
  1397. >>> np.strings.slice(b, [3, -10, 2, -3], [-1, -2, -1, 3])
  1398. array(['lo worl', ' σου κόσ', '世', '👋 🌍'], dtype=StringDType())
  1399. >>> np.strings.slice(b, None, None, -1)
  1400. array(['dlrow olleh', 'εμσόκ υοσ αιεγ', '界世好你', '🌍 👋'],
  1401. dtype=StringDType())
  1402. """
  1403. # Just like in the construction of a regular slice object, if only start
  1404. # is specified then start will become stop, see logic in slice_new.
  1405. if stop is np._NoValue:
  1406. stop = start
  1407. start = None
  1408. # adjust start, stop, step to be integers, see logic in PySlice_Unpack
  1409. if step is None:
  1410. step = 1
  1411. step = np.asanyarray(step)
  1412. if not np.issubdtype(step.dtype, np.integer):
  1413. raise TypeError(f"unsupported type {step.dtype} for operand 'step'")
  1414. if np.any(step == 0):
  1415. raise ValueError("slice step cannot be zero")
  1416. if start is None:
  1417. start = np.where(step < 0, np.iinfo(np.intp).max, 0)
  1418. if stop is None:
  1419. stop = np.where(step < 0, np.iinfo(np.intp).min, np.iinfo(np.intp).max)
  1420. return _slice(a, start, stop, step)