__init__.pyi 31 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015
  1. # Generated content DO NOT EDIT
  2. class PreTokenizer:
  3. """
  4. Base class for all pre-tokenizers
  5. This class is not supposed to be instantiated directly. Instead, any implementation of a
  6. PreTokenizer will return an instance of this class when instantiated.
  7. """
  8. def __getstate__(self):
  9. """ """
  10. pass
  11. def __setstate__(self, state):
  12. """ """
  13. pass
  14. @staticmethod
  15. def custom(pretok):
  16. """ """
  17. pass
  18. def pre_tokenize(self, pretok):
  19. """
  20. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  21. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  22. keep track of the pre-tokenization, and leverage the capabilities of the
  23. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  24. the pre-tokenization of a raw string, you can use
  25. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  26. Args:
  27. pretok (:class:`~tokenizers.PreTokenizedString):
  28. The pre-tokenized string on which to apply this
  29. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  30. """
  31. pass
  32. def pre_tokenize_str(self, sequence):
  33. """
  34. Pre tokenize the given string
  35. This method provides a way to visualize the effect of a
  36. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  37. alignment, nor does it provide all the capabilities of the
  38. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  39. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  40. Args:
  41. sequence (:obj:`str`):
  42. A string to pre-tokeize
  43. Returns:
  44. :obj:`List[Tuple[str, Offsets]]`:
  45. A list of tuple with the pre-tokenized parts and their offsets
  46. """
  47. pass
  48. class BertPreTokenizer(PreTokenizer):
  49. """
  50. BertPreTokenizer
  51. This pre-tokenizer splits tokens on spaces, and also on punctuation.
  52. Each occurrence of a punctuation character will be treated separately.
  53. """
  54. def __init__(self):
  55. pass
  56. def __getstate__(self):
  57. """ """
  58. pass
  59. def __setstate__(self, state):
  60. """ """
  61. pass
  62. @staticmethod
  63. def custom(pretok):
  64. """ """
  65. pass
  66. def pre_tokenize(self, pretok):
  67. """
  68. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  69. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  70. keep track of the pre-tokenization, and leverage the capabilities of the
  71. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  72. the pre-tokenization of a raw string, you can use
  73. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  74. Args:
  75. pretok (:class:`~tokenizers.PreTokenizedString):
  76. The pre-tokenized string on which to apply this
  77. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  78. """
  79. pass
  80. def pre_tokenize_str(self, sequence):
  81. """
  82. Pre tokenize the given string
  83. This method provides a way to visualize the effect of a
  84. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  85. alignment, nor does it provide all the capabilities of the
  86. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  87. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  88. Args:
  89. sequence (:obj:`str`):
  90. A string to pre-tokeize
  91. Returns:
  92. :obj:`List[Tuple[str, Offsets]]`:
  93. A list of tuple with the pre-tokenized parts and their offsets
  94. """
  95. pass
  96. class ByteLevel(PreTokenizer):
  97. """
  98. ByteLevel PreTokenizer
  99. This pre-tokenizer takes care of replacing all bytes of the given string
  100. with a corresponding representation, as well as splitting into words.
  101. Args:
  102. add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
  103. Whether to add a space to the first word if there isn't already one. This
  104. lets us treat `hello` exactly like `say hello`.
  105. use_regex (:obj:`bool`, `optional`, defaults to :obj:`True`):
  106. Set this to :obj:`False` to prevent this `pre_tokenizer` from using
  107. the GPT2 specific regexp for spliting on whitespace.
  108. """
  109. def __init__(self, add_prefix_space=True, trim_offsets=True, use_regex=True):
  110. pass
  111. def __getstate__(self):
  112. """ """
  113. pass
  114. def __setstate__(self, state):
  115. """ """
  116. pass
  117. @property
  118. def add_prefix_space(self):
  119. """ """
  120. pass
  121. @add_prefix_space.setter
  122. def add_prefix_space(self, value):
  123. """ """
  124. pass
  125. @staticmethod
  126. def alphabet():
  127. """
  128. Returns the alphabet used by this PreTokenizer.
  129. Since the ByteLevel works as its name suggests, at the byte level, it
  130. encodes each byte value to a unique visible character. This means that there is a
  131. total of 256 different characters composing this alphabet.
  132. Returns:
  133. :obj:`List[str]`: A list of characters that compose the alphabet
  134. """
  135. pass
  136. @staticmethod
  137. def custom(pretok):
  138. """ """
  139. pass
  140. def pre_tokenize(self, pretok):
  141. """
  142. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  143. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  144. keep track of the pre-tokenization, and leverage the capabilities of the
  145. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  146. the pre-tokenization of a raw string, you can use
  147. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  148. Args:
  149. pretok (:class:`~tokenizers.PreTokenizedString):
  150. The pre-tokenized string on which to apply this
  151. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  152. """
  153. pass
  154. def pre_tokenize_str(self, sequence):
  155. """
  156. Pre tokenize the given string
  157. This method provides a way to visualize the effect of a
  158. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  159. alignment, nor does it provide all the capabilities of the
  160. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  161. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  162. Args:
  163. sequence (:obj:`str`):
  164. A string to pre-tokeize
  165. Returns:
  166. :obj:`List[Tuple[str, Offsets]]`:
  167. A list of tuple with the pre-tokenized parts and their offsets
  168. """
  169. pass
  170. @property
  171. def trim_offsets(self):
  172. """ """
  173. pass
  174. @trim_offsets.setter
  175. def trim_offsets(self, value):
  176. """ """
  177. pass
  178. @property
  179. def use_regex(self):
  180. """ """
  181. pass
  182. @use_regex.setter
  183. def use_regex(self, value):
  184. """ """
  185. pass
  186. class CharDelimiterSplit(PreTokenizer):
  187. """
  188. This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
  189. Args:
  190. delimiter: str:
  191. The delimiter char that will be used to split input
  192. """
  193. def __init__(self, delimiter):
  194. pass
  195. def __getnewargs__(self):
  196. """ """
  197. pass
  198. def __getstate__(self):
  199. """ """
  200. pass
  201. def __setstate__(self, state):
  202. """ """
  203. pass
  204. @staticmethod
  205. def custom(pretok):
  206. """ """
  207. pass
  208. @property
  209. def delimiter(self):
  210. """ """
  211. pass
  212. @delimiter.setter
  213. def delimiter(self, value):
  214. """ """
  215. pass
  216. def pre_tokenize(self, pretok):
  217. """
  218. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  219. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  220. keep track of the pre-tokenization, and leverage the capabilities of the
  221. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  222. the pre-tokenization of a raw string, you can use
  223. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  224. Args:
  225. pretok (:class:`~tokenizers.PreTokenizedString):
  226. The pre-tokenized string on which to apply this
  227. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  228. """
  229. pass
  230. def pre_tokenize_str(self, sequence):
  231. """
  232. Pre tokenize the given string
  233. This method provides a way to visualize the effect of a
  234. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  235. alignment, nor does it provide all the capabilities of the
  236. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  237. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  238. Args:
  239. sequence (:obj:`str`):
  240. A string to pre-tokeize
  241. Returns:
  242. :obj:`List[Tuple[str, Offsets]]`:
  243. A list of tuple with the pre-tokenized parts and their offsets
  244. """
  245. pass
  246. class Digits(PreTokenizer):
  247. """
  248. This pre-tokenizer simply splits using the digits in separate tokens
  249. Args:
  250. individual_digits (:obj:`bool`, `optional`, defaults to :obj:`False`):
  251. If set to True, digits will each be separated as follows::
  252. "Call 123 please" -> "Call ", "1", "2", "3", " please"
  253. If set to False, digits will grouped as follows::
  254. "Call 123 please" -> "Call ", "123", " please"
  255. """
  256. def __init__(self, individual_digits=False):
  257. pass
  258. def __getstate__(self):
  259. """ """
  260. pass
  261. def __setstate__(self, state):
  262. """ """
  263. pass
  264. @staticmethod
  265. def custom(pretok):
  266. """ """
  267. pass
  268. @property
  269. def individual_digits(self):
  270. """ """
  271. pass
  272. @individual_digits.setter
  273. def individual_digits(self, value):
  274. """ """
  275. pass
  276. def pre_tokenize(self, pretok):
  277. """
  278. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  279. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  280. keep track of the pre-tokenization, and leverage the capabilities of the
  281. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  282. the pre-tokenization of a raw string, you can use
  283. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  284. Args:
  285. pretok (:class:`~tokenizers.PreTokenizedString):
  286. The pre-tokenized string on which to apply this
  287. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  288. """
  289. pass
  290. def pre_tokenize_str(self, sequence):
  291. """
  292. Pre tokenize the given string
  293. This method provides a way to visualize the effect of a
  294. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  295. alignment, nor does it provide all the capabilities of the
  296. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  297. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  298. Args:
  299. sequence (:obj:`str`):
  300. A string to pre-tokeize
  301. Returns:
  302. :obj:`List[Tuple[str, Offsets]]`:
  303. A list of tuple with the pre-tokenized parts and their offsets
  304. """
  305. pass
  306. class FixedLength(PreTokenizer):
  307. """
  308. This pre-tokenizer splits the text into fixed length chunks as used
  309. [here](https://www.biorxiv.org/content/10.1101/2023.01.11.523679v1.full)
  310. Args:
  311. length (:obj:`int`, `optional`, defaults to :obj:`5`):
  312. The length of the chunks to split the text into.
  313. Strings are split on the character level rather than the byte level to avoid
  314. splitting unicode characters consisting of multiple bytes.
  315. """
  316. def __init__(self, length=5):
  317. pass
  318. def __getstate__(self):
  319. """ """
  320. pass
  321. def __setstate__(self, state):
  322. """ """
  323. pass
  324. @staticmethod
  325. def custom(pretok):
  326. """ """
  327. pass
  328. @property
  329. def length(self):
  330. """ """
  331. pass
  332. @length.setter
  333. def length(self, value):
  334. """ """
  335. pass
  336. def pre_tokenize(self, pretok):
  337. """
  338. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  339. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  340. keep track of the pre-tokenization, and leverage the capabilities of the
  341. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  342. the pre-tokenization of a raw string, you can use
  343. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  344. Args:
  345. pretok (:class:`~tokenizers.PreTokenizedString):
  346. The pre-tokenized string on which to apply this
  347. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  348. """
  349. pass
  350. def pre_tokenize_str(self, sequence):
  351. """
  352. Pre tokenize the given string
  353. This method provides a way to visualize the effect of a
  354. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  355. alignment, nor does it provide all the capabilities of the
  356. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  357. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  358. Args:
  359. sequence (:obj:`str`):
  360. A string to pre-tokeize
  361. Returns:
  362. :obj:`List[Tuple[str, Offsets]]`:
  363. A list of tuple with the pre-tokenized parts and their offsets
  364. """
  365. pass
  366. class Metaspace(PreTokenizer):
  367. """
  368. Metaspace pre-tokenizer
  369. This pre-tokenizer replaces any whitespace by the provided replacement character.
  370. It then tries to split on these spaces.
  371. Args:
  372. replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
  373. The replacement character. Must be exactly one character. By default we
  374. use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
  375. prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
  376. Whether to add a space to the first word if there isn't already one. This
  377. lets us treat `hello` exactly like `say hello`.
  378. Choices: "always", "never", "first". First means the space is only added on the first
  379. token (relevant when special tokens are used or other pre_tokenizer are used).
  380. """
  381. def __init__(self, replacement="_", prepend_scheme="always", split=True):
  382. pass
  383. def __getstate__(self):
  384. """ """
  385. pass
  386. def __setstate__(self, state):
  387. """ """
  388. pass
  389. @staticmethod
  390. def custom(pretok):
  391. """ """
  392. pass
  393. def pre_tokenize(self, pretok):
  394. """
  395. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  396. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  397. keep track of the pre-tokenization, and leverage the capabilities of the
  398. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  399. the pre-tokenization of a raw string, you can use
  400. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  401. Args:
  402. pretok (:class:`~tokenizers.PreTokenizedString):
  403. The pre-tokenized string on which to apply this
  404. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  405. """
  406. pass
  407. def pre_tokenize_str(self, sequence):
  408. """
  409. Pre tokenize the given string
  410. This method provides a way to visualize the effect of a
  411. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  412. alignment, nor does it provide all the capabilities of the
  413. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  414. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  415. Args:
  416. sequence (:obj:`str`):
  417. A string to pre-tokeize
  418. Returns:
  419. :obj:`List[Tuple[str, Offsets]]`:
  420. A list of tuple with the pre-tokenized parts and their offsets
  421. """
  422. pass
  423. @property
  424. def prepend_scheme(self):
  425. """ """
  426. pass
  427. @prepend_scheme.setter
  428. def prepend_scheme(self, value):
  429. """ """
  430. pass
  431. @property
  432. def replacement(self):
  433. """ """
  434. pass
  435. @replacement.setter
  436. def replacement(self, value):
  437. """ """
  438. pass
  439. @property
  440. def split(self):
  441. """ """
  442. pass
  443. @split.setter
  444. def split(self, value):
  445. """ """
  446. pass
  447. class Punctuation(PreTokenizer):
  448. """
  449. This pre-tokenizer simply splits on punctuation as individual characters.
  450. Args:
  451. behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
  452. The behavior to use when splitting.
  453. Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
  454. "contiguous"
  455. """
  456. def __init__(self, behavior="isolated"):
  457. pass
  458. def __getstate__(self):
  459. """ """
  460. pass
  461. def __setstate__(self, state):
  462. """ """
  463. pass
  464. @property
  465. def behavior(self):
  466. """ """
  467. pass
  468. @behavior.setter
  469. def behavior(self, value):
  470. """ """
  471. pass
  472. @staticmethod
  473. def custom(pretok):
  474. """ """
  475. pass
  476. def pre_tokenize(self, pretok):
  477. """
  478. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  479. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  480. keep track of the pre-tokenization, and leverage the capabilities of the
  481. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  482. the pre-tokenization of a raw string, you can use
  483. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  484. Args:
  485. pretok (:class:`~tokenizers.PreTokenizedString):
  486. The pre-tokenized string on which to apply this
  487. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  488. """
  489. pass
  490. def pre_tokenize_str(self, sequence):
  491. """
  492. Pre tokenize the given string
  493. This method provides a way to visualize the effect of a
  494. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  495. alignment, nor does it provide all the capabilities of the
  496. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  497. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  498. Args:
  499. sequence (:obj:`str`):
  500. A string to pre-tokeize
  501. Returns:
  502. :obj:`List[Tuple[str, Offsets]]`:
  503. A list of tuple with the pre-tokenized parts and their offsets
  504. """
  505. pass
  506. class Sequence(PreTokenizer):
  507. """
  508. This pre-tokenizer composes other pre_tokenizers and applies them in sequence
  509. """
  510. def __init__(self, pretokenizers):
  511. pass
  512. def __getitem__(self, key):
  513. """
  514. Return self[key].
  515. """
  516. pass
  517. def __getnewargs__(self):
  518. """ """
  519. pass
  520. def __getstate__(self):
  521. """ """
  522. pass
  523. def __setitem__(self, key, value):
  524. """
  525. Set self[key] to value.
  526. """
  527. pass
  528. def __setstate__(self, state):
  529. """ """
  530. pass
  531. @staticmethod
  532. def custom(pretok):
  533. """ """
  534. pass
  535. def pre_tokenize(self, pretok):
  536. """
  537. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  538. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  539. keep track of the pre-tokenization, and leverage the capabilities of the
  540. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  541. the pre-tokenization of a raw string, you can use
  542. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  543. Args:
  544. pretok (:class:`~tokenizers.PreTokenizedString):
  545. The pre-tokenized string on which to apply this
  546. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  547. """
  548. pass
  549. def pre_tokenize_str(self, sequence):
  550. """
  551. Pre tokenize the given string
  552. This method provides a way to visualize the effect of a
  553. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  554. alignment, nor does it provide all the capabilities of the
  555. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  556. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  557. Args:
  558. sequence (:obj:`str`):
  559. A string to pre-tokeize
  560. Returns:
  561. :obj:`List[Tuple[str, Offsets]]`:
  562. A list of tuple with the pre-tokenized parts and their offsets
  563. """
  564. pass
  565. class Split(PreTokenizer):
  566. """
  567. Split PreTokenizer
  568. This versatile pre-tokenizer splits using the provided pattern and
  569. according to the provided behavior. The pattern can be inverted by
  570. making use of the invert flag.
  571. Args:
  572. pattern (:obj:`str` or :class:`~tokenizers.Regex`):
  573. A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
  574. If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`,
  575. otherwise we consider is as a string pattern. For example `pattern="|"`
  576. means you want to split on `|` (imagine a csv file for example), while
  577. `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
  578. behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
  579. The behavior to use when splitting.
  580. Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
  581. "contiguous"
  582. invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
  583. Whether to invert the pattern.
  584. """
  585. def __init__(self, pattern, behavior, invert=False):
  586. pass
  587. def __getnewargs__(self):
  588. """ """
  589. pass
  590. def __getstate__(self):
  591. """ """
  592. pass
  593. def __setstate__(self, state):
  594. """ """
  595. pass
  596. @property
  597. def behavior(self):
  598. """ """
  599. pass
  600. @behavior.setter
  601. def behavior(self, value):
  602. """ """
  603. pass
  604. @staticmethod
  605. def custom(pretok):
  606. """ """
  607. pass
  608. @property
  609. def invert(self):
  610. """ """
  611. pass
  612. @invert.setter
  613. def invert(self, value):
  614. """ """
  615. pass
  616. @property
  617. def pattern(self):
  618. """ """
  619. pass
  620. @pattern.setter
  621. def pattern(self, value):
  622. """ """
  623. pass
  624. def pre_tokenize(self, pretok):
  625. """
  626. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  627. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  628. keep track of the pre-tokenization, and leverage the capabilities of the
  629. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  630. the pre-tokenization of a raw string, you can use
  631. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  632. Args:
  633. pretok (:class:`~tokenizers.PreTokenizedString):
  634. The pre-tokenized string on which to apply this
  635. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  636. """
  637. pass
  638. def pre_tokenize_str(self, sequence):
  639. """
  640. Pre tokenize the given string
  641. This method provides a way to visualize the effect of a
  642. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  643. alignment, nor does it provide all the capabilities of the
  644. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  645. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  646. Args:
  647. sequence (:obj:`str`):
  648. A string to pre-tokeize
  649. Returns:
  650. :obj:`List[Tuple[str, Offsets]]`:
  651. A list of tuple with the pre-tokenized parts and their offsets
  652. """
  653. pass
  654. class UnicodeScripts(PreTokenizer):
  655. """
  656. This pre-tokenizer splits on characters that belong to different language family
  657. It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
  658. Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
  659. This mimicks SentencePiece Unigram implementation.
  660. """
  661. def __init__(self):
  662. pass
  663. def __getstate__(self):
  664. """ """
  665. pass
  666. def __setstate__(self, state):
  667. """ """
  668. pass
  669. @staticmethod
  670. def custom(pretok):
  671. """ """
  672. pass
  673. def pre_tokenize(self, pretok):
  674. """
  675. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  676. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  677. keep track of the pre-tokenization, and leverage the capabilities of the
  678. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  679. the pre-tokenization of a raw string, you can use
  680. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  681. Args:
  682. pretok (:class:`~tokenizers.PreTokenizedString):
  683. The pre-tokenized string on which to apply this
  684. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  685. """
  686. pass
  687. def pre_tokenize_str(self, sequence):
  688. """
  689. Pre tokenize the given string
  690. This method provides a way to visualize the effect of a
  691. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  692. alignment, nor does it provide all the capabilities of the
  693. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  694. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  695. Args:
  696. sequence (:obj:`str`):
  697. A string to pre-tokeize
  698. Returns:
  699. :obj:`List[Tuple[str, Offsets]]`:
  700. A list of tuple with the pre-tokenized parts and their offsets
  701. """
  702. pass
  703. class Whitespace(PreTokenizer):
  704. """
  705. This pre-tokenizer splits on word boundaries according to the `\w+|[^\w\s]+`
  706. regex pattern. It splits on word characters or characters that aren't words or
  707. whitespaces (punctuation such as hyphens, apostrophes, commas, etc.).
  708. Example:
  709. Use the `Whitespace` function as shown below::
  710. ```python
  711. from tokenizers.pre_tokenizers import Whitespace
  712. pre_tokenizer = Whitespace()
  713. text = "Hello, world! Let's try the Whitespace pre-tokenizer."
  714. pre_tokenizer.pre_tokenize_str(text)
  715. [('Hello', (0, 5)),
  716. (',', (5, 6)),
  717. ('world', (7, 12)),
  718. ('!', (12, 13)),
  719. ('Let', (14, 17)),
  720. ("'", (17, 18)),
  721. ('s', (18, 19)),
  722. ('try', (20, 23)),
  723. ('the', (24, 27)),
  724. ('Whitespace', (28, 38)),
  725. ('pre', (39, 42)),
  726. ('-', (42, 43)),
  727. ('tokenizer', (43, 52)),
  728. ('.', (52, 53))]
  729. ```
  730. """
  731. def __init__(self):
  732. pass
  733. def __getstate__(self):
  734. """ """
  735. pass
  736. def __setstate__(self, state):
  737. """ """
  738. pass
  739. @staticmethod
  740. def custom(pretok):
  741. """ """
  742. pass
  743. def pre_tokenize(self, pretok):
  744. """
  745. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  746. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  747. keep track of the pre-tokenization, and leverage the capabilities of the
  748. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  749. the pre-tokenization of a raw string, you can use
  750. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  751. Args:
  752. pretok (:class:`~tokenizers.PreTokenizedString):
  753. The pre-tokenized string on which to apply this
  754. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  755. """
  756. pass
  757. def pre_tokenize_str(self, sequence):
  758. """
  759. Pre tokenize the given string
  760. This method provides a way to visualize the effect of a
  761. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  762. alignment, nor does it provide all the capabilities of the
  763. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  764. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  765. Args:
  766. sequence (:obj:`str`):
  767. A string to pre-tokeize
  768. Returns:
  769. :obj:`List[Tuple[str, Offsets]]`:
  770. A list of tuple with the pre-tokenized parts and their offsets
  771. """
  772. pass
  773. class WhitespaceSplit(PreTokenizer):
  774. """
  775. This pre-tokenizer simply splits on the whitespace. Works like `.split()`
  776. """
  777. def __init__(self):
  778. pass
  779. def __getstate__(self):
  780. """ """
  781. pass
  782. def __setstate__(self, state):
  783. """ """
  784. pass
  785. @staticmethod
  786. def custom(pretok):
  787. """ """
  788. pass
  789. def pre_tokenize(self, pretok):
  790. """
  791. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  792. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  793. keep track of the pre-tokenization, and leverage the capabilities of the
  794. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  795. the pre-tokenization of a raw string, you can use
  796. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  797. Args:
  798. pretok (:class:`~tokenizers.PreTokenizedString):
  799. The pre-tokenized string on which to apply this
  800. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  801. """
  802. pass
  803. def pre_tokenize_str(self, sequence):
  804. """
  805. Pre tokenize the given string
  806. This method provides a way to visualize the effect of a
  807. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  808. alignment, nor does it provide all the capabilities of the
  809. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  810. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  811. Args:
  812. sequence (:obj:`str`):
  813. A string to pre-tokeize
  814. Returns:
  815. :obj:`List[Tuple[str, Offsets]]`:
  816. A list of tuple with the pre-tokenized parts and their offsets
  817. """
  818. pass