# Generated content DO NOT EDIT
  2. class PreTokenizer:
  3. """
  4. Base class for all pre-tokenizers
  5. This class is not supposed to be instantiated directly. Instead, any implementation of a
  6. PreTokenizer will return an instance of this class when instantiated.
  7. """
  8. def pre_tokenize(self, pretok):
  9. """
  10. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  11. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  12. keep track of the pre-tokenization, and leverage the capabilities of the
  13. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  14. the pre-tokenization of a raw string, you can use
  15. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  16. Args:
  17. pretok (:class:`~tokenizers.PreTokenizedString):
  18. The pre-tokenized string on which to apply this
  19. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  20. """
  21. pass
  22. def pre_tokenize_str(self, sequence):
  23. """
  24. Pre tokenize the given string
  25. This method provides a way to visualize the effect of a
  26. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  27. alignment, nor does it provide all the capabilities of the
  28. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  29. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  30. Args:
  31. sequence (:obj:`str`):
  32. A string to pre-tokeize
  33. Returns:
  34. :obj:`List[Tuple[str, Offsets]]`:
  35. A list of tuple with the pre-tokenized parts and their offsets
  36. """
  37. pass
  38. class BertPreTokenizer(PreTokenizer):
  39. """
  40. BertPreTokenizer
  41. This pre-tokenizer splits tokens on spaces, and also on punctuation.
  42. Each occurrence of a punctuation character will be treated separately.
  43. """
  44. def __init__(self):
  45. pass
  46. def pre_tokenize(self, pretok):
  47. """
  48. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  49. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  50. keep track of the pre-tokenization, and leverage the capabilities of the
  51. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  52. the pre-tokenization of a raw string, you can use
  53. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  54. Args:
  55. pretok (:class:`~tokenizers.PreTokenizedString):
  56. The pre-tokenized string on which to apply this
  57. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  58. """
  59. pass
  60. def pre_tokenize_str(self, sequence):
  61. """
  62. Pre tokenize the given string
  63. This method provides a way to visualize the effect of a
  64. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  65. alignment, nor does it provide all the capabilities of the
  66. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  67. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  68. Args:
  69. sequence (:obj:`str`):
  70. A string to pre-tokeize
  71. Returns:
  72. :obj:`List[Tuple[str, Offsets]]`:
  73. A list of tuple with the pre-tokenized parts and their offsets
  74. """
  75. pass
  76. class ByteLevel(PreTokenizer):
  77. """
  78. ByteLevel PreTokenizer
  79. This pre-tokenizer takes care of replacing all bytes of the given string
  80. with a corresponding representation, as well as splitting into words.
  81. Args:
  82. add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
  83. Whether to add a space to the first word if there isn't already one. This
  84. lets us treat `hello` exactly like `say hello`.
  85. use_regex (:obj:`bool`, `optional`, defaults to :obj:`True`):
  86. Set this to :obj:`False` to prevent this `pre_tokenizer` from using
  87. the GPT2 specific regexp for spliting on whitespace.
  88. """
  89. def __init__(self, add_prefix_space=True, use_regex=True):
  90. pass
  91. @staticmethod
  92. def alphabet():
  93. """
  94. Returns the alphabet used by this PreTokenizer.
  95. Since the ByteLevel works as its name suggests, at the byte level, it
  96. encodes each byte value to a unique visible character. This means that there is a
  97. total of 256 different characters composing this alphabet.
  98. Returns:
  99. :obj:`List[str]`: A list of characters that compose the alphabet
  100. """
  101. pass
  102. def pre_tokenize(self, pretok):
  103. """
  104. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  105. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  106. keep track of the pre-tokenization, and leverage the capabilities of the
  107. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  108. the pre-tokenization of a raw string, you can use
  109. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  110. Args:
  111. pretok (:class:`~tokenizers.PreTokenizedString):
  112. The pre-tokenized string on which to apply this
  113. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  114. """
  115. pass
  116. def pre_tokenize_str(self, sequence):
  117. """
  118. Pre tokenize the given string
  119. This method provides a way to visualize the effect of a
  120. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  121. alignment, nor does it provide all the capabilities of the
  122. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  123. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  124. Args:
  125. sequence (:obj:`str`):
  126. A string to pre-tokeize
  127. Returns:
  128. :obj:`List[Tuple[str, Offsets]]`:
  129. A list of tuple with the pre-tokenized parts and their offsets
  130. """
  131. pass
  132. class CharDelimiterSplit(PreTokenizer):
  133. """
  134. This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
  135. Args:
  136. delimiter: str:
  137. The delimiter char that will be used to split input
  138. """
  139. def pre_tokenize(self, pretok):
  140. """
  141. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  142. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  143. keep track of the pre-tokenization, and leverage the capabilities of the
  144. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  145. the pre-tokenization of a raw string, you can use
  146. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  147. Args:
  148. pretok (:class:`~tokenizers.PreTokenizedString):
  149. The pre-tokenized string on which to apply this
  150. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  151. """
  152. pass
  153. def pre_tokenize_str(self, sequence):
  154. """
  155. Pre tokenize the given string
  156. This method provides a way to visualize the effect of a
  157. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  158. alignment, nor does it provide all the capabilities of the
  159. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  160. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  161. Args:
  162. sequence (:obj:`str`):
  163. A string to pre-tokeize
  164. Returns:
  165. :obj:`List[Tuple[str, Offsets]]`:
  166. A list of tuple with the pre-tokenized parts and their offsets
  167. """
  168. pass
  169. class Digits(PreTokenizer):
  170. """
  171. This pre-tokenizer simply splits using the digits in separate tokens
  172. Args:
  173. individual_digits (:obj:`bool`, `optional`, defaults to :obj:`False`):
  174. If set to True, digits will each be separated as follows::
  175. "Call 123 please" -> "Call ", "1", "2", "3", " please"
  176. If set to False, digits will grouped as follows::
  177. "Call 123 please" -> "Call ", "123", " please"
  178. """
  179. def __init__(self, individual_digits=False):
  180. pass
  181. def pre_tokenize(self, pretok):
  182. """
  183. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  184. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  185. keep track of the pre-tokenization, and leverage the capabilities of the
  186. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  187. the pre-tokenization of a raw string, you can use
  188. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  189. Args:
  190. pretok (:class:`~tokenizers.PreTokenizedString):
  191. The pre-tokenized string on which to apply this
  192. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  193. """
  194. pass
  195. def pre_tokenize_str(self, sequence):
  196. """
  197. Pre tokenize the given string
  198. This method provides a way to visualize the effect of a
  199. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  200. alignment, nor does it provide all the capabilities of the
  201. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  202. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  203. Args:
  204. sequence (:obj:`str`):
  205. A string to pre-tokeize
  206. Returns:
  207. :obj:`List[Tuple[str, Offsets]]`:
  208. A list of tuple with the pre-tokenized parts and their offsets
  209. """
  210. pass
  211. class FixedLength(PreTokenizer):
  212. """
  213. This pre-tokenizer splits the text into fixed length chunks as used
  214. [here](https://www.biorxiv.org/content/10.1101/2023.01.11.523679v1.full)
  215. Args:
  216. length (:obj:`int`, `optional`, defaults to :obj:`5`):
  217. The length of the chunks to split the text into.
  218. Strings are split on the character level rather than the byte level to avoid
  219. splitting unicode characters consisting of multiple bytes.
  220. """
  221. def __init__(self, length=5):
  222. pass
  223. def pre_tokenize(self, pretok):
  224. """
  225. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  226. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  227. keep track of the pre-tokenization, and leverage the capabilities of the
  228. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  229. the pre-tokenization of a raw string, you can use
  230. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  231. Args:
  232. pretok (:class:`~tokenizers.PreTokenizedString):
  233. The pre-tokenized string on which to apply this
  234. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  235. """
  236. pass
  237. def pre_tokenize_str(self, sequence):
  238. """
  239. Pre tokenize the given string
  240. This method provides a way to visualize the effect of a
  241. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  242. alignment, nor does it provide all the capabilities of the
  243. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  244. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  245. Args:
  246. sequence (:obj:`str`):
  247. A string to pre-tokeize
  248. Returns:
  249. :obj:`List[Tuple[str, Offsets]]`:
  250. A list of tuple with the pre-tokenized parts and their offsets
  251. """
  252. pass
  253. class Metaspace(PreTokenizer):
  254. """
  255. Metaspace pre-tokenizer
  256. This pre-tokenizer replaces any whitespace by the provided replacement character.
  257. It then tries to split on these spaces.
  258. Args:
  259. replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
  260. The replacement character. Must be exactly one character. By default we
  261. use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
  262. prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
  263. Whether to add a space to the first word if there isn't already one. This
  264. lets us treat `hello` exactly like `say hello`.
  265. Choices: "always", "never", "first". First means the space is only added on the first
  266. token (relevant when special tokens are used or other pre_tokenizer are used).
  267. """
  268. def __init__(self, replacement="_", prepend_scheme="always", split=True):
  269. pass
  270. def pre_tokenize(self, pretok):
  271. """
  272. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  273. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  274. keep track of the pre-tokenization, and leverage the capabilities of the
  275. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  276. the pre-tokenization of a raw string, you can use
  277. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  278. Args:
  279. pretok (:class:`~tokenizers.PreTokenizedString):
  280. The pre-tokenized string on which to apply this
  281. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  282. """
  283. pass
  284. def pre_tokenize_str(self, sequence):
  285. """
  286. Pre tokenize the given string
  287. This method provides a way to visualize the effect of a
  288. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  289. alignment, nor does it provide all the capabilities of the
  290. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  291. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  292. Args:
  293. sequence (:obj:`str`):
  294. A string to pre-tokeize
  295. Returns:
  296. :obj:`List[Tuple[str, Offsets]]`:
  297. A list of tuple with the pre-tokenized parts and their offsets
  298. """
  299. pass
  300. class Punctuation(PreTokenizer):
  301. """
  302. This pre-tokenizer simply splits on punctuation as individual characters.
  303. Args:
  304. behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
  305. The behavior to use when splitting.
  306. Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
  307. "contiguous"
  308. """
  309. def __init__(self, behavior="isolated"):
  310. pass
  311. def pre_tokenize(self, pretok):
  312. """
  313. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  314. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  315. keep track of the pre-tokenization, and leverage the capabilities of the
  316. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  317. the pre-tokenization of a raw string, you can use
  318. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  319. Args:
  320. pretok (:class:`~tokenizers.PreTokenizedString):
  321. The pre-tokenized string on which to apply this
  322. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  323. """
  324. pass
  325. def pre_tokenize_str(self, sequence):
  326. """
  327. Pre tokenize the given string
  328. This method provides a way to visualize the effect of a
  329. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  330. alignment, nor does it provide all the capabilities of the
  331. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  332. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  333. Args:
  334. sequence (:obj:`str`):
  335. A string to pre-tokeize
  336. Returns:
  337. :obj:`List[Tuple[str, Offsets]]`:
  338. A list of tuple with the pre-tokenized parts and their offsets
  339. """
  340. pass
  341. class Sequence(PreTokenizer):
  342. """
  343. This pre-tokenizer composes other pre_tokenizers and applies them in sequence
  344. """
  345. def __init__(self, pretokenizers):
  346. pass
  347. def pre_tokenize(self, pretok):
  348. """
  349. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  350. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  351. keep track of the pre-tokenization, and leverage the capabilities of the
  352. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  353. the pre-tokenization of a raw string, you can use
  354. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  355. Args:
  356. pretok (:class:`~tokenizers.PreTokenizedString):
  357. The pre-tokenized string on which to apply this
  358. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  359. """
  360. pass
  361. def pre_tokenize_str(self, sequence):
  362. """
  363. Pre tokenize the given string
  364. This method provides a way to visualize the effect of a
  365. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  366. alignment, nor does it provide all the capabilities of the
  367. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  368. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  369. Args:
  370. sequence (:obj:`str`):
  371. A string to pre-tokeize
  372. Returns:
  373. :obj:`List[Tuple[str, Offsets]]`:
  374. A list of tuple with the pre-tokenized parts and their offsets
  375. """
  376. pass
  377. class Split(PreTokenizer):
  378. """
  379. Split PreTokenizer
  380. This versatile pre-tokenizer splits using the provided pattern and
  381. according to the provided behavior. The pattern can be inverted by
  382. making use of the invert flag.
  383. Args:
  384. pattern (:obj:`str` or :class:`~tokenizers.Regex`):
  385. A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
  386. If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`,
  387. otherwise we consider is as a string pattern. For example `pattern="|"`
  388. means you want to split on `|` (imagine a csv file for example), while
  389. `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
  390. behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
  391. The behavior to use when splitting.
  392. Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
  393. "contiguous"
  394. invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
  395. Whether to invert the pattern.
  396. """
  397. def __init__(self, pattern, behavior, invert=False):
  398. pass
  399. def pre_tokenize(self, pretok):
  400. """
  401. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  402. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  403. keep track of the pre-tokenization, and leverage the capabilities of the
  404. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  405. the pre-tokenization of a raw string, you can use
  406. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  407. Args:
  408. pretok (:class:`~tokenizers.PreTokenizedString):
  409. The pre-tokenized string on which to apply this
  410. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  411. """
  412. pass
  413. def pre_tokenize_str(self, sequence):
  414. """
  415. Pre tokenize the given string
  416. This method provides a way to visualize the effect of a
  417. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  418. alignment, nor does it provide all the capabilities of the
  419. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  420. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  421. Args:
  422. sequence (:obj:`str`):
  423. A string to pre-tokeize
  424. Returns:
  425. :obj:`List[Tuple[str, Offsets]]`:
  426. A list of tuple with the pre-tokenized parts and their offsets
  427. """
  428. pass
  429. class UnicodeScripts(PreTokenizer):
  430. """
  431. This pre-tokenizer splits on characters that belong to different language family
  432. It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
  433. Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
  434. This mimicks SentencePiece Unigram implementation.
  435. """
  436. def __init__(self):
  437. pass
  438. def pre_tokenize(self, pretok):
  439. """
  440. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  441. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  442. keep track of the pre-tokenization, and leverage the capabilities of the
  443. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  444. the pre-tokenization of a raw string, you can use
  445. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  446. Args:
  447. pretok (:class:`~tokenizers.PreTokenizedString):
  448. The pre-tokenized string on which to apply this
  449. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  450. """
  451. pass
  452. def pre_tokenize_str(self, sequence):
  453. """
  454. Pre tokenize the given string
  455. This method provides a way to visualize the effect of a
  456. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  457. alignment, nor does it provide all the capabilities of the
  458. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  459. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  460. Args:
  461. sequence (:obj:`str`):
  462. A string to pre-tokeize
  463. Returns:
  464. :obj:`List[Tuple[str, Offsets]]`:
  465. A list of tuple with the pre-tokenized parts and their offsets
  466. """
  467. pass
  468. class Whitespace(PreTokenizer):
  469. """
  470. This pre-tokenizer splits on word boundaries according to the `\w+|[^\w\s]+`
  471. regex pattern. It splits on word characters or characters that aren't words or
  472. whitespaces (punctuation such as hyphens, apostrophes, commas, etc.).
  473. Example:
  474. Use the `Whitespace` function as shown below::
  475. ```python
  476. from tokenizers.pre_tokenizers import Whitespace
  477. pre_tokenizer = Whitespace()
  478. text = "Hello, world! Let's try the Whitespace pre-tokenizer."
  479. pre_tokenizer.pre_tokenize_str(text)
  480. [('Hello', (0, 5)),
  481. (',', (5, 6)),
  482. ('world', (7, 12)),
  483. ('!', (12, 13)),
  484. ('Let', (14, 17)),
  485. ("'", (17, 18)),
  486. ('s', (18, 19)),
  487. ('try', (20, 23)),
  488. ('the', (24, 27)),
  489. ('Whitespace', (28, 38)),
  490. ('pre', (39, 42)),
  491. ('-', (42, 43)),
  492. ('tokenizer', (43, 52)),
  493. ('.', (52, 53))]
  494. ```
  495. """
  496. def __init__(self):
  497. pass
  498. def pre_tokenize(self, pretok):
  499. """
  500. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  501. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  502. keep track of the pre-tokenization, and leverage the capabilities of the
  503. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  504. the pre-tokenization of a raw string, you can use
  505. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  506. Args:
  507. pretok (:class:`~tokenizers.PreTokenizedString):
  508. The pre-tokenized string on which to apply this
  509. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  510. """
  511. pass
  512. def pre_tokenize_str(self, sequence):
  513. """
  514. Pre tokenize the given string
  515. This method provides a way to visualize the effect of a
  516. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  517. alignment, nor does it provide all the capabilities of the
  518. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  519. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  520. Args:
  521. sequence (:obj:`str`):
  522. A string to pre-tokeize
  523. Returns:
  524. :obj:`List[Tuple[str, Offsets]]`:
  525. A list of tuple with the pre-tokenized parts and their offsets
  526. """
  527. pass
  528. class WhitespaceSplit(PreTokenizer):
  529. """
  530. This pre-tokenizer simply splits on the whitespace. Works like `.split()`
  531. """
  532. def __init__(self):
  533. pass
  534. def pre_tokenize(self, pretok):
  535. """
  536. Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
  537. This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
  538. keep track of the pre-tokenization, and leverage the capabilities of the
  539. :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
  540. the pre-tokenization of a raw string, you can use
  541. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
  542. Args:
  543. pretok (:class:`~tokenizers.PreTokenizedString):
  544. The pre-tokenized string on which to apply this
  545. :class:`~tokenizers.pre_tokenizers.PreTokenizer`
  546. """
  547. pass
  548. def pre_tokenize_str(self, sequence):
  549. """
  550. Pre tokenize the given string
  551. This method provides a way to visualize the effect of a
  552. :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
  553. alignment, nor does it provide all the capabilities of the
  554. :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
  555. :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
  556. Args:
  557. sequence (:obj:`str`):
  558. A string to pre-tokeize
  559. Returns:
  560. :obj:`List[Tuple[str, Offsets]]`:
  561. A list of tuple with the pre-tokenized parts and their offsets
  562. """
  563. pass