# __init__.pyi
# Generated content DO NOT EDIT
  2. class PostProcessor:
  3. """
  4. Base class for all post-processors
  5. This class is not supposed to be instantiated directly. Instead, any implementation of
  6. a PostProcessor will return an instance of this class when instantiated.
  7. """
  8. def __getstate__(self):
  9. """ """
  10. pass
  11. def __setstate__(self, state):
  12. """ """
  13. pass
  14. def num_special_tokens_to_add(self, is_pair):
  15. """
  16. Return the number of special tokens that would be added for single/pair sentences.
  17. Args:
  18. is_pair (:obj:`bool`):
  19. Whether the input would be a pair of sequences
  20. Returns:
  21. :obj:`int`: The number of tokens to add
  22. """
  23. pass
  24. def process(self, encoding, pair=None, add_special_tokens=True):
  25. """
  26. Post-process the given encodings, generating the final one
  27. Args:
  28. encoding (:class:`~tokenizers.Encoding`):
  29. The encoding for the first sequence
  30. pair (:class:`~tokenizers.Encoding`, `optional`):
  31. The encoding for the pair sequence
  32. add_special_tokens (:obj:`bool`):
  33. Whether to add the special tokens
  34. Return:
  35. :class:`~tokenizers.Encoding`: The final encoding
  36. """
  37. pass
  38. class BertProcessing(PostProcessor):
  39. """
  40. This post-processor takes care of adding the special tokens needed by
  41. a Bert model:
  42. - a SEP token
  43. - a CLS token
  44. Args:
  45. sep (:obj:`Tuple[str, int]`):
  46. A tuple with the string representation of the SEP token, and its id
  47. cls (:obj:`Tuple[str, int]`):
  48. A tuple with the string representation of the CLS token, and its id
  49. """
  50. def __init__(self, sep, cls):
  51. pass
  52. def __getnewargs__(self):
  53. """ """
  54. pass
  55. def __getstate__(self):
  56. """ """
  57. pass
  58. def __setstate__(self, state):
  59. """ """
  60. pass
  61. @property
  62. def cls(self):
  63. """ """
  64. pass
  65. @cls.setter
  66. def cls(self, value):
  67. """ """
  68. pass
  69. def num_special_tokens_to_add(self, is_pair):
  70. """
  71. Return the number of special tokens that would be added for single/pair sentences.
  72. Args:
  73. is_pair (:obj:`bool`):
  74. Whether the input would be a pair of sequences
  75. Returns:
  76. :obj:`int`: The number of tokens to add
  77. """
  78. pass
  79. def process(self, encoding, pair=None, add_special_tokens=True):
  80. """
  81. Post-process the given encodings, generating the final one
  82. Args:
  83. encoding (:class:`~tokenizers.Encoding`):
  84. The encoding for the first sequence
  85. pair (:class:`~tokenizers.Encoding`, `optional`):
  86. The encoding for the pair sequence
  87. add_special_tokens (:obj:`bool`):
  88. Whether to add the special tokens
  89. Return:
  90. :class:`~tokenizers.Encoding`: The final encoding
  91. """
  92. pass
  93. @property
  94. def sep(self):
  95. """ """
  96. pass
  97. @sep.setter
  98. def sep(self, value):
  99. """ """
  100. pass
  101. class ByteLevel(PostProcessor):
  102. """
  103. This post-processor takes care of trimming the offsets.
  104. By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
  105. want the offsets to include these whitespaces, then this PostProcessor must be used.
  106. Args:
  107. trim_offsets (:obj:`bool`):
  108. Whether to trim the whitespaces from the produced offsets.
  109. add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
  110. If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
  111. the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
  112. is set to :obj:`True`.
  113. """
  114. def __init__(self, add_prefix_space=None, trim_offsets=None, use_regex=None):
  115. pass
  116. def __getstate__(self):
  117. """ """
  118. pass
  119. def __setstate__(self, state):
  120. """ """
  121. pass
  122. @property
  123. def add_prefix_space(self):
  124. """ """
  125. pass
  126. @add_prefix_space.setter
  127. def add_prefix_space(self, value):
  128. """ """
  129. pass
  130. def num_special_tokens_to_add(self, is_pair):
  131. """
  132. Return the number of special tokens that would be added for single/pair sentences.
  133. Args:
  134. is_pair (:obj:`bool`):
  135. Whether the input would be a pair of sequences
  136. Returns:
  137. :obj:`int`: The number of tokens to add
  138. """
  139. pass
  140. def process(self, encoding, pair=None, add_special_tokens=True):
  141. """
  142. Post-process the given encodings, generating the final one
  143. Args:
  144. encoding (:class:`~tokenizers.Encoding`):
  145. The encoding for the first sequence
  146. pair (:class:`~tokenizers.Encoding`, `optional`):
  147. The encoding for the pair sequence
  148. add_special_tokens (:obj:`bool`):
  149. Whether to add the special tokens
  150. Return:
  151. :class:`~tokenizers.Encoding`: The final encoding
  152. """
  153. pass
  154. @property
  155. def trim_offsets(self):
  156. """ """
  157. pass
  158. @trim_offsets.setter
  159. def trim_offsets(self, value):
  160. """ """
  161. pass
  162. @property
  163. def use_regex(self):
  164. """ """
  165. pass
  166. @use_regex.setter
  167. def use_regex(self, value):
  168. """ """
  169. pass
  170. class RobertaProcessing(PostProcessor):
  171. """
  172. This post-processor takes care of adding the special tokens needed by
  173. a Roberta model:
  174. - a SEP token
  175. - a CLS token
  176. It also takes care of trimming the offsets.
  177. By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
  178. want the offsets to include these whitespaces, then this PostProcessor should be initialized
  179. with :obj:`trim_offsets=True`
  180. Args:
  181. sep (:obj:`Tuple[str, int]`):
  182. A tuple with the string representation of the SEP token, and its id
  183. cls (:obj:`Tuple[str, int]`):
  184. A tuple with the string representation of the CLS token, and its id
  185. trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
  186. Whether to trim the whitespaces from the produced offsets.
  187. add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
  188. Whether the add_prefix_space option was enabled during pre-tokenization. This
  189. is relevant because it defines the way the offsets are trimmed out.
  190. """
  191. def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
  192. pass
  193. def __getnewargs__(self):
  194. """ """
  195. pass
  196. def __getstate__(self):
  197. """ """
  198. pass
  199. def __setstate__(self, state):
  200. """ """
  201. pass
  202. @property
  203. def add_prefix_space(self):
  204. """ """
  205. pass
  206. @add_prefix_space.setter
  207. def add_prefix_space(self, value):
  208. """ """
  209. pass
  210. @property
  211. def cls(self):
  212. """ """
  213. pass
  214. @cls.setter
  215. def cls(self, value):
  216. """ """
  217. pass
  218. def num_special_tokens_to_add(self, is_pair):
  219. """
  220. Return the number of special tokens that would be added for single/pair sentences.
  221. Args:
  222. is_pair (:obj:`bool`):
  223. Whether the input would be a pair of sequences
  224. Returns:
  225. :obj:`int`: The number of tokens to add
  226. """
  227. pass
  228. def process(self, encoding, pair=None, add_special_tokens=True):
  229. """
  230. Post-process the given encodings, generating the final one
  231. Args:
  232. encoding (:class:`~tokenizers.Encoding`):
  233. The encoding for the first sequence
  234. pair (:class:`~tokenizers.Encoding`, `optional`):
  235. The encoding for the pair sequence
  236. add_special_tokens (:obj:`bool`):
  237. Whether to add the special tokens
  238. Return:
  239. :class:`~tokenizers.Encoding`: The final encoding
  240. """
  241. pass
  242. @property
  243. def sep(self):
  244. """ """
  245. pass
  246. @sep.setter
  247. def sep(self, value):
  248. """ """
  249. pass
  250. @property
  251. def trim_offsets(self):
  252. """ """
  253. pass
  254. @trim_offsets.setter
  255. def trim_offsets(self, value):
  256. """ """
  257. pass
  258. class Sequence(PostProcessor):
  259. """
  260. Sequence Processor
  261. Args:
  262. processors (:obj:`List[PostProcessor]`)
  263. The processors that need to be chained
  264. """
  265. def __init__(self, processors):
  266. pass
  267. def __getitem__(self, key):
  268. """
  269. Return self[key].
  270. """
  271. pass
  272. def __getnewargs__(self):
  273. """ """
  274. pass
  275. def __getstate__(self):
  276. """ """
  277. pass
  278. def __setitem__(self, key, value):
  279. """
  280. Set self[key] to value.
  281. """
  282. pass
  283. def __setstate__(self, state):
  284. """ """
  285. pass
  286. def num_special_tokens_to_add(self, is_pair):
  287. """
  288. Return the number of special tokens that would be added for single/pair sentences.
  289. Args:
  290. is_pair (:obj:`bool`):
  291. Whether the input would be a pair of sequences
  292. Returns:
  293. :obj:`int`: The number of tokens to add
  294. """
  295. pass
  296. def process(self, encoding, pair=None, add_special_tokens=True):
  297. """
  298. Post-process the given encodings, generating the final one
  299. Args:
  300. encoding (:class:`~tokenizers.Encoding`):
  301. The encoding for the first sequence
  302. pair (:class:`~tokenizers.Encoding`, `optional`):
  303. The encoding for the pair sequence
  304. add_special_tokens (:obj:`bool`):
  305. Whether to add the special tokens
  306. Return:
  307. :class:`~tokenizers.Encoding`: The final encoding
  308. """
  309. pass
  310. class TemplateProcessing(PostProcessor):
  311. """
  312. Provides a way to specify templates in order to add the special tokens to each
  313. input sequence as relevant.
  314. Let's take :obj:`BERT` tokenizer as an example. It uses two special tokens, used to
  315. delimitate each sequence. :obj:`[CLS]` is always used at the beginning of the first
  316. sequence, and :obj:`[SEP]` is added at the end of both the first, and the pair
  317. sequences. The final result looks like this:
  318. - Single sequence: :obj:`[CLS] Hello there [SEP]`
  319. - Pair sequences: :obj:`[CLS] My name is Anthony [SEP] What is my name? [SEP]`
  320. With the type ids as following::
  321. [CLS] ... [SEP] ... [SEP]
  322. 0 0 0 1 1
  323. You can achieve such behavior using a TemplateProcessing::
  324. TemplateProcessing(
  325. single="[CLS] $0 [SEP]",
  326. pair="[CLS] $A [SEP] $B:1 [SEP]:1",
  327. special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
  328. )
  329. In this example, each input sequence is identified using a ``$`` construct. This identifier
  330. lets us specify each input sequence, and the type_id to use. When nothing is specified,
  331. it uses the default values. Here are the different ways to specify it:
  332. - Specifying the sequence, with default ``type_id == 0``: ``$A`` or ``$B``
  333. - Specifying the `type_id` with default ``sequence == A``: ``$0``, ``$1``, ``$2``, ...
  334. - Specifying both: ``$A:0``, ``$B:1``, ...
  335. The same construct is used for special tokens: ``<identifier>(:<type_id>)?``.
  336. **Warning**: You must ensure that you are giving the correct tokens/ids as these
  337. will be added to the Encoding without any further check. If the given ids correspond
  338. to something totally different in a `Tokenizer` using this `PostProcessor`, it
  339. might lead to unexpected results.
  340. Args:
  341. single (:obj:`Template`):
  342. The template used for single sequences
  343. pair (:obj:`Template`):
  344. The template used when both sequences are specified
  345. special_tokens (:obj:`Tokens`):
  346. The list of special tokens used in each sequences
  347. Types:
  348. Template (:obj:`str` or :obj:`List`):
  349. - If a :obj:`str` is provided, the whitespace is used as delimiter between tokens
  350. - If a :obj:`List[str]` is provided, a list of tokens
  351. Tokens (:obj:`List[Union[Tuple[int, str], Tuple[str, int], dict]]`):
  352. - A :obj:`Tuple` with both a token and its associated ID, in any order
  353. - A :obj:`dict` with the following keys:
  354. - "id": :obj:`str` => The special token id, as specified in the Template
  355. - "ids": :obj:`List[int]` => The associated IDs
  356. - "tokens": :obj:`List[str]` => The associated tokens
  357. The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
  358. the same length.
  359. """
  360. def __init__(self, single=None, pair=None, special_tokens=None):
  361. pass
  362. def __getstate__(self):
  363. """ """
  364. pass
  365. def __setstate__(self, state):
  366. """ """
  367. pass
  368. def num_special_tokens_to_add(self, is_pair):
  369. """
  370. Return the number of special tokens that would be added for single/pair sentences.
  371. Args:
  372. is_pair (:obj:`bool`):
  373. Whether the input would be a pair of sequences
  374. Returns:
  375. :obj:`int`: The number of tokens to add
  376. """
  377. pass
  378. def process(self, encoding, pair=None, add_special_tokens=True):
  379. """
  380. Post-process the given encodings, generating the final one
  381. Args:
  382. encoding (:class:`~tokenizers.Encoding`):
  383. The encoding for the first sequence
  384. pair (:class:`~tokenizers.Encoding`, `optional`):
  385. The encoding for the pair sequence
  386. add_special_tokens (:obj:`bool`):
  387. Whether to add the special tokens
  388. Return:
  389. :class:`~tokenizers.Encoding`: The final encoding
  390. """
  391. pass
  392. @property
  393. def single(self):
  394. """ """
  395. pass
  396. @single.setter
  397. def single(self, value):
  398. """ """
  399. pass