streamers.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. # coding=utf-8
  2. # Copyright 2023 The HuggingFace Inc. team.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. from __future__ import annotations
  16. import asyncio
  17. from queue import Queue
  18. from typing import TYPE_CHECKING
  19. if TYPE_CHECKING:
  20. from ..models.auto import AutoTokenizer
  21. class BaseStreamer:
  22. """
  23. Base class from which `.generate()` streamers should inherit.
  24. """
  25. def put(self, value):
  26. """Function that is called by `.generate()` to push new tokens"""
  27. raise NotImplementedError()
  28. def end(self):
  29. """Function that is called by `.generate()` to signal the end of generation"""
  30. raise NotImplementedError()
  31. class TextStreamer(BaseStreamer):
  32. """
  33. Simple text streamer that prints the token(s) to stdout as soon as entire words are formed.
  34. <Tip warning={true}>
  35. The API for the streamer classes is still under development and may change in the future.
  36. </Tip>
  37. Parameters:
  38. tokenizer (`AutoTokenizer`):
  39. The tokenized used to decode the tokens.
  40. skip_prompt (`bool`, *optional*, defaults to `False`):
  41. Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
  42. decode_kwargs (`dict`, *optional*):
  43. Additional keyword arguments to pass to the tokenizer's `decode` method.
  44. Examples:
  45. ```python
  46. >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
  47. >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
  48. >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
  49. >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
  50. >>> streamer = TextStreamer(tok)
  51. >>> # Despite returning the usual output, the streamer will also print the generated text to stdout.
  52. >>> _ = model.generate(**inputs, streamer=streamer, max_new_tokens=20)
  53. An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,
  54. ```
  55. """
  56. def __init__(self, tokenizer: AutoTokenizer, skip_prompt: bool = False, **decode_kwargs):
  57. self.tokenizer = tokenizer
  58. self.skip_prompt = skip_prompt
  59. self.decode_kwargs = decode_kwargs
  60. # variables used in the streaming process
  61. self.token_cache = []
  62. self.print_len = 0
  63. self.next_tokens_are_prompt = True
  64. def put(self, value):
  65. """
  66. Receives tokens, decodes them, and prints them to stdout as soon as they form entire words.
  67. """
  68. if len(value.shape) > 1 and value.shape[0] > 1:
  69. raise ValueError("TextStreamer only supports batch size 1")
  70. elif len(value.shape) > 1:
  71. value = value[0]
  72. if self.skip_prompt and self.next_tokens_are_prompt:
  73. self.next_tokens_are_prompt = False
  74. return
  75. # Add the new token to the cache and decodes the entire thing.
  76. self.token_cache.extend(value.tolist())
  77. text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
  78. # After the symbol for a new line, we flush the cache.
  79. if text.endswith("\n"):
  80. printable_text = text[self.print_len :]
  81. self.token_cache = []
  82. self.print_len = 0
  83. # If the last token is a CJK character, we print the characters.
  84. elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
  85. printable_text = text[self.print_len :]
  86. self.print_len += len(printable_text)
  87. # Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words,
  88. # which may change with the subsequent token -- there are probably smarter ways to do this!)
  89. else:
  90. printable_text = text[self.print_len : text.rfind(" ") + 1]
  91. self.print_len += len(printable_text)
  92. self.on_finalized_text(printable_text)
  93. def end(self):
  94. """Flushes any remaining cache and prints a newline to stdout."""
  95. # Flush the cache, if it exists
  96. if len(self.token_cache) > 0:
  97. text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
  98. printable_text = text[self.print_len :]
  99. self.token_cache = []
  100. self.print_len = 0
  101. else:
  102. printable_text = ""
  103. self.next_tokens_are_prompt = True
  104. self.on_finalized_text(printable_text, stream_end=True)
  105. def on_finalized_text(self, text: str, stream_end: bool = False):
  106. """Prints the new text to stdout. If the stream is ending, also prints a newline."""
  107. print(text, flush=True, end="" if not stream_end else None)
  108. def _is_chinese_char(self, cp):
  109. """Checks whether CP is the codepoint of a CJK character."""
  110. # This defines a "chinese character" as anything in the CJK Unicode block:
  111. # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
  112. #
  113. # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
  114. # despite its name. The modern Korean Hangul alphabet is a different block,
  115. # as is Japanese Hiragana and Katakana. Those alphabets are used to write
  116. # space-separated words, so they are not treated specially and handled
  117. # like the all of the other languages.
  118. if (
  119. (cp >= 0x4E00 and cp <= 0x9FFF)
  120. or (cp >= 0x3400 and cp <= 0x4DBF)
  121. or (cp >= 0x20000 and cp <= 0x2A6DF)
  122. or (cp >= 0x2A700 and cp <= 0x2B73F)
  123. or (cp >= 0x2B740 and cp <= 0x2B81F)
  124. or (cp >= 0x2B820 and cp <= 0x2CEAF)
  125. or (cp >= 0xF900 and cp <= 0xFAFF)
  126. or (cp >= 0x2F800 and cp <= 0x2FA1F)
  127. ):
  128. return True
  129. return False
  130. class TextIteratorStreamer(TextStreamer):
  131. """
  132. Streamer that stores print-ready text in a queue, to be used by a downstream application as an iterator. This is
  133. useful for applications that benefit from accessing the generated text in a non-blocking way (e.g. in an interactive
  134. Gradio demo).
  135. <Tip warning={true}>
  136. The API for the streamer classes is still under development and may change in the future.
  137. </Tip>
  138. Parameters:
  139. tokenizer (`AutoTokenizer`):
  140. The tokenized used to decode the tokens.
  141. skip_prompt (`bool`, *optional*, defaults to `False`):
  142. Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
  143. timeout (`float`, *optional*):
  144. The timeout for the text queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
  145. in `.generate()`, when it is called in a separate thread.
  146. decode_kwargs (`dict`, *optional*):
  147. Additional keyword arguments to pass to the tokenizer's `decode` method.
  148. Examples:
  149. ```python
  150. >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
  151. >>> from threading import Thread
  152. >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
  153. >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
  154. >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
  155. >>> streamer = TextIteratorStreamer(tok)
  156. >>> # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
  157. >>> generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=20)
  158. >>> thread = Thread(target=model.generate, kwargs=generation_kwargs)
  159. >>> thread.start()
  160. >>> generated_text = ""
  161. >>> for new_text in streamer:
  162. ... generated_text += new_text
  163. >>> generated_text
  164. 'An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,'
  165. ```
  166. """
  167. def __init__(
  168. self, tokenizer: AutoTokenizer, skip_prompt: bool = False, timeout: float | None = None, **decode_kwargs
  169. ):
  170. super().__init__(tokenizer, skip_prompt, **decode_kwargs)
  171. self.text_queue = Queue()
  172. self.stop_signal = None
  173. self.timeout = timeout
  174. def on_finalized_text(self, text: str, stream_end: bool = False):
  175. """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue."""
  176. self.text_queue.put(text, timeout=self.timeout)
  177. if stream_end:
  178. self.text_queue.put(self.stop_signal, timeout=self.timeout)
  179. def __iter__(self):
  180. return self
  181. def __next__(self):
  182. value = self.text_queue.get(timeout=self.timeout)
  183. if value == self.stop_signal:
  184. raise StopIteration()
  185. else:
  186. return value
  187. class AsyncTextIteratorStreamer(TextStreamer):
  188. """
  189. Streamer that stores print-ready text in a queue, to be used by a downstream application as an async iterator.
  190. This is useful for applications that benefit from accessing the generated text asynchronously (e.g. in an
  191. interactive Gradio demo).
  192. <Tip warning={true}>
  193. The API for the streamer classes is still under development and may change in the future.
  194. </Tip>
  195. Parameters:
  196. tokenizer (`AutoTokenizer`):
  197. The tokenized used to decode the tokens.
  198. skip_prompt (`bool`, *optional*, defaults to `False`):
  199. Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
  200. timeout (`float`, *optional*):
  201. The timeout for the text queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
  202. in `.generate()`, when it is called in a separate thread.
  203. decode_kwargs (`dict`, *optional*):
  204. Additional keyword arguments to pass to the tokenizer's `decode` method.
  205. Raises:
  206. TimeoutError: If token generation time exceeds timeout value.
  207. Examples:
  208. ```python
  209. >>> from transformers import AutoModelForCausalLM, AutoTokenizer, AsyncTextIteratorStreamer
  210. >>> from threading import Thread
  211. >>> import asyncio
  212. >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
  213. >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
  214. >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
  215. >>> # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
  216. >>> async def main():
  217. ... # Important: AsyncTextIteratorStreamer must be initialized inside a coroutine!
  218. ... streamer = AsyncTextIteratorStreamer(tok)
  219. ... generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=20)
  220. ... thread = Thread(target=model.generate, kwargs=generation_kwargs)
  221. ... thread.start()
  222. ... generated_text = ""
  223. ... async for new_text in streamer:
  224. ... generated_text += new_text
  225. >>> print(generated_text)
  226. >>> asyncio.run(main())
  227. An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,
  228. ```
  229. """
  230. def __init__(
  231. self, tokenizer: AutoTokenizer, skip_prompt: bool = False, timeout: float | None = None, **decode_kwargs
  232. ):
  233. super().__init__(tokenizer, skip_prompt, **decode_kwargs)
  234. self.text_queue = asyncio.Queue()
  235. self.stop_signal = None
  236. self.timeout = timeout
  237. self.loop = asyncio.get_running_loop()
  238. self.has_asyncio_timeout = hasattr(asyncio, "timeout")
  239. def on_finalized_text(self, text: str, stream_end: bool = False):
  240. """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue."""
  241. self.loop.call_soon_threadsafe(self.text_queue.put_nowait, text)
  242. if stream_end:
  243. self.loop.call_soon_threadsafe(self.text_queue.put_nowait, self.stop_signal)
  244. def __aiter__(self):
  245. return self
  246. async def __anext__(self):
  247. try:
  248. if self.has_asyncio_timeout:
  249. async with asyncio.timeout(self.timeout):
  250. value = await self.text_queue.get()
  251. else:
  252. value = await asyncio.wait_for(self.text_queue.get(), timeout=self.timeout)
  253. except asyncio.TimeoutError:
  254. raise TimeoutError()
  255. else:
  256. if value == self.stop_signal:
  257. raise StopAsyncIteration()
  258. else:
  259. return value