wmt16.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """
  15. ACL2016 Multimodal Machine Translation. Please see this website for more
  16. details: http://www.statmt.org/wmt16/multimodal-task.html#task1
  17. If you use the dataset created for your task, please cite the following paper:
  18. Multi30K: Multilingual English-German Image Descriptions.
  19. @article{elliott-EtAl:2016:VL16,
  20. author = {{Elliott}, D. and {Frank}, S. and {Sima'an}, K. and {Specia}, L.},
  21. title = {Multi30K: Multilingual English-German Image Descriptions},
  22. booktitle = {Proceedings of the 6th Workshop on Vision and Language},
  23. year = {2016},
  24. pages = {70--74},
  25. year = 2016
  26. }
  27. """
  28. import os
  29. import tarfile
  30. from collections import defaultdict
  31. import paddle
  32. from paddle.utils import deprecated
  33. __all__ = []
  34. DATA_URL = "http://paddlemodels.bj.bcebos.com/wmt/wmt16.tar.gz"
  35. DATA_MD5 = "0c38be43600334966403524a40dcd81e"
  36. TOTAL_EN_WORDS = 11250
  37. TOTAL_DE_WORDS = 19220
  38. START_MARK = "<s>"
  39. END_MARK = "<e>"
  40. UNK_MARK = "<unk>"
  41. def __build_dict(tar_file, dict_size, save_path, lang):
  42. word_dict = defaultdict(int)
  43. with tarfile.open(tar_file, mode="r") as f:
  44. for line in f.extractfile("wmt16/train"):
  45. line = line.decode()
  46. line_split = line.strip().split("\t")
  47. if len(line_split) != 2:
  48. continue
  49. sen = line_split[0] if lang == "en" else line_split[1]
  50. for w in sen.split():
  51. word_dict[w] += 1
  52. with open(save_path, "wb") as fout:
  53. fout.write((f"{START_MARK}\n{END_MARK}\n{UNK_MARK}\n").encode())
  54. for idx, word in enumerate(
  55. sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
  56. ):
  57. if idx + 3 == dict_size:
  58. break
  59. fout.write(word[0].encode())
  60. fout.write(b'\n')
  61. def __load_dict(tar_file, dict_size, lang, reverse=False):
  62. dict_path = os.path.join(
  63. paddle.dataset.common.DATA_HOME, "wmt16/%s_%d.dict" % (lang, dict_size)
  64. )
  65. if not os.path.exists(dict_path) or (
  66. len(open(dict_path, "rb").readlines()) != dict_size
  67. ):
  68. __build_dict(tar_file, dict_size, dict_path, lang)
  69. word_dict = {}
  70. with open(dict_path, "rb") as fdict:
  71. for idx, line in enumerate(fdict):
  72. if reverse:
  73. word_dict[idx] = line.strip().decode()
  74. else:
  75. word_dict[line.strip().decode()] = idx
  76. return word_dict
  77. def __get_dict_size(src_dict_size, trg_dict_size, src_lang):
  78. src_dict_size = min(
  79. src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else TOTAL_DE_WORDS)
  80. )
  81. trg_dict_size = min(
  82. trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else TOTAL_EN_WORDS)
  83. )
  84. return src_dict_size, trg_dict_size
  85. def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
  86. def reader():
  87. src_dict = __load_dict(tar_file, src_dict_size, src_lang)
  88. trg_dict = __load_dict(
  89. tar_file, trg_dict_size, ("de" if src_lang == "en" else "en")
  90. )
  91. # the index for start mark, end mark, and unk are the same in source
  92. # language and target language. Here uses the source language
  93. # dictionary to determine their indices.
  94. start_id = src_dict[START_MARK]
  95. end_id = src_dict[END_MARK]
  96. unk_id = src_dict[UNK_MARK]
  97. src_col = 0 if src_lang == "en" else 1
  98. trg_col = 1 - src_col
  99. with tarfile.open(tar_file, mode="r") as f:
  100. for line in f.extractfile(file_name):
  101. line = line.decode()
  102. line_split = line.strip().split("\t")
  103. if len(line_split) != 2:
  104. continue
  105. src_words = line_split[src_col].split()
  106. src_ids = (
  107. [start_id]
  108. + [src_dict.get(w, unk_id) for w in src_words]
  109. + [end_id]
  110. )
  111. trg_words = line_split[trg_col].split()
  112. trg_ids = [trg_dict.get(w, unk_id) for w in trg_words]
  113. trg_ids_next = trg_ids + [end_id]
  114. trg_ids = [start_id] + trg_ids
  115. yield src_ids, trg_ids, trg_ids_next
  116. return reader
  117. @deprecated(
  118. since="2.0.0",
  119. update_to="paddle.text.datasets.WMT16",
  120. level=1,
  121. reason="Please use new dataset API which supports paddle.io.DataLoader",
  122. )
  123. def train(src_dict_size, trg_dict_size, src_lang="en"):
  124. """
  125. WMT16 train set reader.
  126. This function returns the reader for train data. Each sample the reader
  127. returns is made up of three fields: the source language word index sequence,
  128. target language word index sequence and next word index sequence.
  129. NOTE:
  130. The original like for training data is:
  131. http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz
  132. paddle.dataset.wmt16 provides a tokenized version of the original dataset by
  133. using moses's tokenization script:
  134. https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
  135. Args:
  136. src_dict_size(int): Size of the source language dictionary. Three
  137. special tokens will be added into the dictionary:
  138. <s> for start mark, <e> for end mark, and <unk> for
  139. unknown word.
  140. trg_dict_size(int): Size of the target language dictionary. Three
  141. special tokens will be added into the dictionary:
  142. <s> for start mark, <e> for end mark, and <unk> for
  143. unknown word.
  144. src_lang(string): A string indicating which language is the source
  145. language. Available options are: "en" for English
  146. and "de" for Germany.
  147. Returns:
  148. callable: The train reader.
  149. """
  150. if src_lang not in ["en", "de"]:
  151. raise ValueError(
  152. "An error language type. Only support: "
  153. "en (for English); de(for Germany)."
  154. )
  155. src_dict_size, trg_dict_size = __get_dict_size(
  156. src_dict_size, trg_dict_size, src_lang
  157. )
  158. return reader_creator(
  159. tar_file=paddle.dataset.common.download(
  160. DATA_URL, "wmt16", DATA_MD5, "wmt16.tar.gz"
  161. ),
  162. file_name="wmt16/train",
  163. src_dict_size=src_dict_size,
  164. trg_dict_size=trg_dict_size,
  165. src_lang=src_lang,
  166. )
  167. @deprecated(
  168. since="2.0.0",
  169. update_to="paddle.text.datasets.WMT16",
  170. level=1,
  171. reason="Please use new dataset API which supports paddle.io.DataLoader",
  172. )
  173. def test(src_dict_size, trg_dict_size, src_lang="en"):
  174. """
  175. WMT16 test set reader.
  176. This function returns the reader for test data. Each sample the reader
  177. returns is made up of three fields: the source language word index sequence,
  178. target language word index sequence and next word index sequence.
  179. NOTE:
  180. The original like for test data is:
  181. http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz
  182. paddle.dataset.wmt16 provides a tokenized version of the original dataset by
  183. using moses's tokenization script:
  184. https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
  185. Args:
  186. src_dict_size(int): Size of the source language dictionary. Three
  187. special tokens will be added into the dictionary:
  188. <s> for start mark, <e> for end mark, and <unk> for
  189. unknown word.
  190. trg_dict_size(int): Size of the target language dictionary. Three
  191. special tokens will be added into the dictionary:
  192. <s> for start mark, <e> for end mark, and <unk> for
  193. unknown word.
  194. src_lang(string): A string indicating which language is the source
  195. language. Available options are: "en" for English
  196. and "de" for Germany.
  197. Returns:
  198. callable: The test reader.
  199. """
  200. if src_lang not in ["en", "de"]:
  201. raise ValueError(
  202. "An error language type. "
  203. "Only support: en (for English); de(for Germany)."
  204. )
  205. src_dict_size, trg_dict_size = __get_dict_size(
  206. src_dict_size, trg_dict_size, src_lang
  207. )
  208. return reader_creator(
  209. tar_file=paddle.dataset.common.download(
  210. DATA_URL, "wmt16", DATA_MD5, "wmt16.tar.gz"
  211. ),
  212. file_name="wmt16/test",
  213. src_dict_size=src_dict_size,
  214. trg_dict_size=trg_dict_size,
  215. src_lang=src_lang,
  216. )
  217. @deprecated(
  218. since="2.0.0",
  219. update_to="paddle.text.datasets.WMT16",
  220. level=1,
  221. reason="Please use new dataset API which supports paddle.io.DataLoader",
  222. )
  223. def validation(src_dict_size, trg_dict_size, src_lang="en"):
  224. """
  225. WMT16 validation set reader.
  226. This function returns the reader for validation data. Each sample the reader
  227. returns is made up of three fields: the source language word index sequence,
  228. target language word index sequence and next word index sequence.
  229. NOTE:
  230. The original like for validation data is:
  231. http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz
  232. paddle.dataset.wmt16 provides a tokenized version of the original dataset by
  233. using moses's tokenization script:
  234. https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
  235. Args:
  236. src_dict_size(int): Size of the source language dictionary. Three
  237. special tokens will be added into the dictionary:
  238. <s> for start mark, <e> for end mark, and <unk> for
  239. unknown word.
  240. trg_dict_size(int): Size of the target language dictionary. Three
  241. special tokens will be added into the dictionary:
  242. <s> for start mark, <e> for end mark, and <unk> for
  243. unknown word.
  244. src_lang(string): A string indicating which language is the source
  245. language. Available options are: "en" for English
  246. and "de" for Germany.
  247. Returns:
  248. callable: The validation reader.
  249. """
  250. if src_lang not in ["en", "de"]:
  251. raise ValueError(
  252. "An error language type. "
  253. "Only support: en (for English); de(for Germany)."
  254. )
  255. src_dict_size, trg_dict_size = __get_dict_size(
  256. src_dict_size, trg_dict_size, src_lang
  257. )
  258. return reader_creator(
  259. tar_file=paddle.dataset.common.download(
  260. DATA_URL, "wmt16", DATA_MD5, "wmt16.tar.gz"
  261. ),
  262. file_name="wmt16/val",
  263. src_dict_size=src_dict_size,
  264. trg_dict_size=trg_dict_size,
  265. src_lang=src_lang,
  266. )
  267. @deprecated(
  268. since="2.0.0",
  269. update_to="paddle.text.datasets.WMT16",
  270. level=1,
  271. reason="Please use new dataset API which supports paddle.io.DataLoader",
  272. )
  273. def get_dict(lang, dict_size, reverse=False):
  274. """
  275. return the word dictionary for the specified language.
  276. Args:
  277. lang(string): A string indicating which language is the source
  278. language. Available options are: "en" for English
  279. and "de" for Germany.
  280. dict_size(int): Size of the specified language dictionary.
  281. reverse(bool): If reverse is set to False, the returned python
  282. dictionary will use word as key and use index as value.
  283. If reverse is set to True, the returned python
  284. dictionary will use index as key and word as value.
  285. Returns:
  286. dict: The word dictionary for the specific language.
  287. """
  288. if lang == "en":
  289. dict_size = min(dict_size, TOTAL_EN_WORDS)
  290. else:
  291. dict_size = min(dict_size, TOTAL_DE_WORDS)
  292. dict_path = os.path.join(
  293. paddle.dataset.common.DATA_HOME, "wmt16/%s_%d.dict" % (lang, dict_size)
  294. )
  295. assert os.path.exists(dict_path), "Word dictionary does not exist. "
  296. "Please invoke paddle.dataset.wmt16.train/test/validation first "
  297. "to build the dictionary."
  298. tar_file = os.path.join(paddle.dataset.common.DATA_HOME, "wmt16.tar.gz")
  299. return __load_dict(tar_file, dict_size, lang, reverse)
  300. @deprecated(
  301. since="2.0.0",
  302. update_to="paddle.text.datasets.WMT16",
  303. level=1,
  304. reason="Please use new dataset API which supports paddle.io.DataLoader",
  305. )
  306. def fetch():
  307. """download the entire dataset."""
  308. paddle.v4.dataset.common.download(
  309. DATA_URL, "wmt16", DATA_MD5, "wmt16.tar.gz"
  310. )