conll05.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """
Conll05 dataset.

The Paddle semantic role labeling (SRL) book chapter and demo use this dataset
as an example. Because the Conll05 corpus is not freely available, the default
download URL points to the Conll05 test set, which is public. Users can change
the URL and MD5 to point to their own copy of the Conll05 dataset. A word
vector model pre-trained on a Wikipedia corpus is used to initialize the SRL
model.
  21. """
  22. import gzip
  23. import tarfile
  24. import paddle.dataset.common
  25. from paddle.utils import deprecated
# Nothing is exported through ``from ... import *``.
__all__ = []

# Archive holding the gzipped words/props files read by ``test()``, and its MD5.
DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc'
# Word dictionary file (loaded with ``load_dict``) and its MD5.
WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
# Verb (predicate) dictionary file (loaded with ``load_dict``) and its MD5.
VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt'
VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
# Target label file (loaded with ``load_label_dict``) and its MD5.
TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt'
TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
# Pre-trained word embedding file returned by ``get_embedding`` and its MD5.
EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb'
EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'

# ID used for words that are missing from the word dictionary.
UNK_IDX = 0
  38. def load_label_dict(filename):
  39. d = {}
  40. tag_dict = set()
  41. with open(filename, 'r') as f:
  42. for i, line in enumerate(f):
  43. line = line.strip()
  44. if line.startswith("B-"):
  45. tag_dict.add(line[2:])
  46. elif line.startswith("I-"):
  47. tag_dict.add(line[2:])
  48. index = 0
  49. for tag in tag_dict:
  50. d["B-" + tag] = index
  51. index += 1
  52. d["I-" + tag] = index
  53. index += 1
  54. d["O"] = index
  55. return d
  56. def load_dict(filename):
  57. d = {}
  58. with open(filename, 'r') as f:
  59. for i, line in enumerate(f):
  60. d[line.strip()] = i
  61. return d
  62. def corpus_reader(data_path, words_name, props_name):
  63. """
  64. Read one corpus. It returns an iterator. Each element of
  65. this iterator is a tuple including sentence and labels. The sentence is
  66. consist of a list of word IDs. The labels include a list of label IDs.
  67. :return: a iterator of data.
  68. :rtype: iterator
  69. """
  70. def reader():
  71. tf = tarfile.open(data_path)
  72. wf = tf.extractfile(words_name)
  73. pf = tf.extractfile(props_name)
  74. with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
  75. fileobj=pf
  76. ) as props_file:
  77. sentences = []
  78. labels = []
  79. one_seg = []
  80. for word, label in zip(words_file, props_file):
  81. word = word.strip().decode()
  82. label = label.strip().decode().split()
  83. if len(label) == 0: # end of sentence
  84. for i in range(len(one_seg[0])):
  85. a_kind_label = [x[i] for x in one_seg]
  86. labels.append(a_kind_label)
  87. if len(labels) >= 1:
  88. verb_list = []
  89. for x in labels[0]:
  90. if x != '-':
  91. verb_list.append(x)
  92. for i, lbl in enumerate(labels[1:]):
  93. cur_tag = 'O'
  94. is_in_bracket = False
  95. lbl_seq = []
  96. verb_word = ''
  97. for l in lbl:
  98. if l == '*' and not is_in_bracket:
  99. lbl_seq.append('O')
  100. elif l == '*' and is_in_bracket:
  101. lbl_seq.append('I-' + cur_tag)
  102. elif l == '*)':
  103. lbl_seq.append('I-' + cur_tag)
  104. is_in_bracket = False
  105. elif l.find('(') != -1 and l.find(')') != -1:
  106. cur_tag = l[1 : l.find('*')]
  107. lbl_seq.append('B-' + cur_tag)
  108. is_in_bracket = False
  109. elif l.find('(') != -1 and l.find(')') == -1:
  110. cur_tag = l[1 : l.find('*')]
  111. lbl_seq.append('B-' + cur_tag)
  112. is_in_bracket = True
  113. else:
  114. raise RuntimeError(
  115. 'Unexpected label: %s' % l
  116. )
  117. yield sentences, verb_list[i], lbl_seq
  118. sentences = []
  119. labels = []
  120. one_seg = []
  121. else:
  122. sentences.append(word)
  123. one_seg.append(label)
  124. pf.close()
  125. wf.close()
  126. tf.close()
  127. return reader
  128. def reader_creator(
  129. corpus_reader, word_dict=None, predicate_dict=None, label_dict=None
  130. ):
  131. def reader():
  132. for sentence, predicate, labels in corpus_reader():
  133. sen_len = len(sentence)
  134. verb_index = labels.index('B-V')
  135. mark = [0] * len(labels)
  136. if verb_index > 0:
  137. mark[verb_index - 1] = 1
  138. ctx_n1 = sentence[verb_index - 1]
  139. else:
  140. ctx_n1 = 'bos'
  141. if verb_index > 1:
  142. mark[verb_index - 2] = 1
  143. ctx_n2 = sentence[verb_index - 2]
  144. else:
  145. ctx_n2 = 'bos'
  146. mark[verb_index] = 1
  147. ctx_0 = sentence[verb_index]
  148. if verb_index < len(labels) - 1:
  149. mark[verb_index + 1] = 1
  150. ctx_p1 = sentence[verb_index + 1]
  151. else:
  152. ctx_p1 = 'eos'
  153. if verb_index < len(labels) - 2:
  154. mark[verb_index + 2] = 1
  155. ctx_p2 = sentence[verb_index + 2]
  156. else:
  157. ctx_p2 = 'eos'
  158. word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
  159. ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
  160. ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
  161. ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
  162. ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
  163. ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len
  164. pred_idx = [predicate_dict.get(predicate)] * sen_len
  165. label_idx = [label_dict.get(w) for w in labels]
  166. yield word_idx, ctx_n2_idx, ctx_n1_idx, ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx
  167. return reader
  168. @deprecated(
  169. since="2.0.0",
  170. update_to="paddle.text.datasets.Conll05st",
  171. level=1,
  172. reason="Please use new dataset API which supports paddle.io.DataLoader",
  173. )
  174. def get_dict():
  175. """
  176. Get the word, verb and label dictionary of Wikipedia corpus.
  177. """
  178. word_dict = load_dict(
  179. paddle.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
  180. )
  181. verb_dict = load_dict(
  182. paddle.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
  183. )
  184. label_dict = load_label_dict(
  185. paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
  186. )
  187. return word_dict, verb_dict, label_dict
  188. @deprecated(
  189. since="2.0.0",
  190. update_to="paddle.text.datasets.Conll05st",
  191. level=1,
  192. reason="Please use new dataset API which supports paddle.io.DataLoader",
  193. )
  194. def get_embedding():
  195. """
  196. Get the trained word vector based on Wikipedia corpus.
  197. """
  198. return paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
  199. @deprecated(
  200. since="2.0.0",
  201. update_to="paddle.text.datasets.Conll05st",
  202. level=1,
  203. reason="Please use new dataset API which supports paddle.io.DataLoader",
  204. )
  205. def test():
  206. """
  207. Conll05 test set creator.
  208. Because the training dataset is not free, the test dataset is used for
  209. training. It returns a reader creator, each sample in the reader is nine
  210. features, including sentence sequence, predicate, predicate context,
  211. predicate context flag and tagged sequence.
  212. :return: Training reader creator
  213. :rtype: callable
  214. """
  215. word_dict, verb_dict, label_dict = get_dict()
  216. reader = corpus_reader(
  217. paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
  218. words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
  219. props_name='conll05st-release/test.wsj/props/test.wsj.props.gz',
  220. )
  221. return reader_creator(reader, word_dict, verb_dict, label_dict)
  222. @deprecated(
  223. since="2.0.0",
  224. update_to="paddle.text.datasets.Conll05st",
  225. level=1,
  226. reason="Please use new dataset API which supports paddle.io.DataLoader",
  227. )
  228. def fetch():
  229. paddle.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
  230. paddle.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
  231. paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
  232. paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
  233. paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)