clean_dataset.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. # Copyright (c) Alibaba, Inc. and its affiliates.
  2. import os
  3. import re
  4. from . import ontology
  def clean_text_split_dot(text):
      """Put spaces around full stops so they become separate tokens.

      Two substitutions are applied in order:

      * a dot directly between a run of lower-case letters (or ``T``) and a
        lower-case letter is spaced out: ``'abc.xyz'`` -> ``'abc . xyz'``;
      * one or two dots that follow a word and precede a space collapse to a
        single spaced dot: ``'abc. '`` / ``'abc.. '`` -> ``'abc . '``.

      Args:
          text: the string to rewrite.

      Returns:
          The rewritten string.
      """
      substitutions = (
          (r'([a-zT]+)\.([a-z])', r'\1 . \2'),  # 'abc.xyz' -> 'abc . xyz'
          (r'(\w+)\.\.? ', r'\1 . '),  # 'abc. ' -> 'abc . '
      )
      for pattern, replacement in substitutions:
          text = re.sub(pattern, replacement, text)
      return text
  def clean_text(data_dir, text):
      """Normalise a raw utterance or slot value.

      The steps run in this order:

      1. strip surrounding whitespace and lower-case the text;
      2. unify curly apostrophes, turn ``;`` into ``,``, ``"`` into a space
         and ``/`` into `` and ``, and split ``don't`` into ``do n't``;
      3. rewrite clock times with ``clean_time``;
      4. apply the ``baddata`` regex table (known typos, broken postcodes,
         tokenised contractions, leading/trailing apostrophes);
      5. space out full stops with ``clean_text_split_dot``;
      6. apply the whole-token replacements listed in
         ``<data_dir>/mapping.pair`` (one ``from<TAB>to`` pair per line).

      Args:
          data_dir: directory that contains the ``mapping.pair`` file.
          text: the string to clean.

      Returns:
          The cleaned string.

      Raises:
          OSError: if ``mapping.pair`` cannot be opened.
          ValueError: if a non-empty line of ``mapping.pair`` does not hold
              exactly one tab character.
      """
      text = text.strip()
      text = text.lower()
      text = text.replace(u'’', "'")
      text = text.replace(u'‘', "'")
      text = text.replace(';', ',')
      text = text.replace('"', ' ')
      text = text.replace('/', ' and ')
      text = text.replace("don't", "do n't")
      text = clean_time(text)

      # Every key is used as a regular expression and the entries are applied
      # in insertion order, so earlier rewrites can hide later patterns.
      baddata = {
          r'c\.b (\d), (\d) ([a-z])\.([a-z])': r'cb\1\2\3\4',
          'c.b. 1 7 d.y': 'cb17dy',
          'c.b.1 7 d.y': 'cb17dy',
          'c.b 25, 9 a.q': 'cb259aq',
          'isc.b 25, 9 a.q': 'is cb259aq',
          'c.b2, 1 u.f': 'cb21uf',
          'c.b 1,2 q.a': 'cb12qa',
          '0-122-336-5664': '01223365664',
          'postcodecb21rs': 'postcode cb21rs',
          r'i\.d': 'id',
          ' i d ': 'id',
          # NOTE(review): the text was lower-cased above, so this
          # capitalised pattern cannot match; kept as-is pending confirmation
          # of the intended behaviour.
          'Telephone:01223358966': 'Telephone: 01223358966',
          'depature': 'departure',
          'depearting': 'departing',
          '-type': ' type',
          r'b[\s]?&[\s]?b': 'bed and breakfast',
          'b and b': 'bed and breakfast',
          r'guesthouse[s]?': 'guest house',
          r'swimmingpool[s]?': 'swimming pool',
          "wo n't": 'will not',
          " 'd ": ' would ',
          " 'm ": ' am ',
          # These two patterns used to end in a stray quote (" 're' " and
          # " 'll' ") and therefore never matched the tokenised contraction.
          " 're ": ' are ',
          " 'll ": ' will ',
          " 've ": ' have ',
          r'^\'': '',
          r'\'$': '',
      }
      for tmpl, good in baddata.items():
          text = re.sub(tmpl, good, text)

      # 'abc.xyz' -> 'abc . xyz' and 'abc. ' -> 'abc . '
      text = clean_text_split_dot(text)

      # Pad once so that whole-token matches also work at both ends of the
      # text. Each replacement starts and ends with a space, so the padding
      # survives every substitution and can be removed once at the end.
      padded = ' ' + text + ' '
      mapping_path = os.path.join(data_dir, 'mapping.pair')
      with open(mapping_path, 'r', encoding='utf-8') as fin:
          for line in fin:
              line = line.replace('\n', '')
              if not line:
                  # A blank (e.g. trailing) line carries no mapping.
                  continue
              fromx, tox = line.split('\t')
              padded = padded.replace(' ' + fromx + ' ', ' ' + tox + ' ')
      return padded[1:-1]
  def clean_time(utter):
      """Rewrite 12-hour clock mentions in ``utter`` towards 24-hour ``HH:MM``.

      Examples: ``'9 am'`` -> ``'09:00'``, ``'3 pm'`` -> ``'15:00'``,
      ``'9:30pm'`` -> ``'21:30'``, ``'7:05'`` -> ``'07:05'``,
      ``'12:30pm'`` -> ``'12:30'``.

      Limitations visible in the patterns below: a two-digit hour followed by
      ``am`` only loses its suffix (``'10am'`` -> ``'10'``), and a two-digit
      hour without minutes followed by ``pm`` (``'10pm'``) is left unchanged.

      Args:
          utter: the text to rewrite.

      Returns:
          The text with the recognised time expressions rewritten.
      """
      # '9 am' -> '9am'. The trailing \b stops words that merely begin with
      # 'am'/'pm' (e.g. '2 amazing') from being glued to the number and then
      # mistaken for a time by the steps below.
      utter = re.sub(r'(\d+) ([ap]\.?m)\b', r'\1\2', utter)
      # Single-digit hour with minutes: zero-pad and drop a trailing 'am'.
      utter = re.sub(r'((?<!\d)\d:\d+)(am)?', r'0\1', utter)
      # Single-digit hour without minutes: '9am' -> '09:00'.
      utter = re.sub(r'((?<!\d)\d)am', r'0\1:00', utter)
      # Single-digit hour without minutes: '3pm' -> '15:00'.
      utter = re.sub(
          r'((?<!\d)\d)pm',
          lambda m: str(int(m.group(1)) + 12) + ':00',
          utter,
      )
      # 'HH:MMpm' -> afternoon hour. The modulo keeps 12:xx pm at 12:xx
      # instead of producing the invalid hour 24.
      utter = re.sub(
          r'(\d+)(:\d+)pm',
          lambda m: str(int(m.group(1)) % 12 + 12) + m.group(2),
          utter,
      )
      # Any remaining '<digits>am' / '<digits>a.m' just loses its suffix.
      utter = re.sub(r'(\d+)a\.?m', r'\1', utter)
      return utter
  _DONT_CARE = "do n't care"


  def _slot_rule(fixes, rename=None, allowed=None, dots_to_colons=False):
      """Bundle the clean-up settings for one (domain, raw slot name) pair.

      Args:
          fixes: exact-match map from a raw value to its replacement.
          rename: canonical slot name to return, or None to keep the slot.
          allowed: if given, a value that has no entry in ``fixes`` and is not
              in this collection is replaced by ``''``.
          dots_to_colons: whether ``.`` is turned into ``:`` afterwards.
      """
      return {
          'fixes': fixes,
          'rename': rename,
          'allowed': allowed,
          'dots_to_colons': dots_to_colons,
      }


  _HOTEL_PRICERANGE_RULE = _slot_rule(
      {
          'moderately': 'moderate',
          'any': _DONT_CARE,
          'inexpensive': 'cheap',
          '2': '',
          '4': '',
      },
      rename='pricerange',
  )

  _RESTAURANT_PRICERANGE_RULE = _slot_rule(
      {
          'moderately': 'moderate',
          'mode': 'moderate',
          'mo': 'moderate',
          'not': '',
          'inexpensive': 'cheap',
          'ch': 'cheap',
      },
      rename='pricerange',
  )

  _TAXI_ARRIVEBY_RULE = _slot_rule(
      {
          '1530': '15:30',
          '15 minutes': '',
      },
      rename='arriveby',
  )

  _TAXI_LEAVEAT_RULE = _slot_rule(
      {
          '1:00': '01:00',
          '21:4': '21:04',
          '4:15': '04:15',
          '5:45': '05:45',
          '0700': '07:00',
          '4:45': '04:45',
          '8:30': '08:30',
          '9:30': '09:30',
      },
      rename='leaveat',
      dots_to_colons=True,
  )

  _TRAIN_ARRIVEBY_RULE = _slot_rule(
      {
          '1': '01:00',
          'does not care': _DONT_CARE,
          'doesnt care': _DONT_CARE,
          "doesn't care": _DONT_CARE,
          '8:30': '08:30',
          'not 15:45': '',
      },
      rename='arriveby',
      dots_to_colons=True,
  )

  _TRAIN_LEAVEAT_RULE = _slot_rule(
      {
          '2:30': '02:30',
          '7:54': '07:54',
          'after 5:45 pm': '17:45',
          'early evening': '',
          'friday': '',
          'sunday': '',
          'tuesday': '',
          'afternoon': '',
          '12': '12:00',
          '1030': '10:30',
          '1700': '17:00',
          'does not care': _DONT_CARE,
          'doesnt care': _DONT_CARE,
          'do nt care': _DONT_CARE,
          "doesn't care": _DONT_CARE,
      },
      rename='leaveat',
      dots_to_colons=True,
  )

  # domain -> raw slot name -> rule. Keys are the slot spellings exactly as
  # callers pass them; a spelling that is not listed gets no value clean-up.
  _SLOT_RULES = {
      'attraction': {
          'name': _slot_rule({
              't': '',
              'trinity': 'trinity college',
          }),
          'area': _slot_rule({
              'town centre': 'centre',
              'cent': 'centre',
              'center': 'centre',
              'ce': 'centre',
              'ely': '',
              'in town': '',
              'museum': '',
              'norwich': '',
              'same area as hotel': '',
              'we': 'west',
          }),
          'type': _slot_rule({
              'm': 'museum',
              'mus': 'museum',
              'musuem': 'museum',
              'art': 'architecture',
              'architectural': 'architecture',
              'churches': 'church',
              'coll': 'college',
              'concert': 'concert hall',
              'concerthall': 'concert hall',
              'night club': 'nightclub',
              'mutiple sports': 'multiple sports',
              'mutliple sports': 'multiple sports',
              'sports': 'multiple sports',
              'galleria': 'multiple sports',
              'ol': '',
              'science': '',
              'gastropub': '',
              'la raza': '',
              'swimmingpool': 'swimming pool',
              'pool': 'swimming pool',
              'fun': 'entertainment',
          }),
      },
      'hotel': {
          'area': _slot_rule({
              'cen': 'centre',
              'centre of town': 'centre',
              'near city center': 'centre',
              'center': 'centre',
              'east area': 'east',
              'east side': 'east',
              'in the north': 'north',
              'north part of town': 'north',
              'we': 'west',
          }),
          'day': _slot_rule({
              'monda': 'monday',
              't': 'tuesday',
          }),
          'name': _slot_rule({
              'uni': 'university arms hotel',
              'university arms': 'university arms hotel',
              'acron': 'acorn guest house',
              'ashley': 'ashley hotel',
              'arbury lodge guesthouse': 'arbury lodge guest house',
              'la': 'la margherit',
              'no': '',
          }),
          'internet': _slot_rule({
              'does not': 'no',
              'y': 'yes',
              'free': 'yes',
              'free internet': 'yes',
              '4': '',
          }),
          'parking': _slot_rule({
              'n': 'no',
              'free parking': 'yes',
              'y': 'yes',
          }),
          'pricerange': _HOTEL_PRICERANGE_RULE,
          'price range': _HOTEL_PRICERANGE_RULE,
          'stars': _slot_rule({
              'two': '2',
              'three': '3',
              '4-star': '4',
              '4 stars': '4',
              '4 star': '4',
              'four star': '4',
              'four stars': '4',
          }),
          'type': _slot_rule(
              {
                  '0 star rarting': '',
                  'guesthouse': 'guest house',
              },
              allowed=('hotel', 'guest house', _DONT_CARE),
          ),
      },
      'restaurant': {
          'area': _slot_rule(
              {
                  'center': 'centre',
                  'scentre': 'centre',
                  'center of town': 'centre',
                  'city center': 'centre',
                  'cb30aq': 'centre',
                  'town center': 'centre',
                  'centre of cambridge': 'centre',
                  'city centre': 'centre',
                  'west part of town': 'west',
                  'n': 'north',
                  'the south': 'south',
              },
              allowed=(
                  'centre', 'south', _DONT_CARE, 'west', 'east', 'north',
              ),
          ),
          'day': _slot_rule({
              'monda': 'monday',
              't': 'tuesday',
          }),
          'pricerange': _RESTAURANT_PRICERANGE_RULE,
          'price range': _RESTAURANT_PRICERANGE_RULE,
          'food': _slot_rule({
              'barbecue': 'barbeque',
          }),
          'time': _slot_rule({
              '9:00': '09:00',
              '9:45': '09:45',
              '1330': '13:30',
              '1430': '14:30',
              '9:15': '09:15',
              '9:30': '09:30',
              '1830': '18:30',
              '9': '09:00',
              '2:00': '14:00',
              '1:00': '13:00',
              '3:00': '15:00',
          }),
      },
      'taxi': {
          'arriveBy': _TAXI_ARRIVEBY_RULE,
          'arrive by': _TAXI_ARRIVEBY_RULE,
          'leaveAt': _TAXI_LEAVEAT_RULE,
          'leave at': _TAXI_LEAVEAT_RULE,
      },
      'train': {
          'arriveBy': _TRAIN_ARRIVEBY_RULE,
          'arrive by': _TRAIN_ARRIVEBY_RULE,
          'day': _slot_rule({
              'doesnt care': _DONT_CARE,
              "doesn't care": _DONT_CARE,
          }),
          'leaveAt': _TRAIN_LEAVEAT_RULE,
          'leave at': _TRAIN_LEAVEAT_RULE,
      },
  }


  def clean_slot_values(data_dir, domain, slot, value):
      """Canonicalise one annotated slot value and its slot name.

      The value is first passed through ``clean_text``. An empty result or
      ``'not mentioned'`` becomes ``''`` and skips the per-domain clean-up.
      Otherwise the rule registered for ``(domain, slot)`` in ``_SLOT_RULES``
      (if any) is applied: the slot may be renamed to its canonical spelling,
      known misspellings/abbreviations of the value are replaced, values
      outside a rule's allow-list are blanked, and for some time slots ``.``
      is turned into ``:``. Finally the remaining "don't care" spellings are
      unified to ``"do n't care"`` and the slot name is mapped through
      ``ontology.normlize_slot_names`` when it has a truthy entry there.

      Args:
          data_dir: forwarded to ``clean_text`` (which reads ``mapping.pair``
              from it).
          domain: dialogue domain, e.g. ``'hotel'`` or ``'train'``.
          slot: slot name as annotated, e.g. ``'price range'``.
          value: raw slot value.

      Returns:
          A ``(slot, value)`` tuple with the normalised slot name and value.
      """
      value = clean_text(data_dir, value)

      if not value or value == 'not mentioned':
          # Use 'not mentioned' instead of '' here for a DST setting.
          value = ''
      else:
          rule = _SLOT_RULES.get(domain, {}).get(slot)
          if rule is not None:
              if rule['rename'] is not None:
                  slot = rule['rename']
              if value in rule['fixes']:
                  value = rule['fixes'][value]
              elif rule['allowed'] is not None and value not in rule['allowed']:
                  # Anything outside the closed vocabulary is discarded.
                  value = ''
              if rule['dots_to_colons']:
                  value = value.replace('.', ':')

      if value in ('dont care', "don't care", 'do nt care', "doesn't care"):
          value = _DONT_CARE

      normalized_slot = ontology.normlize_slot_names.get(slot)
      if normalized_slot:
          slot = normalized_slot
      return slot, value