# text_clean.py
# Copyright (c) Alibaba, Inc. and its affiliates.
import codecs
import re
import sys
  5. class TextClean(object):
  6. def __init__(self):
  7. spu = [
  8. 0xA0, 0x1680, 0x202f, 0x205F, 0x3000, 0xFEFF, 8203, 8206, 8207,
  9. 8298, 8300, 65279
  10. ]
  11. spu.extend(range(0xE000, 0xF8FF + 1))
  12. spu.extend(range(0x2000, 0x200A + 1))
  13. spu.extend(range(0x7F, 0xA0 + 1))
  14. self.spaces = set([chr(i) for i in spu])
  15. self.space_pat = re.compile(r'\s+', re.UNICODE)
  16. self.replace_char = {
  17. u'`': u"'",
  18. u'’': u"'",
  19. u'´': u"'",
  20. u'‘': u"'",
  21. u'º': u'°',
  22. u'–': u'-',
  23. u'—': u'-'
  24. }
  25. def sbc2dbc(self, ch):
  26. n = ord(ch)
  27. if 0xFF00 < n < 0xFF5F:
  28. n -= 0xFEE0
  29. elif n == 0x3000:
  30. n = 0x20
  31. else:
  32. return ch
  33. return chr(n)
  34. def clean(self, s):
  35. try:
  36. line = list(s.strip())
  37. size = len(line)
  38. i = 0
  39. while i < size:
  40. if line[i] < u' ' or line[i] in self.spaces:
  41. line[i] = u' '
  42. else:
  43. line[i] = self.replace_char.get(line[i], line[i])
  44. line[i] = self.sbc2dbc(line[i])
  45. i += 1
  46. line = ''.join(line)
  47. line = self.space_pat.sub(' ', line).strip()
  48. return line
  49. except Exception:
  50. return ''
  51. if __name__ == '__main__':
  52. tc = TextClean()
  53. for line in sys.stdin:
  54. res = tc.clean(line)
  55. print(res)