| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970 |
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import codecs
- import re
- import sys
class TextClean(object):
    """Normalise a line of text character by character.

    Attributes:
        spaces: set of single characters that ``clean`` rewrites to a
            plain space.
        space_pat: compiled pattern matching one or more whitespace
            characters; used to collapse runs of blanks.
        replace_char: mapping from typographic quote/dash/degree-like
            characters to the plain substitutes ``clean`` emits.
    """

    def __init__(self):
        # Stand-alone code points that are treated as blanks.
        code_points = [
            0xA0, 0x1680, 0x202f, 0x205F, 0x3000, 0xFEFF, 8203, 8206, 8207,
            8298, 8300, 65279
        ]
        # Whole code-point ranges (both bounds inclusive) treated as blanks.
        for first, last in ((0xE000, 0xF8FF), (0x2000, 0x200A), (0x7F, 0xA0)):
            code_points.extend(range(first, last + 1))
        self.spaces = {chr(cp) for cp in code_points}

        self.space_pat = re.compile(r'\s+', re.UNICODE)

        self.replace_char = {
            u'`': u"'",
            u'’': u"'",
            u'´': u"'",
            u'‘': u"'",
            u'º': u'°',
            u'–': u'-',
            u'—': u'-'
        }

    def sbc2dbc(self, ch):
        """Shift a single character out of the 0xFF01-0xFF5E range.

        Args:
            ch: a one-character string.

        Returns:
            ``chr(ord(ch) - 0xFEE0)`` when the code point lies strictly
            between 0xFF00 and 0xFF5F (e.g. U+FF21 becomes ``'A'``), a
            plain space for 0x3000, and ``ch`` itself otherwise.
        """
        code = ord(ch)
        if code == 0x3000:
            return chr(0x20)
        if 0xFF00 < code < 0xFF5F:
            return chr(code - 0xFEE0)
        return ch

    def clean(self, s):
        """Return ``s`` normalised and with whitespace collapsed.

        The input is stripped, then each character is handled on its own:
        characters ordered before ``' '`` or contained in ``self.spaces``
        become a space; every other character is looked up in
        ``self.replace_char`` and then passed through ``sbc2dbc``.  Finally
        every whitespace run is reduced to one space and the result is
        stripped again.

        Args:
            s: the text to normalise.

        Returns:
            The normalised string, or ``''`` if any exception is raised
            while processing (for instance when ``s`` is ``None``).
        """
        try:
            normalised = [
                u' ' if (ch < u' ' or ch in self.spaces)
                else self.sbc2dbc(self.replace_char.get(ch, ch))
                for ch in s.strip()
            ]
            collapsed = self.space_pat.sub(' ', ''.join(normalised))
            return collapsed.strip()
        except Exception:
            # Best-effort contract: any failure yields an empty result.
            return ''
if __name__ == '__main__':
    # Filter mode: normalise each line read from stdin and print it.
    cleaner = TextClean()
    for raw_line in sys.stdin:
        print(cleaner.clean(raw_line))
|