chinese_utils.py 2.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. # Copyright (c) Alibaba, Inc. and its affiliates.
  2. import re
  3. import string
  4. CHINESE_PUNCTUATION = '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·!?。。'
  5. ENGLISH_PUNCTUATION = string.punctuation
  6. def remove_space_between_chinese_chars(decoded_str: str):
  7. old_word_list = decoded_str.split(' ')
  8. new_word_list = []
  9. start = -1
  10. for i, word in enumerate(old_word_list):
  11. if _is_chinese_str(word):
  12. if start == -1:
  13. start = i
  14. else:
  15. if start != -1:
  16. new_word_list.append(''.join(old_word_list[start:i]))
  17. start = -1
  18. new_word_list.append(word)
  19. if start != -1:
  20. new_word_list.append(''.join(old_word_list[start:]))
  21. return ' '.join(new_word_list).strip()
  22. # add space for each chinese char
  23. def rebuild_chinese_str(string: str):
  24. return ' '.join(''.join([
  25. f' {char} '
  26. if _is_chinese_char(char) or char in CHINESE_PUNCTUATION else char
  27. for char in string
  28. ]).split())
  29. def _is_chinese_str(string: str) -> bool:
  30. return all(
  31. _is_chinese_char(cp) or cp in CHINESE_PUNCTUATION
  32. or cp in ENGLISH_PUNCTUATION for cp in string)
  33. def _is_chinese_char(cp: str) -> bool:
  34. """Checks whether CP is the codepoint of a CJK character."""
  35. cp = ord(cp)
  36. if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
  37. or (cp >= 0x20000 and cp <= 0x2A6DF)
  38. or (cp >= 0x2A700 and cp <= 0x2B73F)
  39. or (cp >= 0x2B740 and cp <= 0x2B81F)
  40. or (cp >= 0x2B820 and cp <= 0x2CEAF)
  41. or (cp >= 0xF900 and cp <= 0xFAFF)
  42. or (cp >= 0x2F800 and cp <= 0x2FA1F)):
  43. return True
  44. return False
  45. def normalize_chinese_number(text):
  46. from zhconv import convert
  47. chinese_number = ['零', '一', '二', '三', '四', '五', '六', '七', '八', '九']
  48. new_text = ''
  49. for x in text:
  50. if x in '0123456789':
  51. x = chinese_number[0]
  52. new_text += x
  53. new_text = convert(new_text, 'zh-hans')
  54. return new_text
  55. def pre_chinese(text, max_words):
  56. text = text.lower().replace(CHINESE_PUNCTUATION,
  57. ' ').replace(ENGLISH_PUNCTUATION, ' ')
  58. text = re.sub(
  59. r'\s{2,}',
  60. ' ',
  61. text,
  62. )
  63. text = text.rstrip('\n')
  64. text = text.strip(' ')[:max_words]
  65. return text