check_dict.py 894 B

123456789101112131415161718192021222324252627282930
  1. # -*- coding: utf-8 -*-
  2. import sys
  3. from pathlib import Path
  4. dict_path = Path(__file__).parent / 'OnnxOCR-main' / 'OnnxOCR-main' / 'onnxocr' / 'models' / 'ppocrv5' / 'ppocrv5_dict.txt'
  5. with open(dict_path, 'r', encoding='utf-8') as f:
  6. content = f.read()
  7. # 检查日文字符
  8. hiragana = [c for c in content if '\u3040' <= c <= '\u309F'] # 平假名
  9. katakana = [c for c in content if '\u30A0' <= c <= '\u30FF'] # 片假名
  10. kanji = [c for c in content if '\u4E00' <= c <= '\u9FAF'] # 汉字
  11. print(f'字典总字符数: {len(content)}')
  12. print(f'平假名数量: {len(hiragana)}')
  13. print(f'片假名数量: {len(katakana)}')
  14. print(f'汉字数量: {len(kanji)}')
  15. if hiragana:
  16. print(f'平假名示例: {hiragana[:20]}')
  17. else:
  18. print('平假名: 无')
  19. if katakana:
  20. print(f'片假名示例: {katakana[:20]}')
  21. else:
  22. print('片假名: 无')
  23. print(f'汉字示例: {kanji[:20]}')