test_french_accents.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Test script to verify French accented character handling in OCR text recognition.
  5. This script tests that French words with accented characters (é, è, à, ç, etc.)
  6. and contractions (n'êtes, l'été) are properly grouped as single words and not
  7. split at each accented character.
  8. """
  9. import sys
  10. import os
  11. import numpy as np
  12. # Add the project root to the path
  13. sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
  14. from ppocr.postprocess.rec_postprocess import BaseRecLabelDecode
  15. def test_french_word_grouping():
  16. """Test that French words with accents are properly grouped."""
  17. # Initialize the decoder
  18. decoder = BaseRecLabelDecode(character_dict_path=None, use_space_char=True)
  19. # Test cases with French accented words
  20. test_cases = [
  21. {
  22. "name": "Simple accented word: été (summer)",
  23. "text": "été",
  24. "expected_words": [["é", "t", "é"]],
  25. "expected_states": ["en&num"],
  26. },
  27. {
  28. "name": "Word with ç: français (French)",
  29. "text": "français",
  30. "expected_words": [["f", "r", "a", "n", "ç", "a", "i", "s"]],
  31. "expected_states": ["en&num"],
  32. },
  33. {
  34. "name": "Contraction: n'êtes (you are)",
  35. "text": "n'êtes",
  36. "expected_words": [["n", "'", "ê", "t", "e", "s"]],
  37. "expected_states": ["en&num"],
  38. },
  39. {
  40. "name": "Multiple accents: élève (student)",
  41. "text": "élève",
  42. "expected_words": [["é", "l", "è", "v", "e"]],
  43. "expected_states": ["en&num"],
  44. },
  45. {
  46. "name": "Word with à: à demain (see you tomorrow)",
  47. "text": "à demain",
  48. "expected_words": [["à"], ["d", "e", "m", "a", "i", "n"]],
  49. "expected_states": ["en&num", "en&num"],
  50. },
  51. {
  52. "name": "Complex: C'était très français (It was very French)",
  53. "text": "C'était très français",
  54. "expected_words": [
  55. ["C", "'", "é", "t", "a", "i", "t"],
  56. ["t", "r", "è", "s"],
  57. ["f", "r", "a", "n", "ç", "a", "i", "s"],
  58. ],
  59. "expected_states": ["en&num", "en&num", "en&num"],
  60. },
  61. ]
  62. print("=" * 70)
  63. print("Testing French Accented Character Word Grouping")
  64. print("=" * 70)
  65. all_passed = True
  66. for test in test_cases:
  67. text = test["name"]
  68. test_text = test["text"]
  69. # Create a mock selection array (all characters are valid)
  70. selection = np.ones(len(test_text), dtype=bool)
  71. # Call get_word_info
  72. word_list, word_col_list, state_list = decoder.get_word_info(
  73. test_text, selection
  74. )
  75. # Check results
  76. passed = True
  77. if len(word_list) != len(test["expected_words"]):
  78. passed = False
  79. print(f"\nFAILED: {text}")
  80. print(
  81. f" Expected {len(test['expected_words'])} words, got {len(word_list)}"
  82. )
  83. elif state_list != test["expected_states"]:
  84. passed = False
  85. print(f"\nFAILED: {text}")
  86. print(f" Expected states: {test['expected_states']}")
  87. print(f" Got states: {state_list}")
  88. else:
  89. # Check if words match
  90. for i, (expected, actual) in enumerate(
  91. zip(test["expected_words"], word_list)
  92. ):
  93. if expected != actual:
  94. passed = False
  95. print(f"\nFAILED: {text}")
  96. print(f" Word {i}: Expected {expected}, got {actual}")
  97. break
  98. if passed:
  99. print(f"\nPASSED: {text}")
  100. print(f" Text: '{test_text}'")
  101. print(f" Words: {[''.join(w) for w in word_list]}")
  102. print(f" States: {state_list}")
  103. else:
  104. all_passed = False
  105. print(f" Text: '{test_text}'")
  106. print(f" Expected words: {[''.join(w) for w in test['expected_words']]}")
  107. print(f" Got words: {[''.join(w) for w in word_list]}")
  108. print("\n" + "=" * 70)
  109. if all_passed:
  110. print("All tests PASSED! French accented words are properly grouped.")
  111. else:
  112. print("Some tests FAILED. Please review the output above.")
  113. print("=" * 70)
  114. assert all_passed, "Some French accent tests failed"
  115. if __name__ == "__main__":
  116. test_french_word_grouping()