| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Test script to verify French accented character handling in OCR text recognition.
- This script tests that French words with accented characters (é, è, à, ç, etc.)
- and contractions (n'êtes, l'été) are properly grouped as single words and not
- split at each accented character.
- """
- import sys
- import os
- import numpy as np
- # Add the project root to the path
- sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
- from ppocr.postprocess.rec_postprocess import BaseRecLabelDecode
def test_french_word_grouping():
    """Check that accented French words come back from the decoder as whole words.

    Each sample string is passed to ``BaseRecLabelDecode.get_word_info``
    together with an all-True selection mask. The returned word list and
    state list are compared with the expectations in three stages: word
    count first, then the state list, then each word's characters. A
    report is printed for every sample, and the function ends with an
    assertion that no sample failed.

    Raises:
        AssertionError: if at least one sample did not match.
    """
    decoder = BaseRecLabelDecode(character_dict_path=None, use_space_char=True)

    # Sample inputs with the character groups and states expected for each.
    test_cases = [
        {
            "name": "Simple accented word: été (summer)",
            "text": "été",
            "expected_words": [["é", "t", "é"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Word with ç: français (French)",
            "text": "français",
            "expected_words": [["f", "r", "a", "n", "ç", "a", "i", "s"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Contraction: n'êtes (you are)",
            "text": "n'êtes",
            "expected_words": [["n", "'", "ê", "t", "e", "s"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Multiple accents: élève (student)",
            "text": "élève",
            "expected_words": [["é", "l", "è", "v", "e"]],
            "expected_states": ["en&num"],
        },
        {
            "name": "Word with à: à demain (see you tomorrow)",
            "text": "à demain",
            "expected_words": [["à"], ["d", "e", "m", "a", "i", "n"]],
            "expected_states": ["en&num", "en&num"],
        },
        {
            "name": "Complex: C'était très français (It was very French)",
            "text": "C'était très français",
            "expected_words": [
                ["C", "'", "é", "t", "a", "i", "t"],
                ["t", "r", "è", "s"],
                ["f", "r", "a", "n", "ç", "a", "i", "s"],
            ],
            "expected_states": ["en&num", "en&num", "en&num"],
        },
    ]

    banner = "=" * 70
    print(banner)
    print("Testing French Accented Character Word Grouping")
    print(banner)

    failure_count = 0
    for case in test_cases:
        label = case["name"]
        sample = case["text"]
        want_words = case["expected_words"]
        want_states = case["expected_states"]

        # Mark every character position of the sample as selected.
        mask = np.ones(len(sample), dtype=bool)

        # The middle return value (column info) is not checked here.
        got_words, _, got_states = decoder.get_word_info(sample, mask)

        case_ok = False
        if len(got_words) != len(want_words):
            print(f"\nFAILED: {label}")
            print(f" Expected {len(want_words)} words, got {len(got_words)}")
        elif got_states != want_states:
            print(f"\nFAILED: {label}")
            print(f" Expected states: {want_states}")
            print(f" Got states: {got_states}")
        else:
            # Only the first differing word is reported.
            first_diff = next(
                (
                    (index, want, got)
                    for index, (want, got) in enumerate(zip(want_words, got_words))
                    if want != got
                ),
                None,
            )
            if first_diff is None:
                case_ok = True
            else:
                index, want, got = first_diff
                print(f"\nFAILED: {label}")
                print(f" Word {index}: Expected {want}, got {got}")

        if case_ok:
            print(f"\nPASSED: {label}")
            print(f" Text: '{sample}'")
            print(f" Words: {[''.join(w) for w in got_words]}")
            print(f" States: {got_states}")
            continue

        failure_count += 1
        print(f" Text: '{sample}'")
        print(f" Expected words: {[''.join(w) for w in want_words]}")
        print(f" Got words: {[''.join(w) for w in got_words]}")

    print("\n" + banner)
    if failure_count == 0:
        print("All tests PASSED! French accented words are properly grouped.")
    else:
        print("Some tests FAILED. Please review the output above.")
    print(banner)

    assert failure_count == 0, "Some French accent tests failed"
# Run the check when this file is executed directly as a script.
if __name__ == "__main__":
    test_french_word_grouping()
|