"""Tests for word segmentation in ``BaseRecLabelDecode.get_word_info()``."""

import os
import sys

import numpy as np
import pytest

# Append the parent of this file's directory to sys.path so that the
# ``ppocr`` package can be imported (presumably the repository root —
# confirm against the project layout).
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(current_dir, "..")))

from ppocr.postprocess.rec_postprocess import BaseRecLabelDecode  # noqa: E402


def _split_into_words(decoder, text):
    """Call ``decoder.get_word_info`` on *text* with an all-True selection.

    Args:
        decoder: Object exposing ``get_word_info(text, selection)``.
        text: The string to segment.

    Returns:
        A ``(words, state_list)`` pair. ``words`` holds each entry of the
        returned word list joined into a plain string; ``state_list`` is
        passed through untouched. The middle return value of
        ``get_word_info`` is not needed by these tests and is dropped.
    """
    # One boolean flag per character, all True.
    # NOTE(review): presumably marks every character as kept — confirm
    # against get_word_info.
    selection = np.ones(len(text), dtype=bool)
    word_list, _, state_list = decoder.get_word_info(text, selection)
    return ["".join(chars) for chars in word_list], state_list


class TestBaseRecLabelDecode:
    """Tests for BaseRecLabelDecode.get_word_info() method."""

    @pytest.fixture
    def decoder(self):
        """Create a BaseRecLabelDecode instance for testing."""
        return BaseRecLabelDecode()

    def test_get_word_info_with_german_accented_chars(self, decoder):
        """Test that German words with accented characters are not split."""
        words, states = _split_into_words(decoder, "Grüßen")
        assert len(words) == 1, "German word should not be split"
        assert words[0] == "Grüßen"
        assert states[0] == "en&num"

    def test_get_word_info_with_longer_german_word(self, decoder):
        """Test longer German words with umlauts remain intact."""
        words, states = _split_into_words(decoder, "ungewöhnlichen")
        assert len(words) == 1, "German word should not be split"
        assert words[0] == "ungewöhnlichen"
        assert states[0] == "en&num"

    def test_get_word_info_with_french_accented_chars(self, decoder):
        """Test French words with accented characters."""
        words, _ = _split_into_words(decoder, "café")
        assert len(words) == 1, "French word should not be split"
        assert words[0] == "café"

    def test_get_word_info_underscore_as_splitter(self, decoder):
        """Test that underscores are treated as word splitters."""
        words, _ = _split_into_words(decoder, "hello_world")
        assert len(words) == 2, "Underscore should split words"
        assert words[0] == "hello"
        assert words[1] == "world"

    def test_get_word_info_with_mixed_content(self, decoder):
        """Test mixed content with spaces and accented characters."""
        words, _ = _split_into_words(decoder, "Grüßen Sie")
        assert len(words) == 2, "Should have two words separated by space"
        assert words[0] == "Grüßen"
        assert words[1] == "Sie"

    def test_get_word_info_with_french_apostrophe(self, decoder):
        """Test French words with apostrophes like n'êtes."""
        words, _ = _split_into_words(decoder, "n'êtes")
        # Apostrophe should keep words connected in French context
        assert len(words) == 1, "French apostrophe should connect words"
        assert words[0] == "n'êtes"

    def test_get_word_info_with_ascii_only(self, decoder):
        """Test backward compatibility with ASCII-only text."""
        words, _ = _split_into_words(decoder, "hello world")
        assert len(words) == 2
        assert words[0] == "hello"
        assert words[1] == "world"

    def test_get_word_info_with_numbers(self, decoder):
        """Test that numbers are properly handled."""
        words, _ = _split_into_words(decoder, "VGG-16")
        assert len(words) == 1, "Hyphenated word-number should stay together"
        assert words[0] == "VGG-16"

    def test_get_word_info_with_floating_point(self, decoder):
        """Test floating point numbers stay together."""
        words, _ = _split_into_words(decoder, "price 3.14")
        assert len(words) == 2
        assert words[0] == "price"
        assert words[1] == "3.14"

    def test_get_word_info_with_chinese(self, decoder):
        """Test Chinese characters are properly grouped."""
        words, states = _split_into_words(decoder, "你好啊")
        assert len(words) == 1
        assert words[0] == "你好啊"
        assert states[0] == "cn"