| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107 |
- import os
- import sys
- import numpy as np
- import pytest
- current_dir = os.path.dirname(os.path.abspath(__file__))
- sys.path.append(os.path.abspath(os.path.join(current_dir, "..")))
- from ppocr.postprocess.rec_postprocess import BaseRecLabelDecode
class TestBaseRecLabelDecode:
    """Tests for BaseRecLabelDecode.get_word_info() method."""

    @pytest.fixture
    def decoder(self):
        """Create a BaseRecLabelDecode instance for testing."""
        return BaseRecLabelDecode()

    @staticmethod
    def _split(decoder, text):
        """Run ``decoder.get_word_info`` on ``text`` with every position selected.

        Args:
            decoder: Object exposing ``get_word_info(text, selection)``.
            text: String handed to ``get_word_info``.

        Returns:
            A ``(words, states)`` pair. ``words`` contains each entry of the
            returned word list joined into a single ``str``; ``states`` is the
            third returned value, passed through unchanged.
        """
        # All-True boolean mask with one entry per character of ``text``.
        selection = np.ones(len(text), dtype=bool)
        # The second returned value is not inspected by any test in this class.
        word_list, _, state_list = decoder.get_word_info(text, selection)
        return ["".join(chars) for chars in word_list], state_list

    def test_get_word_info_with_german_accented_chars(self, decoder):
        """Test that German words with accented characters are not split."""
        words, states = self._split(decoder, "Grüßen")
        assert len(words) == 1, "German word should not be split"
        assert words[0] == "Grüßen"
        assert states[0] == "en&num"

    def test_get_word_info_with_longer_german_word(self, decoder):
        """Test longer German words with umlauts remain intact."""
        words, states = self._split(decoder, "ungewöhnlichen")
        assert len(words) == 1, "German word should not be split"
        assert words[0] == "ungewöhnlichen"
        assert states[0] == "en&num"

    def test_get_word_info_with_french_accented_chars(self, decoder):
        """Test French words with accented characters."""
        words, _ = self._split(decoder, "café")
        assert len(words) == 1, "French word should not be split"
        assert words[0] == "café"

    def test_get_word_info_underscore_as_splitter(self, decoder):
        """Test that underscores are treated as word splitters."""
        words, _ = self._split(decoder, "hello_world")
        assert len(words) == 2, "Underscore should split words"
        assert words[0] == "hello"
        assert words[1] == "world"

    def test_get_word_info_with_mixed_content(self, decoder):
        """Test mixed content with spaces and accented characters."""
        words, _ = self._split(decoder, "Grüßen Sie")
        assert len(words) == 2, "Should have two words separated by space"
        assert words[0] == "Grüßen"
        assert words[1] == "Sie"

    def test_get_word_info_with_french_apostrophe(self, decoder):
        """Test French words with apostrophes like n'êtes."""
        words, _ = self._split(decoder, "n'êtes")
        # Apostrophe should keep words connected in French context
        assert len(words) == 1, "French apostrophe should connect words"
        assert words[0] == "n'êtes"

    def test_get_word_info_with_ascii_only(self, decoder):
        """Test backward compatibility with ASCII-only text."""
        words, _ = self._split(decoder, "hello world")
        assert len(words) == 2
        assert words[0] == "hello"
        assert words[1] == "world"

    def test_get_word_info_with_numbers(self, decoder):
        """Test that numbers are properly handled."""
        words, _ = self._split(decoder, "VGG-16")
        assert len(words) == 1, "Hyphenated word-number should stay together"
        assert words[0] == "VGG-16"

    def test_get_word_info_with_floating_point(self, decoder):
        """Test floating point numbers stay together."""
        words, _ = self._split(decoder, "price 3.14")
        assert len(words) == 2
        assert words[0] == "price"
        assert words[1] == "3.14"

    def test_get_word_info_with_chinese(self, decoder):
        """Test Chinese characters are properly grouped."""
        words, states = self._split(decoder, "你好啊")
        assert len(words) == 1
        assert words[0] == "你好啊"
        assert states[0] == "cn"
|