# yichael / AIStoryBoard  (repository header left over from page extraction)
import os
import sys

import numpy as np
import pytest

# Append the parent of this file's directory to sys.path; presumably this is
# what lets the ``ppocr`` package be imported when the tests are run directly
# from a source checkout -- confirm against the repository layout.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(
    os.path.abspath(os.path.join(current_dir, os.pardir))
)

from ppocr.postprocess.rec_postprocess import BaseRecLabelDecode


class TestBaseRecLabelDecode:
    """Exercise word grouping in BaseRecLabelDecode.get_word_info().

    Each case passes a short text together with an all-True boolean mask of
    the same length and inspects the returned word groups (and, for some
    cases, the state label of the first group).
    """

    @pytest.fixture
    def decoder(self):
        """Return a default-constructed BaseRecLabelDecode for each test."""
        return BaseRecLabelDecode()

    @staticmethod
    def _word_info(decoder, text):
        """Call get_word_info with every character of ``text`` selected.

        Returns the first and third elements of the decoder's result (the
        word groups and their state labels); the second element is not
        used by these tests.
        """
        mask = np.ones(len(text), dtype=bool)
        groups, _, states = decoder.get_word_info(text, mask)
        return groups, states

    def test_get_word_info_with_german_accented_chars(self, decoder):
        """A German word containing umlaut and eszett stays in one group."""
        groups, states = self._word_info(decoder, "Grüßen")
        assert len(groups) == 1, "German word should not be split"
        assert "".join(groups[0]) == "Grüßen"
        assert states[0] == "en&num"

    def test_get_word_info_with_longer_german_word(self, decoder):
        """A longer German word with an umlaut in the middle stays whole."""
        groups, states = self._word_info(decoder, "ungewöhnlichen")
        assert len(groups) == 1, "German word should not be split"
        assert "".join(groups[0]) == "ungewöhnlichen"
        assert states[0] == "en&num"

    def test_get_word_info_with_french_accented_chars(self, decoder):
        """A French word ending in an accented letter stays in one group."""
        groups, _ = self._word_info(decoder, "café")
        assert len(groups) == 1, "French word should not be split"
        assert "".join(groups[0]) == "café"

    def test_get_word_info_underscore_as_splitter(self, decoder):
        """An underscore separates the text into two groups."""
        groups, _ = self._word_info(decoder, "hello_world")
        assert len(groups) == 2, "Underscore should split words"
        assert ["".join(chars) for chars in groups] == ["hello", "world"]

    def test_get_word_info_with_mixed_content(self, decoder):
        """A space splits the text even when one word has accented letters."""
        groups, _ = self._word_info(decoder, "Grüßen Sie")
        assert len(groups) == 2, "Should have two words separated by space"
        assert ["".join(chars) for chars in groups] == ["Grüßen", "Sie"]

    def test_get_word_info_with_french_apostrophe(self, decoder):
        """An apostrophe inside a French contraction does not split it."""
        groups, _ = self._word_info(decoder, "n'êtes")
        # The apostrophe is expected to join both halves into a single group.
        assert len(groups) == 1, "French apostrophe should connect words"
        assert "".join(groups[0]) == "n'êtes"

    def test_get_word_info_with_ascii_only(self, decoder):
        """Plain ASCII text is still split on the space (compatibility check)."""
        groups, _ = self._word_info(decoder, "hello world")
        assert len(groups) == 2
        assert ["".join(chars) for chars in groups] == ["hello", "world"]

    def test_get_word_info_with_numbers(self, decoder):
        """Letters and digits joined by a hyphen form a single group."""
        groups, _ = self._word_info(decoder, "VGG-16")
        assert len(groups) == 1, "Hyphenated word-number should stay together"
        assert "".join(groups[0]) == "VGG-16"

    def test_get_word_info_with_floating_point(self, decoder):
        """A decimal number is kept whole and separated from the word before it."""
        groups, _ = self._word_info(decoder, "price 3.14")
        assert len(groups) == 2
        assert ["".join(chars) for chars in groups] == ["price", "3.14"]

    def test_get_word_info_with_chinese(self, decoder):
        """Consecutive Chinese characters form one group labelled ``cn``."""
        groups, states = self._word_info(decoder, "你好啊")
        assert len(groups) == 1
        assert "".join(groups[0]) == "你好啊"
        assert states[0] == "cn"