__init__.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. ######################## BEGIN LICENSE BLOCK ########################
  2. # This library is free software; you can redistribute it and/or
  3. # modify it under the terms of the GNU Lesser General Public
  4. # License as published by the Free Software Foundation; either
  5. # version 2.1 of the License, or (at your option) any later version.
  6. #
  7. # This library is distributed in the hope that it will be useful,
  8. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  10. # Lesser General Public License for more details.
  11. #
  12. # You should have received a copy of the GNU Lesser General Public
  13. # License along with this library; if not, write to the Free Software
  14. # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
  15. # 02110-1301 USA
  16. ######################### END LICENSE BLOCK #########################
  17. from typing import List, Union
  18. from .charsetgroupprober import CharSetGroupProber
  19. from .charsetprober import CharSetProber
  20. from .enums import InputState
  21. from .resultdict import ResultDict
  22. from .universaldetector import UniversalDetector
  23. from .version import VERSION, __version__
  24. __all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
  25. def detect(
  26. byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
  27. ) -> ResultDict:
  28. """
  29. Detect the encoding of the given byte string.
  30. :param byte_str: The byte sequence to examine.
  31. :type byte_str: ``bytes`` or ``bytearray``
  32. :param should_rename_legacy: Should we rename legacy encodings
  33. to their more modern equivalents?
  34. :type should_rename_legacy: ``bool``
  35. """
  36. if not isinstance(byte_str, bytearray):
  37. if not isinstance(byte_str, bytes):
  38. raise TypeError(
  39. f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
  40. )
  41. byte_str = bytearray(byte_str)
  42. detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
  43. detector.feed(byte_str)
  44. return detector.close()
  45. def detect_all(
  46. byte_str: Union[bytes, bytearray],
  47. ignore_threshold: bool = False,
  48. should_rename_legacy: bool = False,
  49. ) -> List[ResultDict]:
  50. """
  51. Detect all the possible encodings of the given byte string.
  52. :param byte_str: The byte sequence to examine.
  53. :type byte_str: ``bytes`` or ``bytearray``
  54. :param ignore_threshold: Include encodings that are below
  55. ``UniversalDetector.MINIMUM_THRESHOLD``
  56. in results.
  57. :type ignore_threshold: ``bool``
  58. :param should_rename_legacy: Should we rename legacy encodings
  59. to their more modern equivalents?
  60. :type should_rename_legacy: ``bool``
  61. """
  62. if not isinstance(byte_str, bytearray):
  63. if not isinstance(byte_str, bytes):
  64. raise TypeError(
  65. f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
  66. )
  67. byte_str = bytearray(byte_str)
  68. detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
  69. detector.feed(byte_str)
  70. detector.close()
  71. if detector.input_state == InputState.HIGH_BYTE:
  72. results: List[ResultDict] = []
  73. probers: List[CharSetProber] = []
  74. for prober in detector.charset_probers:
  75. if isinstance(prober, CharSetGroupProber):
  76. probers.extend(p for p in prober.probers)
  77. else:
  78. probers.append(prober)
  79. for prober in probers:
  80. if ignore_threshold or prober.get_confidence() > detector.MINIMUM_THRESHOLD:
  81. charset_name = prober.charset_name or ""
  82. lower_charset_name = charset_name.lower()
  83. # Use Windows encoding name instead of ISO-8859 if we saw any
  84. # extra Windows-specific bytes
  85. if lower_charset_name.startswith("iso-8859") and detector.has_win_bytes:
  86. charset_name = detector.ISO_WIN_MAP.get(
  87. lower_charset_name, charset_name
  88. )
  89. # Rename legacy encodings with superset encodings if asked
  90. if should_rename_legacy:
  91. charset_name = detector.LEGACY_MAP.get(
  92. charset_name.lower(), charset_name
  93. )
  94. results.append(
  95. {
  96. "encoding": charset_name,
  97. "confidence": prober.get_confidence(),
  98. "language": prober.language,
  99. }
  100. )
  101. if len(results) > 0:
  102. return sorted(results, key=lambda result: -result["confidence"])
  103. return [detector.result]