test_encoding.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. """
  2. Tests encoding functionality during parsing
  3. for all of the parsers defined in parsers.py
  4. """
  5. from io import (
  6. BytesIO,
  7. TextIOWrapper,
  8. )
  9. import os
  10. import tempfile
  11. import uuid
  12. import numpy as np
  13. import pytest
  14. from pandas import (
  15. DataFrame,
  16. read_csv,
  17. )
  18. import pandas._testing as tm
  19. pytestmark = pytest.mark.filterwarnings(
  20. "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
  21. )
  22. skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
  23. def test_bytes_io_input(all_parsers):
  24. encoding = "cp1255"
  25. parser = all_parsers
  26. data = BytesIO("שלום:1234\n562:123".encode(encoding))
  27. result = parser.read_csv(data, sep=":", encoding=encoding)
  28. expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
  29. tm.assert_frame_equal(result, expected)
  30. @skip_pyarrow # CSV parse error: Empty CSV file or block
  31. def test_read_csv_unicode(all_parsers):
  32. parser = all_parsers
  33. data = BytesIO("\u0141aski, Jan;1".encode())
  34. result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
  35. expected = DataFrame([["\u0141aski, Jan", 1]])
  36. tm.assert_frame_equal(result, expected)
  37. @skip_pyarrow
  38. @pytest.mark.parametrize("sep", [",", "\t"])
  39. @pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
  40. def test_utf16_bom_skiprows(all_parsers, sep, encoding):
  41. # see gh-2298
  42. parser = all_parsers
  43. data = """skip this
  44. skip this too
  45. A,B,C
  46. 1,2,3
  47. 4,5,6""".replace(
  48. ",", sep
  49. )
  50. path = f"__{uuid.uuid4()}__.csv"
  51. kwargs = {"sep": sep, "skiprows": 2}
  52. utf8 = "utf-8"
  53. with tm.ensure_clean(path) as path:
  54. bytes_data = data.encode(encoding)
  55. with open(path, "wb") as f:
  56. f.write(bytes_data)
  57. with TextIOWrapper(BytesIO(data.encode(utf8)), encoding=utf8) as bytes_buffer:
  58. result = parser.read_csv(path, encoding=encoding, **kwargs)
  59. expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)
  60. tm.assert_frame_equal(result, expected)
  61. def test_utf16_example(all_parsers, csv_dir_path):
  62. path = os.path.join(csv_dir_path, "utf16_ex.txt")
  63. parser = all_parsers
  64. result = parser.read_csv(path, encoding="utf-16", sep="\t")
  65. assert len(result) == 50
  66. def test_unicode_encoding(all_parsers, csv_dir_path):
  67. path = os.path.join(csv_dir_path, "unicode_series.csv")
  68. parser = all_parsers
  69. result = parser.read_csv(path, header=None, encoding="latin-1")
  70. result = result.set_index(0)
  71. got = result[1][1632]
  72. expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)"
  73. assert got == expected
  74. @pytest.mark.parametrize(
  75. "data,kwargs,expected",
  76. [
  77. # Basic test
  78. ("a\n1", {}, DataFrame({"a": [1]})),
  79. # "Regular" quoting
  80. ('"a"\n1', {"quotechar": '"'}, DataFrame({"a": [1]})),
  81. # Test in a data row instead of header
  82. ("b\n1", {"names": ["a"]}, DataFrame({"a": ["b", "1"]})),
  83. # Test in empty data row with skipping
  84. ("\n1", {"names": ["a"], "skip_blank_lines": True}, DataFrame({"a": [1]})),
  85. # Test in empty data row without skipping
  86. (
  87. "\n1",
  88. {"names": ["a"], "skip_blank_lines": False},
  89. DataFrame({"a": [np.nan, 1]}),
  90. ),
  91. ],
  92. )
  93. def test_utf8_bom(all_parsers, data, kwargs, expected, request):
  94. # see gh-4793
  95. parser = all_parsers
  96. bom = "\ufeff"
  97. utf8 = "utf-8"
  98. def _encode_data_with_bom(_data):
  99. bom_data = (bom + _data).encode(utf8)
  100. return BytesIO(bom_data)
  101. if (
  102. parser.engine == "pyarrow"
  103. and data == "\n1"
  104. and kwargs.get("skip_blank_lines", True)
  105. ):
  106. # CSV parse error: Empty CSV file or block: cannot infer number of columns
  107. pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
  108. result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
  109. tm.assert_frame_equal(result, expected)
  110. def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
  111. # see gh-13549
  112. expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
  113. parser = all_parsers
  114. encoding = encoding_fmt.format(utf_value)
  115. data = "mb_num,multibyte\n4.8,test".encode(encoding)
  116. result = parser.read_csv(BytesIO(data), encoding=encoding)
  117. tm.assert_frame_equal(result, expected)
  118. @pytest.mark.parametrize(
  119. "file_path,encoding",
  120. [
  121. (("io", "data", "csv", "test1.csv"), "utf-8"),
  122. (("io", "parser", "data", "unicode_series.csv"), "latin-1"),
  123. (("io", "parser", "data", "sauron.SHIFT_JIS.csv"), "shiftjis"),
  124. ],
  125. )
  126. def test_binary_mode_file_buffers(all_parsers, file_path, encoding, datapath):
  127. # gh-23779: Python csv engine shouldn't error on files opened in binary.
  128. # gh-31575: Python csv engine shouldn't error on files opened in raw binary.
  129. parser = all_parsers
  130. fpath = datapath(*file_path)
  131. expected = parser.read_csv(fpath, encoding=encoding)
  132. with open(fpath, encoding=encoding) as fa:
  133. result = parser.read_csv(fa)
  134. assert not fa.closed
  135. tm.assert_frame_equal(expected, result)
  136. with open(fpath, mode="rb") as fb:
  137. result = parser.read_csv(fb, encoding=encoding)
  138. assert not fb.closed
  139. tm.assert_frame_equal(expected, result)
  140. with open(fpath, mode="rb", buffering=0) as fb:
  141. result = parser.read_csv(fb, encoding=encoding)
  142. assert not fb.closed
  143. tm.assert_frame_equal(expected, result)
  144. @pytest.mark.parametrize("pass_encoding", [True, False])
  145. def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding):
  146. # see gh-24130
  147. parser = all_parsers
  148. encoding = encoding_fmt.format(utf_value)
  149. if parser.engine == "pyarrow" and pass_encoding is True and utf_value in [16, 32]:
  150. # FIXME: this is bad!
  151. pytest.skip("These cases freeze")
  152. expected = DataFrame({"foo": ["bar"]})
  153. with tm.ensure_clean(mode="w+", encoding=encoding, return_filelike=True) as f:
  154. f.write("foo\nbar")
  155. f.seek(0)
  156. result = parser.read_csv(f, encoding=encoding if pass_encoding else None)
  157. tm.assert_frame_equal(result, expected)
  158. def test_encoding_named_temp_file(all_parsers):
  159. # see gh-31819
  160. parser = all_parsers
  161. encoding = "shift-jis"
  162. title = "てすと"
  163. data = "こむ"
  164. expected = DataFrame({title: [data]})
  165. with tempfile.NamedTemporaryFile() as f:
  166. f.write(f"{title}\n{data}".encode(encoding))
  167. f.seek(0)
  168. result = parser.read_csv(f, encoding=encoding)
  169. tm.assert_frame_equal(result, expected)
  170. assert not f.closed
  171. @pytest.mark.parametrize(
  172. "encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le", "utf-32"]
  173. )
  174. def test_parse_encoded_special_characters(encoding):
  175. # GH16218 Verify parsing of data with encoded special characters
  176. # Data contains a Unicode 'FULLWIDTH COLON' (U+FF1A) at position (0,"a")
  177. data = "a\tb\n:foo\t0\nbar\t1\nbaz\t2" # noqa: RUF001
  178. encoded_data = BytesIO(data.encode(encoding))
  179. result = read_csv(encoded_data, delimiter="\t", encoding=encoding)
  180. expected = DataFrame(
  181. data=[[":foo", 0], ["bar", 1], ["baz", 2]], # noqa: RUF001
  182. columns=["a", "b"],
  183. )
  184. tm.assert_frame_equal(result, expected)
  185. @pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
  186. def test_encoding_memory_map(all_parsers, encoding):
  187. # GH40986
  188. parser = all_parsers
  189. expected = DataFrame(
  190. {
  191. "name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"],
  192. "mask": ["red", "purple", "orange", "blue"],
  193. "weapon": ["sai", "bo staff", "nunchunk", "katana"],
  194. }
  195. )
  196. with tm.ensure_clean() as file:
  197. expected.to_csv(file, index=False, encoding=encoding)
  198. if parser.engine == "pyarrow":
  199. msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
  200. with pytest.raises(ValueError, match=msg):
  201. parser.read_csv(file, encoding=encoding, memory_map=True)
  202. return
  203. df = parser.read_csv(file, encoding=encoding, memory_map=True)
  204. tm.assert_frame_equal(df, expected)
  205. def test_chunk_splits_multibyte_char(all_parsers):
  206. """
  207. Chunk splits a multibyte character with memory_map=True
  208. GH 43540
  209. """
  210. parser = all_parsers
  211. # DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx
  212. df = DataFrame(data=["a" * 127] * 2048)
  213. # Put two-bytes utf-8 encoded character "ą" at the end of chunk
  214. # utf-8 encoding of "ą" is b'\xc4\x85'
  215. df.iloc[2047] = "a" * 127 + "ą"
  216. with tm.ensure_clean("bug-gh43540.csv") as fname:
  217. df.to_csv(fname, index=False, header=False, encoding="utf-8")
  218. if parser.engine == "pyarrow":
  219. msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
  220. with pytest.raises(ValueError, match=msg):
  221. parser.read_csv(fname, header=None, memory_map=True)
  222. return
  223. dfr = parser.read_csv(fname, header=None, memory_map=True)
  224. tm.assert_frame_equal(dfr, df)
  225. def test_readcsv_memmap_utf8(all_parsers):
  226. """
  227. GH 43787
  228. Test correct handling of UTF-8 chars when memory_map=True and encoding is UTF-8
  229. """
  230. lines = []
  231. line_length = 128
  232. start_char = " "
  233. end_char = "\U00010080"
  234. # This for loop creates a list of 128-char strings
  235. # consisting of consecutive Unicode chars
  236. for lnum in range(ord(start_char), ord(end_char), line_length):
  237. line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n"
  238. try:
  239. line.encode("utf-8")
  240. except UnicodeEncodeError:
  241. continue
  242. lines.append(line)
  243. parser = all_parsers
  244. df = DataFrame(lines)
  245. with tm.ensure_clean("utf8test.csv") as fname:
  246. df.to_csv(fname, index=False, header=False, encoding="utf-8")
  247. if parser.engine == "pyarrow":
  248. msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
  249. with pytest.raises(ValueError, match=msg):
  250. parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
  251. return
  252. dfr = parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8")
  253. tm.assert_frame_equal(df, dfr)
  254. @pytest.mark.usefixtures("pyarrow_xfail")
  255. @pytest.mark.parametrize("mode", ["w+b", "w+t"])
  256. def test_not_readable(all_parsers, mode):
  257. # GH43439
  258. parser = all_parsers
  259. content = b"abcd"
  260. if "t" in mode:
  261. content = "abcd"
  262. with tempfile.SpooledTemporaryFile(mode=mode, encoding="utf-8") as handle:
  263. handle.write(content)
  264. handle.seek(0)
  265. df = parser.read_csv(handle)
  266. expected = DataFrame([], columns=["abcd"])
  267. tm.assert_frame_equal(df, expected)