test_converters.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. """
  2. Tests column conversion functionality during parsing
  3. for all of the parsers defined in parsers.py
  4. """
  5. from io import StringIO
  6. from dateutil.parser import parse
  7. import numpy as np
  8. import pytest
  9. import pandas as pd
  10. from pandas import (
  11. DataFrame,
  12. Index,
  13. )
  14. import pandas._testing as tm
  15. def test_converters_type_must_be_dict(all_parsers):
  16. parser = all_parsers
  17. data = """index,A,B,C,D
  18. foo,2,3,4,5
  19. """
  20. if parser.engine == "pyarrow":
  21. msg = "The 'converters' option is not supported with the 'pyarrow' engine"
  22. with pytest.raises(ValueError, match=msg):
  23. parser.read_csv(StringIO(data), converters=0)
  24. return
  25. with pytest.raises(TypeError, match="Type converters.+"):
  26. parser.read_csv(StringIO(data), converters=0)
  27. @pytest.mark.parametrize("column", [3, "D"])
  28. @pytest.mark.parametrize(
  29. "converter", [parse, lambda x: int(x.split("/")[2])] # Produce integer.
  30. )
  31. def test_converters(all_parsers, column, converter):
  32. parser = all_parsers
  33. data = """A,B,C,D
  34. a,1,2,01/01/2009
  35. b,3,4,01/02/2009
  36. c,4,5,01/03/2009
  37. """
  38. if parser.engine == "pyarrow":
  39. msg = "The 'converters' option is not supported with the 'pyarrow' engine"
  40. with pytest.raises(ValueError, match=msg):
  41. parser.read_csv(StringIO(data), converters={column: converter})
  42. return
  43. result = parser.read_csv(StringIO(data), converters={column: converter})
  44. expected = parser.read_csv(StringIO(data))
  45. expected["D"] = expected["D"].map(converter)
  46. tm.assert_frame_equal(result, expected)
  47. def test_converters_no_implicit_conv(all_parsers):
  48. # see gh-2184
  49. parser = all_parsers
  50. data = """000102,1.2,A\n001245,2,B"""
  51. converters = {0: lambda x: x.strip()}
  52. if parser.engine == "pyarrow":
  53. msg = "The 'converters' option is not supported with the 'pyarrow' engine"
  54. with pytest.raises(ValueError, match=msg):
  55. parser.read_csv(StringIO(data), header=None, converters=converters)
  56. return
  57. result = parser.read_csv(StringIO(data), header=None, converters=converters)
  58. # Column 0 should not be casted to numeric and should remain as object.
  59. expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])
  60. tm.assert_frame_equal(result, expected)
  61. def test_converters_euro_decimal_format(all_parsers):
  62. # see gh-583
  63. converters = {}
  64. parser = all_parsers
  65. data = """Id;Number1;Number2;Text1;Text2;Number3
  66. 1;1521,1541;187101,9543;ABC;poi;4,7387
  67. 2;121,12;14897,76;DEF;uyt;0,3773
  68. 3;878,158;108013,434;GHI;rez;2,7356"""
  69. converters["Number1"] = converters["Number2"] = converters[
  70. "Number3"
  71. ] = lambda x: float(x.replace(",", "."))
  72. if parser.engine == "pyarrow":
  73. msg = "The 'converters' option is not supported with the 'pyarrow' engine"
  74. with pytest.raises(ValueError, match=msg):
  75. parser.read_csv(StringIO(data), sep=";", converters=converters)
  76. return
  77. result = parser.read_csv(StringIO(data), sep=";", converters=converters)
  78. expected = DataFrame(
  79. [
  80. [1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
  81. [2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
  82. [3, 878.158, 108013.434, "GHI", "rez", 2.7356],
  83. ],
  84. columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
  85. )
  86. tm.assert_frame_equal(result, expected)
  87. def test_converters_corner_with_nans(all_parsers):
  88. parser = all_parsers
  89. data = """id,score,days
  90. 1,2,12
  91. 2,2-5,
  92. 3,,14+
  93. 4,6-12,2"""
  94. # Example converters.
  95. def convert_days(x):
  96. x = x.strip()
  97. if not x:
  98. return np.nan
  99. is_plus = x.endswith("+")
  100. if is_plus:
  101. x = int(x[:-1]) + 1
  102. else:
  103. x = int(x)
  104. return x
  105. def convert_days_sentinel(x):
  106. x = x.strip()
  107. if not x:
  108. return np.nan
  109. is_plus = x.endswith("+")
  110. if is_plus:
  111. x = int(x[:-1]) + 1
  112. else:
  113. x = int(x)
  114. return x
  115. def convert_score(x):
  116. x = x.strip()
  117. if not x:
  118. return np.nan
  119. if x.find("-") > 0:
  120. val_min, val_max = map(int, x.split("-"))
  121. val = 0.5 * (val_min + val_max)
  122. else:
  123. val = float(x)
  124. return val
  125. results = []
  126. for day_converter in [convert_days, convert_days_sentinel]:
  127. if parser.engine == "pyarrow":
  128. msg = "The 'converters' option is not supported with the 'pyarrow' engine"
  129. with pytest.raises(ValueError, match=msg):
  130. parser.read_csv(
  131. StringIO(data),
  132. converters={"score": convert_score, "days": day_converter},
  133. na_values=["", None],
  134. )
  135. continue
  136. result = parser.read_csv(
  137. StringIO(data),
  138. converters={"score": convert_score, "days": day_converter},
  139. na_values=["", None],
  140. )
  141. assert pd.isna(result["days"][1])
  142. results.append(result)
  143. if parser.engine != "pyarrow":
  144. tm.assert_frame_equal(results[0], results[1])
  145. @pytest.mark.parametrize("conv_f", [lambda x: x, str])
  146. def test_converter_index_col_bug(all_parsers, conv_f):
  147. # see gh-1835 , GH#40589
  148. parser = all_parsers
  149. data = "A;B\n1;2\n3;4"
  150. if parser.engine == "pyarrow":
  151. msg = "The 'converters' option is not supported with the 'pyarrow' engine"
  152. with pytest.raises(ValueError, match=msg):
  153. parser.read_csv(
  154. StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
  155. )
  156. return
  157. rs = parser.read_csv(
  158. StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
  159. )
  160. xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
  161. tm.assert_frame_equal(rs, xp)
  162. def test_converter_identity_object(all_parsers):
  163. # GH#40589
  164. parser = all_parsers
  165. data = "A,B\n1,2\n3,4"
  166. if parser.engine == "pyarrow":
  167. msg = "The 'converters' option is not supported with the 'pyarrow' engine"
  168. with pytest.raises(ValueError, match=msg):
  169. parser.read_csv(StringIO(data), converters={"A": lambda x: x})
  170. return
  171. rs = parser.read_csv(StringIO(data), converters={"A": lambda x: x})
  172. xp = DataFrame({"A": ["1", "3"], "B": [2, 4]})
  173. tm.assert_frame_equal(rs, xp)
  174. def test_converter_multi_index(all_parsers):
  175. # GH 42446
  176. parser = all_parsers
  177. data = "A,B,B\nX,Y,Z\n1,2,3"
  178. if parser.engine == "pyarrow":
  179. msg = "The 'converters' option is not supported with the 'pyarrow' engine"
  180. with pytest.raises(ValueError, match=msg):
  181. parser.read_csv(
  182. StringIO(data),
  183. header=list(range(2)),
  184. converters={
  185. ("A", "X"): np.int32,
  186. ("B", "Y"): np.int32,
  187. ("B", "Z"): np.float32,
  188. },
  189. )
  190. return
  191. result = parser.read_csv(
  192. StringIO(data),
  193. header=list(range(2)),
  194. converters={
  195. ("A", "X"): np.int32,
  196. ("B", "Y"): np.int32,
  197. ("B", "Z"): np.float32,
  198. },
  199. )
  200. expected = DataFrame(
  201. {
  202. ("A", "X"): np.int32([1]),
  203. ("B", "Y"): np.int32([2]),
  204. ("B", "Z"): np.float32([3]),
  205. }
  206. )
  207. tm.assert_frame_equal(result, expected)