test_to_numeric.py 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978
  1. import decimal
  2. import numpy as np
  3. from numpy import iinfo
  4. import pytest
  5. import pandas.util._test_decorators as td
  6. import pandas as pd
  7. from pandas import (
  8. ArrowDtype,
  9. DataFrame,
  10. Index,
  11. Series,
  12. option_context,
  13. to_numeric,
  14. )
  15. import pandas._testing as tm
  16. @pytest.fixture(params=[None, "ignore", "raise", "coerce"])
  17. def errors(request):
  18. return request.param
  19. @pytest.fixture(params=[True, False])
  20. def signed(request):
  21. return request.param
  22. @pytest.fixture(params=[lambda x: x, str], ids=["identity", "str"])
  23. def transform(request):
  24. return request.param
  25. @pytest.fixture(params=[47393996303418497800, 100000000000000000000])
  26. def large_val(request):
  27. return request.param
  28. @pytest.fixture(params=[True, False])
  29. def multiple_elts(request):
  30. return request.param
  31. @pytest.fixture(
  32. params=[
  33. (lambda x: Index(x, name="idx"), tm.assert_index_equal),
  34. (lambda x: Series(x, name="ser"), tm.assert_series_equal),
  35. (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal),
  36. ]
  37. )
  38. def transform_assert_equal(request):
  39. return request.param
  40. @pytest.mark.parametrize(
  41. "input_kwargs,result_kwargs",
  42. [
  43. ({}, {"dtype": np.int64}),
  44. ({"errors": "coerce", "downcast": "integer"}, {"dtype": np.int8}),
  45. ],
  46. )
  47. def test_empty(input_kwargs, result_kwargs):
  48. # see gh-16302
  49. ser = Series([], dtype=object)
  50. result = to_numeric(ser, **input_kwargs)
  51. expected = Series([], **result_kwargs)
  52. tm.assert_series_equal(result, expected)
  53. @pytest.mark.parametrize(
  54. "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
  55. )
  56. @pytest.mark.parametrize("last_val", ["7", 7])
  57. def test_series(last_val, infer_string):
  58. with option_context("future.infer_string", infer_string):
  59. ser = Series(["1", "-3.14", last_val])
  60. result = to_numeric(ser)
  61. expected = Series([1, -3.14, 7])
  62. tm.assert_series_equal(result, expected)
  63. @pytest.mark.parametrize(
  64. "data",
  65. [
  66. [1, 3, 4, 5],
  67. [1.0, 3.0, 4.0, 5.0],
  68. # Bool is regarded as numeric.
  69. [True, False, True, True],
  70. ],
  71. )
  72. def test_series_numeric(data):
  73. ser = Series(data, index=list("ABCD"), name="EFG")
  74. result = to_numeric(ser)
  75. tm.assert_series_equal(result, ser)
  76. @pytest.mark.parametrize(
  77. "data,msg",
  78. [
  79. ([1, -3.14, "apple"], 'Unable to parse string "apple" at position 2'),
  80. (
  81. ["orange", 1, -3.14, "apple"],
  82. 'Unable to parse string "orange" at position 0',
  83. ),
  84. ],
  85. )
  86. def test_error(data, msg):
  87. ser = Series(data)
  88. with pytest.raises(ValueError, match=msg):
  89. to_numeric(ser, errors="raise")
  90. @pytest.mark.parametrize(
  91. "errors,exp_data", [("ignore", [1, -3.14, "apple"]), ("coerce", [1, -3.14, np.nan])]
  92. )
  93. @pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning")
  94. def test_ignore_error(errors, exp_data):
  95. ser = Series([1, -3.14, "apple"])
  96. result = to_numeric(ser, errors=errors)
  97. expected = Series(exp_data)
  98. tm.assert_series_equal(result, expected)
  99. @pytest.mark.parametrize(
  100. "errors,exp",
  101. [
  102. ("raise", 'Unable to parse string "apple" at position 2'),
  103. ("ignore", [True, False, "apple"]),
  104. # Coerces to float.
  105. ("coerce", [1.0, 0.0, np.nan]),
  106. ],
  107. )
  108. @pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning")
  109. def test_bool_handling(errors, exp):
  110. ser = Series([True, False, "apple"])
  111. if isinstance(exp, str):
  112. with pytest.raises(ValueError, match=exp):
  113. to_numeric(ser, errors=errors)
  114. else:
  115. result = to_numeric(ser, errors=errors)
  116. expected = Series(exp)
  117. tm.assert_series_equal(result, expected)
  118. def test_list():
  119. ser = ["1", "-3.14", "7"]
  120. res = to_numeric(ser)
  121. expected = np.array([1, -3.14, 7])
  122. tm.assert_numpy_array_equal(res, expected)
  123. @pytest.mark.parametrize(
  124. "data,arr_kwargs",
  125. [
  126. ([1, 3, 4, 5], {"dtype": np.int64}),
  127. ([1.0, 3.0, 4.0, 5.0], {}),
  128. # Boolean is regarded as numeric.
  129. ([True, False, True, True], {}),
  130. ],
  131. )
  132. def test_list_numeric(data, arr_kwargs):
  133. result = to_numeric(data)
  134. expected = np.array(data, **arr_kwargs)
  135. tm.assert_numpy_array_equal(result, expected)
  136. @pytest.mark.parametrize("kwargs", [{"dtype": "O"}, {}])
  137. def test_numeric(kwargs):
  138. data = [1, -3.14, 7]
  139. ser = Series(data, **kwargs)
  140. result = to_numeric(ser)
  141. expected = Series(data)
  142. tm.assert_series_equal(result, expected)
  143. @pytest.mark.parametrize(
  144. "columns",
  145. [
  146. # One column.
  147. "a",
  148. # Multiple columns.
  149. ["a", "b"],
  150. ],
  151. )
  152. def test_numeric_df_columns(columns):
  153. # see gh-14827
  154. df = DataFrame(
  155. {
  156. "a": [1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"],
  157. "b": [1.0, 2.0, 3.0, 4.0],
  158. }
  159. )
  160. expected = DataFrame({"a": [1.2, 3.14, np.inf, 0.1], "b": [1.0, 2.0, 3.0, 4.0]})
  161. df_copy = df.copy()
  162. df_copy[columns] = df_copy[columns].apply(to_numeric)
  163. tm.assert_frame_equal(df_copy, expected)
  164. @pytest.mark.parametrize(
  165. "data,exp_data",
  166. [
  167. (
  168. [[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1],
  169. [[3.14, 1.0], 1.6, 0.1],
  170. ),
  171. ([np.array([decimal.Decimal(3.14), 1.0]), 0.1], [[3.14, 1.0], 0.1]),
  172. ],
  173. )
  174. def test_numeric_embedded_arr_likes(data, exp_data):
  175. # Test to_numeric with embedded lists and arrays
  176. df = DataFrame({"a": data})
  177. df["a"] = df["a"].apply(to_numeric)
  178. expected = DataFrame({"a": exp_data})
  179. tm.assert_frame_equal(df, expected)
  180. def test_all_nan():
  181. ser = Series(["a", "b", "c"])
  182. result = to_numeric(ser, errors="coerce")
  183. expected = Series([np.nan, np.nan, np.nan])
  184. tm.assert_series_equal(result, expected)
  185. @pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning")
  186. def test_type_check(errors):
  187. # see gh-11776
  188. df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
  189. kwargs = {"errors": errors} if errors is not None else {}
  190. with pytest.raises(TypeError, match="1-d array"):
  191. to_numeric(df, **kwargs)
  192. @pytest.mark.parametrize("val", [1, 1.1, 20001])
  193. def test_scalar(val, signed, transform):
  194. val = -val if signed else val
  195. assert to_numeric(transform(val)) == float(val)
  196. @pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning")
  197. def test_really_large_scalar(large_val, signed, transform, errors):
  198. # see gh-24910
  199. kwargs = {"errors": errors} if errors is not None else {}
  200. val = -large_val if signed else large_val
  201. val = transform(val)
  202. val_is_string = isinstance(val, str)
  203. if val_is_string and errors in (None, "raise"):
  204. msg = "Integer out of range. at position 0"
  205. with pytest.raises(ValueError, match=msg):
  206. to_numeric(val, **kwargs)
  207. else:
  208. expected = float(val) if (errors == "coerce" and val_is_string) else val
  209. tm.assert_almost_equal(to_numeric(val, **kwargs), expected)
  210. @pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning")
  211. def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors):
  212. # see gh-24910
  213. kwargs = {"errors": errors} if errors is not None else {}
  214. val = -large_val if signed else large_val
  215. val = transform(val)
  216. extra_elt = "string"
  217. arr = [val] + multiple_elts * [extra_elt]
  218. val_is_string = isinstance(val, str)
  219. coercing = errors == "coerce"
  220. if errors in (None, "raise") and (val_is_string or multiple_elts):
  221. if val_is_string:
  222. msg = "Integer out of range. at position 0"
  223. else:
  224. msg = 'Unable to parse string "string" at position 1'
  225. with pytest.raises(ValueError, match=msg):
  226. to_numeric(arr, **kwargs)
  227. else:
  228. result = to_numeric(arr, **kwargs)
  229. exp_val = float(val) if (coercing and val_is_string) else val
  230. expected = [exp_val]
  231. if multiple_elts:
  232. if coercing:
  233. expected.append(np.nan)
  234. exp_dtype = float
  235. else:
  236. expected.append(extra_elt)
  237. exp_dtype = object
  238. else:
  239. exp_dtype = float if isinstance(exp_val, (int, float)) else object
  240. tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
  241. @pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning")
  242. def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors):
  243. # see gh-24910
  244. #
  245. # Even if we discover that we have to hold float, does not mean
  246. # we should be lenient on subsequent elements that fail to be integer.
  247. kwargs = {"errors": errors} if errors is not None else {}
  248. arr = [str(-large_val if signed else large_val)]
  249. if multiple_elts:
  250. arr.insert(0, large_val)
  251. if errors in (None, "raise"):
  252. index = int(multiple_elts)
  253. msg = f"Integer out of range. at position {index}"
  254. with pytest.raises(ValueError, match=msg):
  255. to_numeric(arr, **kwargs)
  256. else:
  257. result = to_numeric(arr, **kwargs)
  258. if errors == "coerce":
  259. expected = [float(i) for i in arr]
  260. exp_dtype = float
  261. else:
  262. expected = arr
  263. exp_dtype = object
  264. tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
  265. @pytest.mark.parametrize(
  266. "errors,checker",
  267. [
  268. ("raise", 'Unable to parse string "fail" at position 0'),
  269. ("ignore", lambda x: x == "fail"),
  270. ("coerce", lambda x: np.isnan(x)),
  271. ],
  272. )
  273. @pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning")
  274. def test_scalar_fail(errors, checker):
  275. scalar = "fail"
  276. if isinstance(checker, str):
  277. with pytest.raises(ValueError, match=checker):
  278. to_numeric(scalar, errors=errors)
  279. else:
  280. assert checker(to_numeric(scalar, errors=errors))
  281. @pytest.mark.parametrize("data", [[1, 2, 3], [1.0, np.nan, 3, np.nan]])
  282. def test_numeric_dtypes(data, transform_assert_equal):
  283. transform, assert_equal = transform_assert_equal
  284. data = transform(data)
  285. result = to_numeric(data)
  286. assert_equal(result, data)
  287. @pytest.mark.parametrize(
  288. "data,exp",
  289. [
  290. (["1", "2", "3"], np.array([1, 2, 3], dtype="int64")),
  291. (["1.5", "2.7", "3.4"], np.array([1.5, 2.7, 3.4])),
  292. ],
  293. )
  294. def test_str(data, exp, transform_assert_equal):
  295. transform, assert_equal = transform_assert_equal
  296. result = to_numeric(transform(data))
  297. expected = transform(exp)
  298. assert_equal(result, expected)
  299. def test_datetime_like(tz_naive_fixture, transform_assert_equal):
  300. transform, assert_equal = transform_assert_equal
  301. idx = pd.date_range("20130101", periods=3, tz=tz_naive_fixture)
  302. result = to_numeric(transform(idx))
  303. expected = transform(idx.asi8)
  304. assert_equal(result, expected)
  305. def test_timedelta(transform_assert_equal):
  306. transform, assert_equal = transform_assert_equal
  307. idx = pd.timedelta_range("1 days", periods=3, freq="D")
  308. result = to_numeric(transform(idx))
  309. expected = transform(idx.asi8)
  310. assert_equal(result, expected)
  311. def test_period(request, transform_assert_equal):
  312. transform, assert_equal = transform_assert_equal
  313. idx = pd.period_range("2011-01", periods=3, freq="M", name="")
  314. inp = transform(idx)
  315. if not isinstance(inp, Index):
  316. request.applymarker(
  317. pytest.mark.xfail(reason="Missing PeriodDtype support in to_numeric")
  318. )
  319. result = to_numeric(inp)
  320. expected = transform(idx.asi8)
  321. assert_equal(result, expected)
  322. @pytest.mark.parametrize(
  323. "errors,expected",
  324. [
  325. ("raise", "Invalid object type at position 0"),
  326. ("ignore", Series([[10.0, 2], 1.0, "apple"])),
  327. ("coerce", Series([np.nan, 1.0, np.nan])),
  328. ],
  329. )
  330. @pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning")
  331. def test_non_hashable(errors, expected):
  332. # see gh-13324
  333. ser = Series([[10.0, 2], 1.0, "apple"])
  334. if isinstance(expected, str):
  335. with pytest.raises(TypeError, match=expected):
  336. to_numeric(ser, errors=errors)
  337. else:
  338. result = to_numeric(ser, errors=errors)
  339. tm.assert_series_equal(result, expected)
  340. def test_downcast_invalid_cast():
  341. # see gh-13352
  342. data = ["1", 2, 3]
  343. invalid_downcast = "unsigned-integer"
  344. msg = "invalid downcasting method provided"
  345. with pytest.raises(ValueError, match=msg):
  346. to_numeric(data, downcast=invalid_downcast)
  347. def test_errors_invalid_value():
  348. # see gh-26466
  349. data = ["1", 2, 3]
  350. invalid_error_value = "invalid"
  351. msg = "invalid error value specified"
  352. with pytest.raises(ValueError, match=msg):
  353. to_numeric(data, errors=invalid_error_value)
  354. @pytest.mark.parametrize(
  355. "data",
  356. [
  357. ["1", 2, 3],
  358. [1, 2, 3],
  359. np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"),
  360. ],
  361. )
  362. @pytest.mark.parametrize(
  363. "kwargs,exp_dtype",
  364. [
  365. # Basic function tests.
  366. ({}, np.int64),
  367. ({"downcast": None}, np.int64),
  368. # Support below np.float32 is rare and far between.
  369. ({"downcast": "float"}, np.dtype(np.float32).char),
  370. # Basic dtype support.
  371. ({"downcast": "unsigned"}, np.dtype(np.typecodes["UnsignedInteger"][0])),
  372. ],
  373. )
  374. def test_downcast_basic(data, kwargs, exp_dtype):
  375. # see gh-13352
  376. result = to_numeric(data, **kwargs)
  377. expected = np.array([1, 2, 3], dtype=exp_dtype)
  378. tm.assert_numpy_array_equal(result, expected)
  379. @pytest.mark.parametrize("signed_downcast", ["integer", "signed"])
  380. @pytest.mark.parametrize(
  381. "data",
  382. [
  383. ["1", 2, 3],
  384. [1, 2, 3],
  385. np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"),
  386. ],
  387. )
  388. def test_signed_downcast(data, signed_downcast):
  389. # see gh-13352
  390. smallest_int_dtype = np.dtype(np.typecodes["Integer"][0])
  391. expected = np.array([1, 2, 3], dtype=smallest_int_dtype)
  392. res = to_numeric(data, downcast=signed_downcast)
  393. tm.assert_numpy_array_equal(res, expected)
  394. def test_ignore_downcast_invalid_data():
  395. # If we can't successfully cast the given
  396. # data to a numeric dtype, do not bother
  397. # with the downcast parameter.
  398. data = ["foo", 2, 3]
  399. expected = np.array(data, dtype=object)
  400. msg = "errors='ignore' is deprecated"
  401. with tm.assert_produces_warning(FutureWarning, match=msg):
  402. res = to_numeric(data, errors="ignore", downcast="unsigned")
  403. tm.assert_numpy_array_equal(res, expected)
  404. def test_ignore_downcast_neg_to_unsigned():
  405. # Cannot cast to an unsigned integer
  406. # because we have a negative number.
  407. data = ["-1", 2, 3]
  408. expected = np.array([-1, 2, 3], dtype=np.int64)
  409. res = to_numeric(data, downcast="unsigned")
  410. tm.assert_numpy_array_equal(res, expected)
  411. # Warning in 32 bit platforms
  412. @pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning")
  413. @pytest.mark.parametrize("downcast", ["integer", "signed", "unsigned"])
  414. @pytest.mark.parametrize(
  415. "data,expected",
  416. [
  417. (["1.1", 2, 3], np.array([1.1, 2, 3], dtype=np.float64)),
  418. (
  419. [10000.0, 20000, 3000, 40000.36, 50000, 50000.00],
  420. np.array(
  421. [10000.0, 20000, 3000, 40000.36, 50000, 50000.00], dtype=np.float64
  422. ),
  423. ),
  424. ],
  425. )
  426. def test_ignore_downcast_cannot_convert_float(data, expected, downcast):
  427. # Cannot cast to an integer (signed or unsigned)
  428. # because we have a float number.
  429. res = to_numeric(data, downcast=downcast)
  430. tm.assert_numpy_array_equal(res, expected)
  431. @pytest.mark.parametrize(
  432. "downcast,expected_dtype",
  433. [("integer", np.int16), ("signed", np.int16), ("unsigned", np.uint16)],
  434. )
  435. def test_downcast_not8bit(downcast, expected_dtype):
  436. # the smallest integer dtype need not be np.(u)int8
  437. data = ["256", 257, 258]
  438. expected = np.array([256, 257, 258], dtype=expected_dtype)
  439. res = to_numeric(data, downcast=downcast)
  440. tm.assert_numpy_array_equal(res, expected)
  441. @pytest.mark.parametrize(
  442. "dtype,downcast,min_max",
  443. [
  444. ("int8", "integer", [iinfo(np.int8).min, iinfo(np.int8).max]),
  445. ("int16", "integer", [iinfo(np.int16).min, iinfo(np.int16).max]),
  446. ("int32", "integer", [iinfo(np.int32).min, iinfo(np.int32).max]),
  447. ("int64", "integer", [iinfo(np.int64).min, iinfo(np.int64).max]),
  448. ("uint8", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max]),
  449. ("uint16", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max]),
  450. ("uint32", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max]),
  451. ("uint64", "unsigned", [iinfo(np.uint64).min, iinfo(np.uint64).max]),
  452. ("int16", "integer", [iinfo(np.int8).min, iinfo(np.int8).max + 1]),
  453. ("int32", "integer", [iinfo(np.int16).min, iinfo(np.int16).max + 1]),
  454. ("int64", "integer", [iinfo(np.int32).min, iinfo(np.int32).max + 1]),
  455. ("int16", "integer", [iinfo(np.int8).min - 1, iinfo(np.int16).max]),
  456. ("int32", "integer", [iinfo(np.int16).min - 1, iinfo(np.int32).max]),
  457. ("int64", "integer", [iinfo(np.int32).min - 1, iinfo(np.int64).max]),
  458. ("uint16", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max + 1]),
  459. ("uint32", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max + 1]),
  460. ("uint64", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max + 1]),
  461. ],
  462. )
  463. def test_downcast_limits(dtype, downcast, min_max):
  464. # see gh-14404: test the limits of each downcast.
  465. series = to_numeric(Series(min_max), downcast=downcast)
  466. assert series.dtype == dtype
  467. def test_downcast_float64_to_float32():
  468. # GH-43693: Check float64 preservation when >= 16,777,217
  469. series = Series([16777217.0, np.finfo(np.float64).max, np.nan], dtype=np.float64)
  470. result = to_numeric(series, downcast="float")
  471. assert series.dtype == result.dtype
  472. @pytest.mark.parametrize(
  473. "ser,expected",
  474. [
  475. (
  476. Series([0, 9223372036854775808]),
  477. Series([0, 9223372036854775808], dtype=np.uint64),
  478. )
  479. ],
  480. )
  481. def test_downcast_uint64(ser, expected):
  482. # see gh-14422:
  483. # BUG: to_numeric doesn't work uint64 numbers
  484. result = to_numeric(ser, downcast="unsigned")
  485. tm.assert_series_equal(result, expected)
  486. @pytest.mark.parametrize(
  487. "data,exp_data",
  488. [
  489. (
  490. [200, 300, "", "NaN", 30000000000000000000],
  491. [200, 300, np.nan, np.nan, 30000000000000000000],
  492. ),
  493. (
  494. ["12345678901234567890", "1234567890", "ITEM"],
  495. [12345678901234567890, 1234567890, np.nan],
  496. ),
  497. ],
  498. )
  499. def test_coerce_uint64_conflict(data, exp_data):
  500. # see gh-17007 and gh-17125
  501. #
  502. # Still returns float despite the uint64-nan conflict,
  503. # which would normally force the casting to object.
  504. result = to_numeric(Series(data), errors="coerce")
  505. expected = Series(exp_data, dtype=float)
  506. tm.assert_series_equal(result, expected)
  507. @pytest.mark.parametrize(
  508. "errors,exp",
  509. [
  510. ("ignore", Series(["12345678901234567890", "1234567890", "ITEM"])),
  511. ("raise", "Unable to parse string"),
  512. ],
  513. )
  514. @pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning")
  515. def test_non_coerce_uint64_conflict(errors, exp):
  516. # see gh-17007 and gh-17125
  517. #
  518. # For completeness.
  519. ser = Series(["12345678901234567890", "1234567890", "ITEM"])
  520. if isinstance(exp, str):
  521. with pytest.raises(ValueError, match=exp):
  522. to_numeric(ser, errors=errors)
  523. else:
  524. result = to_numeric(ser, errors=errors)
  525. tm.assert_series_equal(result, ser)
  526. @pytest.mark.parametrize("dc1", ["integer", "float", "unsigned"])
  527. @pytest.mark.parametrize("dc2", ["integer", "float", "unsigned"])
  528. def test_downcast_empty(dc1, dc2):
  529. # GH32493
  530. tm.assert_numpy_array_equal(
  531. to_numeric([], downcast=dc1),
  532. to_numeric([], downcast=dc2),
  533. check_dtype=False,
  534. )
  535. def test_failure_to_convert_uint64_string_to_NaN():
  536. # GH 32394
  537. result = to_numeric("uint64", errors="coerce")
  538. assert np.isnan(result)
  539. ser = Series([32, 64, np.nan])
  540. result = to_numeric(Series(["32", "64", "uint64"]), errors="coerce")
  541. tm.assert_series_equal(result, ser)
  542. @pytest.mark.parametrize(
  543. "strrep",
  544. [
  545. "243.164",
  546. "245.968",
  547. "249.585",
  548. "259.745",
  549. "265.742",
  550. "272.567",
  551. "279.196",
  552. "280.366",
  553. "275.034",
  554. "271.351",
  555. "272.889",
  556. "270.627",
  557. "280.828",
  558. "290.383",
  559. "308.153",
  560. "319.945",
  561. "336.0",
  562. "344.09",
  563. "351.385",
  564. "356.178",
  565. "359.82",
  566. "361.03",
  567. "367.701",
  568. "380.812",
  569. "387.98",
  570. "391.749",
  571. "391.171",
  572. "385.97",
  573. "385.345",
  574. "386.121",
  575. "390.996",
  576. "399.734",
  577. "413.073",
  578. "421.532",
  579. "430.221",
  580. "437.092",
  581. "439.746",
  582. "446.01",
  583. "451.191",
  584. "460.463",
  585. "469.779",
  586. "472.025",
  587. "479.49",
  588. "474.864",
  589. "467.54",
  590. "471.978",
  591. ],
  592. )
  593. def test_precision_float_conversion(strrep):
  594. # GH 31364
  595. result = to_numeric(strrep)
  596. assert result == float(strrep)
  597. @pytest.mark.parametrize(
  598. "values, expected",
  599. [
  600. (["1", "2", None], Series([1, 2, np.nan], dtype="Int64")),
  601. (["1", "2", "3"], Series([1, 2, 3], dtype="Int64")),
  602. (["1", "2", 3], Series([1, 2, 3], dtype="Int64")),
  603. (["1", "2", 3.5], Series([1, 2, 3.5], dtype="Float64")),
  604. (["1", None, 3.5], Series([1, np.nan, 3.5], dtype="Float64")),
  605. (["1", "2", "3.5"], Series([1, 2, 3.5], dtype="Float64")),
  606. ],
  607. )
  608. def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected):
  609. # https://github.com/pandas-dev/pandas/issues/37262
  610. s = Series(values, dtype=nullable_string_dtype)
  611. result = to_numeric(s)
  612. tm.assert_series_equal(result, expected)
  613. def test_to_numeric_from_nullable_string_coerce(nullable_string_dtype):
  614. # GH#52146
  615. values = ["a", "1"]
  616. ser = Series(values, dtype=nullable_string_dtype)
  617. result = to_numeric(ser, errors="coerce")
  618. expected = Series([pd.NA, 1], dtype="Int64")
  619. tm.assert_series_equal(result, expected)
  620. def test_to_numeric_from_nullable_string_ignore(nullable_string_dtype):
  621. # GH#52146
  622. values = ["a", "1"]
  623. ser = Series(values, dtype=nullable_string_dtype)
  624. expected = ser.copy()
  625. msg = "errors='ignore' is deprecated"
  626. with tm.assert_produces_warning(FutureWarning, match=msg):
  627. result = to_numeric(ser, errors="ignore")
  628. tm.assert_series_equal(result, expected)
  629. @pytest.mark.parametrize(
  630. "data, input_dtype, downcast, expected_dtype",
  631. (
  632. ([1, 1], "Int64", "integer", "Int8"),
  633. ([1.0, pd.NA], "Float64", "integer", "Int8"),
  634. ([1.0, 1.1], "Float64", "integer", "Float64"),
  635. ([1, pd.NA], "Int64", "integer", "Int8"),
  636. ([450, 300], "Int64", "integer", "Int16"),
  637. ([1, 1], "Float64", "integer", "Int8"),
  638. ([np.iinfo(np.int64).max - 1, 1], "Int64", "integer", "Int64"),
  639. ([1, 1], "Int64", "signed", "Int8"),
  640. ([1.0, 1.0], "Float32", "signed", "Int8"),
  641. ([1.0, 1.1], "Float64", "signed", "Float64"),
  642. ([1, pd.NA], "Int64", "signed", "Int8"),
  643. ([450, -300], "Int64", "signed", "Int16"),
  644. ([np.iinfo(np.uint64).max - 1, 1], "UInt64", "signed", "UInt64"),
  645. ([1, 1], "Int64", "unsigned", "UInt8"),
  646. ([1.0, 1.0], "Float32", "unsigned", "UInt8"),
  647. ([1.0, 1.1], "Float64", "unsigned", "Float64"),
  648. ([1, pd.NA], "Int64", "unsigned", "UInt8"),
  649. ([450, -300], "Int64", "unsigned", "Int64"),
  650. ([-1, -1], "Int32", "unsigned", "Int32"),
  651. ([1, 1], "Float64", "float", "Float32"),
  652. ([1, 1.1], "Float64", "float", "Float32"),
  653. ([1, 1], "Float32", "float", "Float32"),
  654. ([1, 1.1], "Float32", "float", "Float32"),
  655. ),
  656. )
  657. def test_downcast_nullable_numeric(data, input_dtype, downcast, expected_dtype):
  658. arr = pd.array(data, dtype=input_dtype)
  659. result = to_numeric(arr, downcast=downcast)
  660. expected = pd.array(data, dtype=expected_dtype)
  661. tm.assert_extension_array_equal(result, expected)
  662. def test_downcast_nullable_mask_is_copied():
  663. # GH38974
  664. arr = pd.array([1, 2, pd.NA], dtype="Int64")
  665. result = to_numeric(arr, downcast="integer")
  666. expected = pd.array([1, 2, pd.NA], dtype="Int8")
  667. tm.assert_extension_array_equal(result, expected)
  668. arr[1] = pd.NA # should not modify result
  669. tm.assert_extension_array_equal(result, expected)
  670. def test_to_numeric_scientific_notation():
  671. # GH 15898
  672. result = to_numeric("1.7e+308")
  673. expected = np.float64(1.7e308)
  674. assert result == expected
  675. @pytest.mark.parametrize("val", [9876543210.0, 2.0**128])
  676. def test_to_numeric_large_float_not_downcast_to_float_32(val):
  677. # GH 19729
  678. expected = Series([val])
  679. result = to_numeric(expected, downcast="float")
  680. tm.assert_series_equal(result, expected)
  681. @pytest.mark.parametrize(
  682. "val, dtype", [(1, "Int64"), (1.5, "Float64"), (True, "boolean")]
  683. )
  684. def test_to_numeric_dtype_backend(val, dtype):
  685. # GH#50505
  686. ser = Series([val], dtype=object)
  687. result = to_numeric(ser, dtype_backend="numpy_nullable")
  688. expected = Series([val], dtype=dtype)
  689. tm.assert_series_equal(result, expected)
  690. @pytest.mark.parametrize(
  691. "val, dtype",
  692. [
  693. (1, "Int64"),
  694. (1.5, "Float64"),
  695. (True, "boolean"),
  696. (1, "int64[pyarrow]"),
  697. (1.5, "float64[pyarrow]"),
  698. (True, "bool[pyarrow]"),
  699. ],
  700. )
  701. def test_to_numeric_dtype_backend_na(val, dtype):
  702. # GH#50505
  703. if "pyarrow" in dtype:
  704. pytest.importorskip("pyarrow")
  705. dtype_backend = "pyarrow"
  706. else:
  707. dtype_backend = "numpy_nullable"
  708. ser = Series([val, None], dtype=object)
  709. result = to_numeric(ser, dtype_backend=dtype_backend)
  710. expected = Series([val, pd.NA], dtype=dtype)
  711. tm.assert_series_equal(result, expected)
  712. @pytest.mark.parametrize(
  713. "val, dtype, downcast",
  714. [
  715. (1, "Int8", "integer"),
  716. (1.5, "Float32", "float"),
  717. (1, "Int8", "signed"),
  718. (1, "int8[pyarrow]", "integer"),
  719. (1.5, "float[pyarrow]", "float"),
  720. (1, "int8[pyarrow]", "signed"),
  721. ],
  722. )
  723. def test_to_numeric_dtype_backend_downcasting(val, dtype, downcast):
  724. # GH#50505
  725. if "pyarrow" in dtype:
  726. pytest.importorskip("pyarrow")
  727. dtype_backend = "pyarrow"
  728. else:
  729. dtype_backend = "numpy_nullable"
  730. ser = Series([val, None], dtype=object)
  731. result = to_numeric(ser, dtype_backend=dtype_backend, downcast=downcast)
  732. expected = Series([val, pd.NA], dtype=dtype)
  733. tm.assert_series_equal(result, expected)
  734. @pytest.mark.parametrize(
  735. "smaller, dtype_backend",
  736. [["UInt8", "numpy_nullable"], ["uint8[pyarrow]", "pyarrow"]],
  737. )
  738. def test_to_numeric_dtype_backend_downcasting_uint(smaller, dtype_backend):
  739. # GH#50505
  740. if dtype_backend == "pyarrow":
  741. pytest.importorskip("pyarrow")
  742. ser = Series([1, pd.NA], dtype="UInt64")
  743. result = to_numeric(ser, dtype_backend=dtype_backend, downcast="unsigned")
  744. expected = Series([1, pd.NA], dtype=smaller)
  745. tm.assert_series_equal(result, expected)
  746. @pytest.mark.parametrize(
  747. "dtype",
  748. [
  749. "Int64",
  750. "UInt64",
  751. "Float64",
  752. "boolean",
  753. "int64[pyarrow]",
  754. "uint64[pyarrow]",
  755. "float64[pyarrow]",
  756. "bool[pyarrow]",
  757. ],
  758. )
  759. def test_to_numeric_dtype_backend_already_nullable(dtype):
  760. # GH#50505
  761. if "pyarrow" in dtype:
  762. pytest.importorskip("pyarrow")
  763. ser = Series([1, pd.NA], dtype=dtype)
  764. result = to_numeric(ser, dtype_backend="numpy_nullable")
  765. expected = Series([1, pd.NA], dtype=dtype)
  766. tm.assert_series_equal(result, expected)
  767. def test_to_numeric_dtype_backend_error(dtype_backend):
  768. # GH#50505
  769. ser = Series(["a", "b", ""])
  770. expected = ser.copy()
  771. with pytest.raises(ValueError, match="Unable to parse string"):
  772. to_numeric(ser, dtype_backend=dtype_backend)
  773. msg = "errors='ignore' is deprecated"
  774. with tm.assert_produces_warning(FutureWarning, match=msg):
  775. result = to_numeric(ser, dtype_backend=dtype_backend, errors="ignore")
  776. tm.assert_series_equal(result, expected)
  777. result = to_numeric(ser, dtype_backend=dtype_backend, errors="coerce")
  778. if dtype_backend == "pyarrow":
  779. dtype = "double[pyarrow]"
  780. else:
  781. dtype = "Float64"
  782. expected = Series([np.nan, np.nan, np.nan], dtype=dtype)
  783. tm.assert_series_equal(result, expected)
  784. def test_invalid_dtype_backend():
  785. ser = Series([1, 2, 3])
  786. msg = (
  787. "dtype_backend numpy is invalid, only 'numpy_nullable' and "
  788. "'pyarrow' are allowed."
  789. )
  790. with pytest.raises(ValueError, match=msg):
  791. to_numeric(ser, dtype_backend="numpy")
  792. def test_coerce_pyarrow_backend():
  793. # GH 52588
  794. pa = pytest.importorskip("pyarrow")
  795. ser = Series(list("12x"), dtype=ArrowDtype(pa.string()))
  796. result = to_numeric(ser, errors="coerce", dtype_backend="pyarrow")
  797. expected = Series([1, 2, None], dtype=ArrowDtype(pa.int64()))
  798. tm.assert_series_equal(result, expected)