# test_missing.py
  1. from datetime import timedelta
  2. import numpy as np
  3. import pytest
  4. from pandas._libs import iNaT
  5. import pandas as pd
  6. from pandas import (
  7. Categorical,
  8. Index,
  9. NaT,
  10. Series,
  11. isna,
  12. )
  13. import pandas._testing as tm
  14. class TestSeriesMissingData:
  15. def test_categorical_nan_handling(self):
  16. # NaNs are represented as -1 in labels
  17. s = Series(Categorical(["a", "b", np.nan, "a"]))
  18. tm.assert_index_equal(s.cat.categories, Index(["a", "b"]))
  19. tm.assert_numpy_array_equal(
  20. s.values.codes, np.array([0, 1, -1, 0], dtype=np.int8)
  21. )
  22. def test_isna_for_inf(self):
  23. s = Series(["a", np.inf, np.nan, pd.NA, 1.0])
  24. msg = "use_inf_as_na option is deprecated"
  25. with tm.assert_produces_warning(FutureWarning, match=msg):
  26. with pd.option_context("mode.use_inf_as_na", True):
  27. r = s.isna()
  28. dr = s.dropna()
  29. e = Series([False, True, True, True, False])
  30. de = Series(["a", 1.0], index=[0, 4])
  31. tm.assert_series_equal(r, e)
  32. tm.assert_series_equal(dr, de)
  33. def test_timedelta64_nan(self):
  34. td = Series([timedelta(days=i) for i in range(10)])
  35. # nan ops on timedeltas
  36. td1 = td.copy()
  37. td1[0] = np.nan
  38. assert isna(td1[0])
  39. assert td1[0]._value == iNaT
  40. td1[0] = td[0]
  41. assert not isna(td1[0])
  42. # GH#16674 iNaT is treated as an integer when given by the user
  43. with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
  44. td1[1] = iNaT
  45. assert not isna(td1[1])
  46. assert td1.dtype == np.object_
  47. assert td1[1] == iNaT
  48. td1[1] = td[1]
  49. assert not isna(td1[1])
  50. td1[2] = NaT
  51. assert isna(td1[2])
  52. assert td1[2]._value == iNaT
  53. td1[2] = td[2]
  54. assert not isna(td1[2])
  55. # boolean setting
  56. # GH#2899 boolean setting
  57. td3 = np.timedelta64(timedelta(days=3))
  58. td7 = np.timedelta64(timedelta(days=7))
  59. td[(td > td3) & (td < td7)] = np.nan
  60. assert isna(td).sum() == 3
  61. @pytest.mark.xfail(
  62. reason="Chained inequality raises when trying to define 'selector'"
  63. )
  64. def test_logical_range_select(self, datetime_series):
  65. # NumPy limitation =(
  66. # https://github.com/pandas-dev/pandas/commit/9030dc021f07c76809848925cb34828f6c8484f3
  67. selector = -0.5 <= datetime_series <= 0.5
  68. expected = (datetime_series >= -0.5) & (datetime_series <= 0.5)
  69. tm.assert_series_equal(selector, expected)
  70. def test_valid(self, datetime_series):
  71. ts = datetime_series.copy()
  72. ts.index = ts.index._with_freq(None)
  73. ts[::2] = np.nan
  74. result = ts.dropna()
  75. assert len(result) == ts.count()
  76. tm.assert_series_equal(result, ts[1::2])
  77. tm.assert_series_equal(result, ts[pd.notna(ts)])
  78. def test_hasnans_uncached_for_series():
  79. # GH#19700
  80. # set float64 dtype to avoid upcast when setting nan
  81. idx = Index([0, 1], dtype="float64")
  82. assert idx.hasnans is False
  83. assert "hasnans" in idx._cache
  84. ser = idx.to_series()
  85. assert ser.hasnans is False
  86. assert not hasattr(ser, "_cache")
  87. ser.iloc[-1] = np.nan
  88. assert ser.hasnans is True