# test_algos.py (77 KB)
  1. from datetime import datetime
  2. import struct
  3. import numpy as np
  4. import pytest
  5. from pandas._config import using_string_dtype
  6. from pandas._libs import (
  7. algos as libalgos,
  8. hashtable as ht,
  9. )
  10. from pandas.core.dtypes.common import (
  11. is_bool_dtype,
  12. is_complex_dtype,
  13. is_float_dtype,
  14. is_integer_dtype,
  15. is_object_dtype,
  16. )
  17. from pandas.core.dtypes.dtypes import CategoricalDtype
  18. import pandas as pd
  19. from pandas import (
  20. Categorical,
  21. CategoricalIndex,
  22. DataFrame,
  23. DatetimeIndex,
  24. Index,
  25. IntervalIndex,
  26. MultiIndex,
  27. NaT,
  28. Period,
  29. PeriodIndex,
  30. Series,
  31. Timedelta,
  32. Timestamp,
  33. cut,
  34. date_range,
  35. timedelta_range,
  36. to_datetime,
  37. to_timedelta,
  38. )
  39. import pandas._testing as tm
  40. import pandas.core.algorithms as algos
  41. from pandas.core.arrays import (
  42. DatetimeArray,
  43. TimedeltaArray,
  44. )
  45. import pandas.core.common as com
  46. class TestFactorize:
  47. def test_factorize_complex(self):
  48. # GH#17927
  49. array = [1, 2, 2 + 1j]
  50. msg = "factorize with argument that is not not a Series"
  51. with tm.assert_produces_warning(FutureWarning, match=msg):
  52. labels, uniques = algos.factorize(array)
  53. expected_labels = np.array([0, 1, 2], dtype=np.intp)
  54. tm.assert_numpy_array_equal(labels, expected_labels)
  55. # Should return a complex dtype in the future
  56. expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=object)
  57. tm.assert_numpy_array_equal(uniques, expected_uniques)
  58. @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
  59. @pytest.mark.parametrize("sort", [True, False])
  60. def test_factorize(self, index_or_series_obj, sort):
  61. obj = index_or_series_obj
  62. result_codes, result_uniques = obj.factorize(sort=sort)
  63. constructor = Index
  64. if isinstance(obj, MultiIndex):
  65. constructor = MultiIndex.from_tuples
  66. expected_arr = obj.unique()
  67. if expected_arr.dtype == np.float16:
  68. expected_arr = expected_arr.astype(np.float32)
  69. expected_uniques = constructor(expected_arr)
  70. if (
  71. isinstance(obj, Index)
  72. and expected_uniques.dtype == bool
  73. and obj.dtype == object
  74. ):
  75. expected_uniques = expected_uniques.astype(object)
  76. if sort:
  77. expected_uniques = expected_uniques.sort_values()
  78. # construct an integer ndarray so that
  79. # `expected_uniques.take(expected_codes)` is equal to `obj`
  80. expected_uniques_list = list(expected_uniques)
  81. expected_codes = [expected_uniques_list.index(val) for val in obj]
  82. expected_codes = np.asarray(expected_codes, dtype=np.intp)
  83. tm.assert_numpy_array_equal(result_codes, expected_codes)
  84. tm.assert_index_equal(result_uniques, expected_uniques, exact=True)
  85. def test_series_factorize_use_na_sentinel_false(self):
  86. # GH#35667
  87. values = np.array([1, 2, 1, np.nan])
  88. ser = Series(values)
  89. codes, uniques = ser.factorize(use_na_sentinel=False)
  90. expected_codes = np.array([0, 1, 0, 2], dtype=np.intp)
  91. expected_uniques = Index([1.0, 2.0, np.nan])
  92. tm.assert_numpy_array_equal(codes, expected_codes)
  93. tm.assert_index_equal(uniques, expected_uniques)
  94. def test_basic(self):
  95. items = np.array(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object)
  96. codes, uniques = algos.factorize(items)
  97. tm.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object))
  98. codes, uniques = algos.factorize(items, sort=True)
  99. exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp)
  100. tm.assert_numpy_array_equal(codes, exp)
  101. exp = np.array(["a", "b", "c"], dtype=object)
  102. tm.assert_numpy_array_equal(uniques, exp)
  103. arr = np.arange(5, dtype=np.intp)[::-1]
  104. codes, uniques = algos.factorize(arr)
  105. exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
  106. tm.assert_numpy_array_equal(codes, exp)
  107. exp = np.array([4, 3, 2, 1, 0], dtype=arr.dtype)
  108. tm.assert_numpy_array_equal(uniques, exp)
  109. codes, uniques = algos.factorize(arr, sort=True)
  110. exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
  111. tm.assert_numpy_array_equal(codes, exp)
  112. exp = np.array([0, 1, 2, 3, 4], dtype=arr.dtype)
  113. tm.assert_numpy_array_equal(uniques, exp)
  114. arr = np.arange(5.0)[::-1]
  115. codes, uniques = algos.factorize(arr)
  116. exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
  117. tm.assert_numpy_array_equal(codes, exp)
  118. exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=arr.dtype)
  119. tm.assert_numpy_array_equal(uniques, exp)
  120. codes, uniques = algos.factorize(arr, sort=True)
  121. exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
  122. tm.assert_numpy_array_equal(codes, exp)
  123. exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=arr.dtype)
  124. tm.assert_numpy_array_equal(uniques, exp)
  125. def test_mixed(self):
  126. # doc example reshaping.rst
  127. x = Series(["A", "A", np.nan, "B", 3.14, np.inf])
  128. codes, uniques = algos.factorize(x)
  129. exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
  130. tm.assert_numpy_array_equal(codes, exp)
  131. exp = Index(["A", "B", 3.14, np.inf])
  132. tm.assert_index_equal(uniques, exp)
  133. codes, uniques = algos.factorize(x, sort=True)
  134. exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
  135. tm.assert_numpy_array_equal(codes, exp)
  136. exp = Index([3.14, np.inf, "A", "B"])
  137. tm.assert_index_equal(uniques, exp)
  138. def test_factorize_datetime64(self):
  139. # M8
  140. v1 = Timestamp("20130101 09:00:00.00004")
  141. v2 = Timestamp("20130101")
  142. x = Series([v1, v1, v1, v2, v2, v1])
  143. codes, uniques = algos.factorize(x)
  144. exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
  145. tm.assert_numpy_array_equal(codes, exp)
  146. exp = DatetimeIndex([v1, v2])
  147. tm.assert_index_equal(uniques, exp)
  148. codes, uniques = algos.factorize(x, sort=True)
  149. exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp)
  150. tm.assert_numpy_array_equal(codes, exp)
  151. exp = DatetimeIndex([v2, v1])
  152. tm.assert_index_equal(uniques, exp)
  153. def test_factorize_period(self):
  154. # period
  155. v1 = Period("201302", freq="M")
  156. v2 = Period("201303", freq="M")
  157. x = Series([v1, v1, v1, v2, v2, v1])
  158. # periods are not 'sorted' as they are converted back into an index
  159. codes, uniques = algos.factorize(x)
  160. exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
  161. tm.assert_numpy_array_equal(codes, exp)
  162. tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))
  163. codes, uniques = algos.factorize(x, sort=True)
  164. exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
  165. tm.assert_numpy_array_equal(codes, exp)
  166. tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))
  167. def test_factorize_timedelta(self):
  168. # GH 5986
  169. v1 = to_timedelta("1 day 1 min")
  170. v2 = to_timedelta("1 day")
  171. x = Series([v1, v2, v1, v1, v2, v2, v1])
  172. codes, uniques = algos.factorize(x)
  173. exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp)
  174. tm.assert_numpy_array_equal(codes, exp)
  175. tm.assert_index_equal(uniques, to_timedelta([v1, v2]))
  176. codes, uniques = algos.factorize(x, sort=True)
  177. exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp)
  178. tm.assert_numpy_array_equal(codes, exp)
  179. tm.assert_index_equal(uniques, to_timedelta([v2, v1]))
  180. def test_factorize_nan(self):
  181. # nan should map to na_sentinel, not reverse_indexer[na_sentinel]
  182. # rizer.factorize should not raise an exception if na_sentinel indexes
  183. # outside of reverse_indexer
  184. key = np.array([1, 2, 1, np.nan], dtype="O")
  185. rizer = ht.ObjectFactorizer(len(key))
  186. for na_sentinel in (-1, 20):
  187. ids = rizer.factorize(key, na_sentinel=na_sentinel)
  188. expected = np.array([0, 1, 0, na_sentinel], dtype=np.intp)
  189. assert len(set(key)) == len(set(expected))
  190. tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel)
  191. tm.assert_numpy_array_equal(ids, expected)
  192. def test_factorizer_with_mask(self):
  193. # GH#49549
  194. data = np.array([1, 2, 3, 1, 1, 0], dtype="int64")
  195. mask = np.array([False, False, False, False, False, True])
  196. rizer = ht.Int64Factorizer(len(data))
  197. result = rizer.factorize(data, mask=mask)
  198. expected = np.array([0, 1, 2, 0, 0, -1], dtype=np.intp)
  199. tm.assert_numpy_array_equal(result, expected)
  200. expected_uniques = np.array([1, 2, 3], dtype="int64")
  201. tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques)
  202. def test_factorizer_object_with_nan(self):
  203. # GH#49549
  204. data = np.array([1, 2, 3, 1, np.nan])
  205. rizer = ht.ObjectFactorizer(len(data))
  206. result = rizer.factorize(data.astype(object))
  207. expected = np.array([0, 1, 2, 0, -1], dtype=np.intp)
  208. tm.assert_numpy_array_equal(result, expected)
  209. expected_uniques = np.array([1, 2, 3], dtype=object)
  210. tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques)
  211. @pytest.mark.parametrize(
  212. "data, expected_codes, expected_uniques",
  213. [
  214. (
  215. [(1, 1), (1, 2), (0, 0), (1, 2), "nonsense"],
  216. [0, 1, 2, 1, 3],
  217. [(1, 1), (1, 2), (0, 0), "nonsense"],
  218. ),
  219. (
  220. [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)],
  221. [0, 1, 2, 1, 3],
  222. [(1, 1), (1, 2), (0, 0), (1, 2, 3)],
  223. ),
  224. ([(1, 1), (1, 2), (0, 0), (1, 2)], [0, 1, 2, 1], [(1, 1), (1, 2), (0, 0)]),
  225. ],
  226. )
  227. def test_factorize_tuple_list(self, data, expected_codes, expected_uniques):
  228. # GH9454
  229. msg = "factorize with argument that is not not a Series"
  230. with tm.assert_produces_warning(FutureWarning, match=msg):
  231. codes, uniques = pd.factorize(data)
  232. tm.assert_numpy_array_equal(codes, np.array(expected_codes, dtype=np.intp))
  233. expected_uniques_array = com.asarray_tuplesafe(expected_uniques, dtype=object)
  234. tm.assert_numpy_array_equal(uniques, expected_uniques_array)
  235. def test_complex_sorting(self):
  236. # gh 12666 - check no segfault
  237. x17 = np.array([complex(i) for i in range(17)], dtype=object)
  238. msg = "'[<>]' not supported between instances of .*"
  239. with pytest.raises(TypeError, match=msg):
  240. algos.factorize(x17[::-1], sort=True)
  241. def test_numeric_dtype_factorize(self, any_real_numpy_dtype):
  242. # GH41132
  243. dtype = any_real_numpy_dtype
  244. data = np.array([1, 2, 2, 1], dtype=dtype)
  245. expected_codes = np.array([0, 1, 1, 0], dtype=np.intp)
  246. expected_uniques = np.array([1, 2], dtype=dtype)
  247. codes, uniques = algos.factorize(data)
  248. tm.assert_numpy_array_equal(codes, expected_codes)
  249. tm.assert_numpy_array_equal(uniques, expected_uniques)
  250. def test_float64_factorize(self, writable):
  251. data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
  252. data.setflags(write=writable)
  253. expected_codes = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp)
  254. expected_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64)
  255. codes, uniques = algos.factorize(data)
  256. tm.assert_numpy_array_equal(codes, expected_codes)
  257. tm.assert_numpy_array_equal(uniques, expected_uniques)
  258. def test_uint64_factorize(self, writable):
  259. data = np.array([2**64 - 1, 1, 2**64 - 1], dtype=np.uint64)
  260. data.setflags(write=writable)
  261. expected_codes = np.array([0, 1, 0], dtype=np.intp)
  262. expected_uniques = np.array([2**64 - 1, 1], dtype=np.uint64)
  263. codes, uniques = algos.factorize(data)
  264. tm.assert_numpy_array_equal(codes, expected_codes)
  265. tm.assert_numpy_array_equal(uniques, expected_uniques)
  266. def test_int64_factorize(self, writable):
  267. data = np.array([2**63 - 1, -(2**63), 2**63 - 1], dtype=np.int64)
  268. data.setflags(write=writable)
  269. expected_codes = np.array([0, 1, 0], dtype=np.intp)
  270. expected_uniques = np.array([2**63 - 1, -(2**63)], dtype=np.int64)
  271. codes, uniques = algos.factorize(data)
  272. tm.assert_numpy_array_equal(codes, expected_codes)
  273. tm.assert_numpy_array_equal(uniques, expected_uniques)
  274. def test_string_factorize(self, writable):
  275. data = np.array(["a", "c", "a", "b", "c"], dtype=object)
  276. data.setflags(write=writable)
  277. expected_codes = np.array([0, 1, 0, 2, 1], dtype=np.intp)
  278. expected_uniques = np.array(["a", "c", "b"], dtype=object)
  279. codes, uniques = algos.factorize(data)
  280. tm.assert_numpy_array_equal(codes, expected_codes)
  281. tm.assert_numpy_array_equal(uniques, expected_uniques)
  282. def test_object_factorize(self, writable):
  283. data = np.array(["a", "c", None, np.nan, "a", "b", NaT, "c"], dtype=object)
  284. data.setflags(write=writable)
  285. expected_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
  286. expected_uniques = np.array(["a", "c", "b"], dtype=object)
  287. codes, uniques = algos.factorize(data)
  288. tm.assert_numpy_array_equal(codes, expected_codes)
  289. tm.assert_numpy_array_equal(uniques, expected_uniques)
  290. def test_datetime64_factorize(self, writable):
  291. # GH35650 Verify whether read-only datetime64 array can be factorized
  292. data = np.array([np.datetime64("2020-01-01T00:00:00.000")], dtype="M8[ns]")
  293. data.setflags(write=writable)
  294. expected_codes = np.array([0], dtype=np.intp)
  295. expected_uniques = np.array(
  296. ["2020-01-01T00:00:00.000000000"], dtype="datetime64[ns]"
  297. )
  298. codes, uniques = pd.factorize(data)
  299. tm.assert_numpy_array_equal(codes, expected_codes)
  300. tm.assert_numpy_array_equal(uniques, expected_uniques)
  301. @pytest.mark.parametrize("sort", [True, False])
  302. def test_factorize_rangeindex(self, sort):
  303. # increasing -> sort doesn't matter
  304. ri = pd.RangeIndex.from_range(range(10))
  305. expected = np.arange(10, dtype=np.intp), ri
  306. result = algos.factorize(ri, sort=sort)
  307. tm.assert_numpy_array_equal(result[0], expected[0])
  308. tm.assert_index_equal(result[1], expected[1], exact=True)
  309. result = ri.factorize(sort=sort)
  310. tm.assert_numpy_array_equal(result[0], expected[0])
  311. tm.assert_index_equal(result[1], expected[1], exact=True)
  312. @pytest.mark.parametrize("sort", [True, False])
  313. def test_factorize_rangeindex_decreasing(self, sort):
  314. # decreasing -> sort matters
  315. ri = pd.RangeIndex.from_range(range(10))
  316. expected = np.arange(10, dtype=np.intp), ri
  317. ri2 = ri[::-1]
  318. expected = expected[0], ri2
  319. if sort:
  320. expected = expected[0][::-1], expected[1][::-1]
  321. result = algos.factorize(ri2, sort=sort)
  322. tm.assert_numpy_array_equal(result[0], expected[0])
  323. tm.assert_index_equal(result[1], expected[1], exact=True)
  324. result = ri2.factorize(sort=sort)
  325. tm.assert_numpy_array_equal(result[0], expected[0])
  326. tm.assert_index_equal(result[1], expected[1], exact=True)
  327. def test_deprecate_order(self):
  328. # gh 19727 - check warning is raised for deprecated keyword, order.
  329. # Test not valid once order keyword is removed.
  330. data = np.array([2**63, 1, 2**63], dtype=np.uint64)
  331. with pytest.raises(TypeError, match="got an unexpected keyword"):
  332. algos.factorize(data, order=True)
  333. with tm.assert_produces_warning(False):
  334. algos.factorize(data)
  335. @pytest.mark.parametrize(
  336. "data",
  337. [
  338. np.array([0, 1, 0], dtype="u8"),
  339. np.array([-(2**63), 1, -(2**63)], dtype="i8"),
  340. np.array(["__nan__", "foo", "__nan__"], dtype="object"),
  341. ],
  342. )
  343. def test_parametrized_factorize_na_value_default(self, data):
  344. # arrays that include the NA default for that type, but isn't used.
  345. codes, uniques = algos.factorize(data)
  346. expected_uniques = data[[0, 1]]
  347. expected_codes = np.array([0, 1, 0], dtype=np.intp)
  348. tm.assert_numpy_array_equal(codes, expected_codes)
  349. tm.assert_numpy_array_equal(uniques, expected_uniques)
  350. @pytest.mark.parametrize(
  351. "data, na_value",
  352. [
  353. (np.array([0, 1, 0, 2], dtype="u8"), 0),
  354. (np.array([1, 0, 1, 2], dtype="u8"), 1),
  355. (np.array([-(2**63), 1, -(2**63), 0], dtype="i8"), -(2**63)),
  356. (np.array([1, -(2**63), 1, 0], dtype="i8"), 1),
  357. (np.array(["a", "", "a", "b"], dtype=object), "a"),
  358. (np.array([(), ("a", 1), (), ("a", 2)], dtype=object), ()),
  359. (np.array([("a", 1), (), ("a", 1), ("a", 2)], dtype=object), ("a", 1)),
  360. ],
  361. )
  362. def test_parametrized_factorize_na_value(self, data, na_value):
  363. codes, uniques = algos.factorize_array(data, na_value=na_value)
  364. expected_uniques = data[[1, 3]]
  365. expected_codes = np.array([-1, 0, -1, 1], dtype=np.intp)
  366. tm.assert_numpy_array_equal(codes, expected_codes)
  367. tm.assert_numpy_array_equal(uniques, expected_uniques)
  368. @pytest.mark.parametrize("sort", [True, False])
  369. @pytest.mark.parametrize(
  370. "data, uniques",
  371. [
  372. (
  373. np.array(["b", "a", None, "b"], dtype=object),
  374. np.array(["b", "a"], dtype=object),
  375. ),
  376. (
  377. pd.array([2, 1, np.nan, 2], dtype="Int64"),
  378. pd.array([2, 1], dtype="Int64"),
  379. ),
  380. ],
  381. ids=["numpy_array", "extension_array"],
  382. )
  383. def test_factorize_use_na_sentinel(self, sort, data, uniques):
  384. codes, uniques = algos.factorize(data, sort=sort, use_na_sentinel=True)
  385. if sort:
  386. expected_codes = np.array([1, 0, -1, 1], dtype=np.intp)
  387. expected_uniques = algos.safe_sort(uniques)
  388. else:
  389. expected_codes = np.array([0, 1, -1, 0], dtype=np.intp)
  390. expected_uniques = uniques
  391. tm.assert_numpy_array_equal(codes, expected_codes)
  392. if isinstance(data, np.ndarray):
  393. tm.assert_numpy_array_equal(uniques, expected_uniques)
  394. else:
  395. tm.assert_extension_array_equal(uniques, expected_uniques)
  396. @pytest.mark.parametrize(
  397. "data, expected_codes, expected_uniques",
  398. [
  399. (
  400. ["a", None, "b", "a"],
  401. np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
  402. np.array(["a", np.nan, "b"], dtype=object),
  403. ),
  404. (
  405. ["a", np.nan, "b", "a"],
  406. np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
  407. np.array(["a", np.nan, "b"], dtype=object),
  408. ),
  409. ],
  410. )
  411. def test_object_factorize_use_na_sentinel_false(
  412. self, data, expected_codes, expected_uniques
  413. ):
  414. codes, uniques = algos.factorize(
  415. np.array(data, dtype=object), use_na_sentinel=False
  416. )
  417. tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
  418. tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)
  419. @pytest.mark.parametrize(
  420. "data, expected_codes, expected_uniques",
  421. [
  422. (
  423. [1, None, 1, 2],
  424. np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
  425. np.array([1, np.nan, 2], dtype="O"),
  426. ),
  427. (
  428. [1, np.nan, 1, 2],
  429. np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
  430. np.array([1, np.nan, 2], dtype=np.float64),
  431. ),
  432. ],
  433. )
  434. def test_int_factorize_use_na_sentinel_false(
  435. self, data, expected_codes, expected_uniques
  436. ):
  437. msg = "factorize with argument that is not not a Series"
  438. with tm.assert_produces_warning(FutureWarning, match=msg):
  439. codes, uniques = algos.factorize(data, use_na_sentinel=False)
  440. tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
  441. tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)
  442. @pytest.mark.parametrize(
  443. "data, expected_codes, expected_uniques",
  444. [
  445. (
  446. Index(Categorical(["a", "a", "b"])),
  447. np.array([0, 0, 1], dtype=np.intp),
  448. CategoricalIndex(["a", "b"], categories=["a", "b"], dtype="category"),
  449. ),
  450. (
  451. Series(Categorical(["a", "a", "b"])),
  452. np.array([0, 0, 1], dtype=np.intp),
  453. CategoricalIndex(["a", "b"], categories=["a", "b"], dtype="category"),
  454. ),
  455. (
  456. Series(DatetimeIndex(["2017", "2017"], tz="US/Eastern")),
  457. np.array([0, 0], dtype=np.intp),
  458. DatetimeIndex(["2017"], tz="US/Eastern"),
  459. ),
  460. ],
  461. )
  462. def test_factorize_mixed_values(self, data, expected_codes, expected_uniques):
  463. # GH 19721
  464. codes, uniques = algos.factorize(data)
  465. tm.assert_numpy_array_equal(codes, expected_codes)
  466. tm.assert_index_equal(uniques, expected_uniques)
  467. def test_factorize_interval_non_nano(self, unit):
  468. # GH#56099
  469. left = DatetimeIndex(["2016-01-01", np.nan, "2015-10-11"]).as_unit(unit)
  470. right = DatetimeIndex(["2016-01-02", np.nan, "2015-10-15"]).as_unit(unit)
  471. idx = IntervalIndex.from_arrays(left, right)
  472. codes, cats = idx.factorize()
  473. assert cats.dtype == f"interval[datetime64[{unit}], right]"
  474. ts = Timestamp(0).as_unit(unit)
  475. idx2 = IntervalIndex.from_arrays(left - ts, right - ts)
  476. codes2, cats2 = idx2.factorize()
  477. assert cats2.dtype == f"interval[timedelta64[{unit}], right]"
  478. idx3 = IntervalIndex.from_arrays(
  479. left.tz_localize("US/Pacific"), right.tz_localize("US/Pacific")
  480. )
  481. codes3, cats3 = idx3.factorize()
  482. assert cats3.dtype == f"interval[datetime64[{unit}, US/Pacific], right]"
  483. class TestUnique:
  484. def test_ints(self):
  485. arr = np.random.default_rng(2).integers(0, 100, size=50)
  486. result = algos.unique(arr)
  487. assert isinstance(result, np.ndarray)
  488. def test_objects(self):
  489. arr = np.random.default_rng(2).integers(0, 100, size=50).astype("O")
  490. result = algos.unique(arr)
  491. assert isinstance(result, np.ndarray)
  492. def test_object_refcount_bug(self):
  493. lst = np.array(["A", "B", "C", "D", "E"], dtype=object)
  494. for i in range(1000):
  495. len(algos.unique(lst))
  496. def test_on_index_object(self):
  497. mindex = MultiIndex.from_arrays(
  498. [np.arange(5).repeat(5), np.tile(np.arange(5), 5)]
  499. )
  500. expected = mindex.values
  501. expected.sort()
  502. mindex = mindex.repeat(2)
  503. result = pd.unique(mindex)
  504. result.sort()
  505. tm.assert_almost_equal(result, expected)
  506. def test_dtype_preservation(self, any_numpy_dtype):
  507. # GH 15442
  508. if any_numpy_dtype in (tm.BYTES_DTYPES + tm.STRING_DTYPES):
  509. data = [1, 2, 2]
  510. uniques = [1, 2]
  511. elif is_integer_dtype(any_numpy_dtype):
  512. data = [1, 2, 2]
  513. uniques = [1, 2]
  514. elif is_float_dtype(any_numpy_dtype):
  515. data = [1, 2, 2]
  516. uniques = [1.0, 2.0]
  517. elif is_complex_dtype(any_numpy_dtype):
  518. data = [complex(1, 0), complex(2, 0), complex(2, 0)]
  519. uniques = [complex(1, 0), complex(2, 0)]
  520. elif is_bool_dtype(any_numpy_dtype):
  521. data = [True, True, False]
  522. uniques = [True, False]
  523. elif is_object_dtype(any_numpy_dtype):
  524. data = ["A", "B", "B"]
  525. uniques = ["A", "B"]
  526. else:
  527. # datetime64[ns]/M8[ns]/timedelta64[ns]/m8[ns] tested elsewhere
  528. data = [1, 2, 2]
  529. uniques = [1, 2]
  530. result = Series(data, dtype=any_numpy_dtype).unique()
  531. expected = np.array(uniques, dtype=any_numpy_dtype)
  532. if any_numpy_dtype in tm.STRING_DTYPES:
  533. expected = expected.astype(object)
  534. if expected.dtype.kind in ["m", "M"]:
  535. # We get TimedeltaArray/DatetimeArray
  536. assert isinstance(result, (DatetimeArray, TimedeltaArray))
  537. result = np.array(result)
  538. tm.assert_numpy_array_equal(result, expected)
  539. def test_datetime64_dtype_array_returned(self):
  540. # GH 9431
  541. expected = np.array(
  542. [
  543. "2015-01-03T00:00:00.000000000",
  544. "2015-01-01T00:00:00.000000000",
  545. ],
  546. dtype="M8[ns]",
  547. )
  548. dt_index = to_datetime(
  549. [
  550. "2015-01-03T00:00:00.000000000",
  551. "2015-01-01T00:00:00.000000000",
  552. "2015-01-01T00:00:00.000000000",
  553. ]
  554. )
  555. result = algos.unique(dt_index)
  556. tm.assert_numpy_array_equal(result, expected)
  557. assert result.dtype == expected.dtype
  558. s = Series(dt_index)
  559. result = algos.unique(s)
  560. tm.assert_numpy_array_equal(result, expected)
  561. assert result.dtype == expected.dtype
  562. arr = s.values
  563. result = algos.unique(arr)
  564. tm.assert_numpy_array_equal(result, expected)
  565. assert result.dtype == expected.dtype
  566. def test_datetime_non_ns(self):
  567. a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]")
  568. result = pd.unique(a)
  569. expected = np.array(["2000", "2001"], dtype="datetime64[s]")
  570. tm.assert_numpy_array_equal(result, expected)
  571. def test_timedelta_non_ns(self):
  572. a = np.array(["2000", "2000", "2001"], dtype="timedelta64[s]")
  573. result = pd.unique(a)
  574. expected = np.array([2000, 2001], dtype="timedelta64[s]")
  575. tm.assert_numpy_array_equal(result, expected)
  576. def test_timedelta64_dtype_array_returned(self):
  577. # GH 9431
  578. expected = np.array([31200, 45678, 10000], dtype="m8[ns]")
  579. td_index = to_timedelta([31200, 45678, 31200, 10000, 45678])
  580. result = algos.unique(td_index)
  581. tm.assert_numpy_array_equal(result, expected)
  582. assert result.dtype == expected.dtype
  583. s = Series(td_index)
  584. result = algos.unique(s)
  585. tm.assert_numpy_array_equal(result, expected)
  586. assert result.dtype == expected.dtype
  587. arr = s.values
  588. result = algos.unique(arr)
  589. tm.assert_numpy_array_equal(result, expected)
  590. assert result.dtype == expected.dtype
  591. def test_uint64_overflow(self):
  592. s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
  593. exp = np.array([1, 2, 2**63], dtype=np.uint64)
  594. tm.assert_numpy_array_equal(algos.unique(s), exp)
  595. def test_nan_in_object_array(self):
  596. duplicated_items = ["a", np.nan, "c", "c"]
  597. result = pd.unique(np.array(duplicated_items, dtype=object))
  598. expected = np.array(["a", np.nan, "c"], dtype=object)
  599. tm.assert_numpy_array_equal(result, expected)
  600. def test_categorical(self):
  601. # we are expecting to return in the order
  602. # of appearance
  603. expected = Categorical(list("bac"))
  604. # we are expecting to return in the order
  605. # of the categories
  606. expected_o = Categorical(list("bac"), categories=list("abc"), ordered=True)
  607. # GH 15939
  608. c = Categorical(list("baabc"))
  609. result = c.unique()
  610. tm.assert_categorical_equal(result, expected)
  611. result = algos.unique(c)
  612. tm.assert_categorical_equal(result, expected)
  613. c = Categorical(list("baabc"), ordered=True)
  614. result = c.unique()
  615. tm.assert_categorical_equal(result, expected_o)
  616. result = algos.unique(c)
  617. tm.assert_categorical_equal(result, expected_o)
  618. # Series of categorical dtype
  619. s = Series(Categorical(list("baabc")), name="foo")
  620. result = s.unique()
  621. tm.assert_categorical_equal(result, expected)
  622. result = pd.unique(s)
  623. tm.assert_categorical_equal(result, expected)
  624. # CI -> return CI
  625. ci = CategoricalIndex(Categorical(list("baabc"), categories=list("abc")))
  626. expected = CategoricalIndex(expected)
  627. result = ci.unique()
  628. tm.assert_index_equal(result, expected)
  629. result = pd.unique(ci)
  630. tm.assert_index_equal(result, expected)
  631. def test_datetime64tz_aware(self, unit):
  632. # GH 15939
  633. dti = Index(
  634. [
  635. Timestamp("20160101", tz="US/Eastern"),
  636. Timestamp("20160101", tz="US/Eastern"),
  637. ]
  638. ).as_unit(unit)
  639. ser = Series(dti)
  640. result = ser.unique()
  641. expected = dti[:1]._data
  642. tm.assert_extension_array_equal(result, expected)
  643. result = dti.unique()
  644. expected = dti[:1]
  645. tm.assert_index_equal(result, expected)
  646. result = pd.unique(ser)
  647. expected = dti[:1]._data
  648. tm.assert_extension_array_equal(result, expected)
  649. result = pd.unique(dti)
  650. expected = dti[:1]
  651. tm.assert_index_equal(result, expected)
  652. def test_order_of_appearance(self):
  653. # 9346
  654. # light testing of guarantee of order of appearance
  655. # these also are the doc-examples
  656. result = pd.unique(Series([2, 1, 3, 3]))
  657. tm.assert_numpy_array_equal(result, np.array([2, 1, 3], dtype="int64"))
  658. result = pd.unique(Series([2] + [1] * 5))
  659. tm.assert_numpy_array_equal(result, np.array([2, 1], dtype="int64"))
  660. msg = "unique with argument that is not not a Series, Index,"
  661. with tm.assert_produces_warning(FutureWarning, match=msg):
  662. result = pd.unique(list("aabc"))
  663. expected = np.array(["a", "b", "c"], dtype=object)
  664. tm.assert_numpy_array_equal(result, expected)
  665. result = pd.unique(Series(Categorical(list("aabc"))))
  666. expected = Categorical(list("abc"))
  667. tm.assert_categorical_equal(result, expected)
  668. def test_order_of_appearance_dt64(self, unit):
  669. ser = Series([Timestamp("20160101"), Timestamp("20160101")]).dt.as_unit(unit)
  670. result = pd.unique(ser)
  671. expected = np.array(["2016-01-01T00:00:00.000000000"], dtype=f"M8[{unit}]")
  672. tm.assert_numpy_array_equal(result, expected)
  673. def test_order_of_appearance_dt64tz(self, unit):
  674. dti = DatetimeIndex(
  675. [
  676. Timestamp("20160101", tz="US/Eastern"),
  677. Timestamp("20160101", tz="US/Eastern"),
  678. ]
  679. ).as_unit(unit)
  680. result = pd.unique(dti)
  681. expected = DatetimeIndex(
  682. ["2016-01-01 00:00:00"], dtype=f"datetime64[{unit}, US/Eastern]", freq=None
  683. )
  684. tm.assert_index_equal(result, expected)
  @pytest.mark.parametrize(
      "arg ,expected",
      [
          (("1", "1", "2"), np.array(["1", "2"], dtype=object)),
          (("foo",), np.array(["foo"], dtype=object)),
      ],
  )
  def test_tuple_with_strings(self, arg, expected):
      """A tuple of strings yields an object ndarray of uniques (GH 17108).

      Passing a tuple is expected to emit a FutureWarning.
      """
      warning_pattern = "unique with argument that is not not a Series"
      with tm.assert_produces_warning(FutureWarning, match=warning_pattern):
          uniques = pd.unique(arg)
      tm.assert_numpy_array_equal(uniques, expected)
  698. def test_obj_none_preservation(self):
  699. # GH 20866
  700. arr = np.array(["foo", None], dtype=object)
  701. result = pd.unique(arr)
  702. expected = np.array(["foo", None], dtype=object)
  703. tm.assert_numpy_array_equal(result, expected, strict_nan=True)
  704. def test_signed_zero(self):
  705. # GH 21866
  706. a = np.array([-0.0, 0.0])
  707. result = pd.unique(a)
  708. expected = np.array([-0.0]) # 0.0 and -0.0 are equivalent
  709. tm.assert_numpy_array_equal(result, expected)
  710. def test_different_nans(self):
  711. # GH 21866
  712. # create different nans from bit-patterns:
  713. NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
  714. NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
  715. assert NAN1 != NAN1
  716. assert NAN2 != NAN2
  717. a = np.array([NAN1, NAN2]) # NAN1 and NAN2 are equivalent
  718. result = pd.unique(a)
  719. expected = np.array([np.nan])
  720. tm.assert_numpy_array_equal(result, expected)
  @pytest.mark.parametrize("el_type", [np.float64, object])
  def test_first_nan_kept(self, el_type):
      """Of two NaNs with different bit patterns, the first is the one kept (GH 22295)."""

      def float_from_bits(bits):
          # reinterpret a 64-bit integer pattern as a double
          return struct.unpack("d", struct.pack("=Q", bits))[0]

      bits_for_nan1 = 0xFFF8000000000001
      bits_for_nan2 = 0x7FF8000000000001
      first_nan = float_from_bits(bits_for_nan1)
      second_nan = float_from_bits(bits_for_nan2)
      assert first_nan != first_nan
      assert second_nan != second_nan

      result = pd.unique(np.array([first_nan, second_nan], dtype=el_type))
      assert result.size == 1

      # read the bits back to identify which NaN survived
      kept_bits = struct.unpack("=Q", struct.pack("d", result[0]))[0]
      assert kept_bits == bits_for_nan1
  737. def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixture2):
  738. # GH 22295
  739. if unique_nulls_fixture is unique_nulls_fixture2:
  740. return # skip it, values not unique
  741. a = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=object)
  742. result = pd.unique(a)
  743. assert result.size == 2
  744. assert a[0] is unique_nulls_fixture
  745. assert a[1] is unique_nulls_fixture2
  746. def test_unique_masked(self, any_numeric_ea_dtype):
  747. # GH#48019
  748. ser = Series([1, pd.NA, 2] * 3, dtype=any_numeric_ea_dtype)
  749. result = pd.unique(ser)
  750. expected = pd.array([1, pd.NA, 2], dtype=any_numeric_ea_dtype)
  751. tm.assert_extension_array_equal(result, expected)
  752. def test_nunique_ints(index_or_series_or_array):
  753. # GH#36327
  754. values = index_or_series_or_array(np.random.default_rng(2).integers(0, 20, 30))
  755. result = algos.nunique_ints(values)
  756. expected = len(algos.unique(values))
  757. assert result == expected
  class TestIsin:
      """Tests for ``algos.isin`` and the ``Series``/``DataFrame`` ``isin`` methods."""

      def test_invalid(self):
          """A scalar for either argument raises ``TypeError``."""
          msg = (
              r"only list-like objects are allowed to be passed to isin\(\), "
              r"you passed a `int`"
          )
          with pytest.raises(TypeError, match=msg):
              algos.isin(1, 1)
          with pytest.raises(TypeError, match=msg):
              algos.isin(1, [1])
          with pytest.raises(TypeError, match=msg):
              algos.isin([1], 1)

      def test_basic(self):
          """Membership checks across list, ndarray, Series and set inputs.

          Passing a plain list as the first argument is expected to emit a
          FutureWarning.
          """
          msg = "isin with argument that is not not a Series"
          with tm.assert_produces_warning(FutureWarning, match=msg):
              result = algos.isin([1, 2], [1])
          expected = np.array([True, False])
          tm.assert_numpy_array_equal(result, expected)

          result = algos.isin(np.array([1, 2]), [1])
          expected = np.array([True, False])
          tm.assert_numpy_array_equal(result, expected)

          result = algos.isin(Series([1, 2]), [1])
          expected = np.array([True, False])
          tm.assert_numpy_array_equal(result, expected)

          result = algos.isin(Series([1, 2]), Series([1]))
          expected = np.array([True, False])
          tm.assert_numpy_array_equal(result, expected)

          result = algos.isin(Series([1, 2]), {1})
          expected = np.array([True, False])
          tm.assert_numpy_array_equal(result, expected)

          with tm.assert_produces_warning(FutureWarning, match=msg):
              result = algos.isin(["a", "b"], ["a"])
          expected = np.array([True, False])
          tm.assert_numpy_array_equal(result, expected)

          result = algos.isin(Series(["a", "b"]), Series(["a"]))
          expected = np.array([True, False])
          tm.assert_numpy_array_equal(result, expected)

          result = algos.isin(Series(["a", "b"]), {"a"})
          expected = np.array([True, False])
          tm.assert_numpy_array_equal(result, expected)

          with tm.assert_produces_warning(FutureWarning, match=msg):
              result = algos.isin(["a", "b"], [1])
          expected = np.array([False, False])
          tm.assert_numpy_array_equal(result, expected)

      def test_i8(self):
          """datetime64 and timedelta64 ndarrays against list, slice and set values."""
          datetimelike_arrays = (
              date_range("20130101", periods=3).values,
              timedelta_range("1 day", periods=3).values,
          )
          for arr in datetimelike_arrays:
              result = algos.isin(arr, [arr[0]])
              expected = np.array([True, False, False])
              tm.assert_numpy_array_equal(result, expected)

              result = algos.isin(arr, arr[0:2])
              expected = np.array([True, True, False])
              tm.assert_numpy_array_equal(result, expected)

              result = algos.isin(arr, set(arr[0:2]))
              expected = np.array([True, True, False])
              tm.assert_numpy_array_equal(result, expected)

      @pytest.mark.parametrize("dtype1", ["m8[ns]", "M8[ns]", "M8[ns, UTC]", "period[D]"])
      @pytest.mark.parametrize("dtype", ["i8", "f8", "u8"])
      def test_isin_datetimelike_values_numeric_comps(self, dtype, dtype1):
          """Numeric ``comps`` never match datetimelike ``values``."""
          # Anything but object and we get all-False shortcut

          dta = date_range("2013-01-01", periods=3)._values
          arr = Series(dta.view("i8")).array.view(dtype1)

          comps = arr.view("i8").astype(dtype)

          result = algos.isin(comps, arr)
          expected = np.zeros(comps.shape, dtype=bool)
          tm.assert_numpy_array_equal(result, expected)

      def test_large(self):
          """Two-million-element datetime64 array checked against its first two values."""
          s = date_range("20000101", periods=2000000, freq="s").values
          result = algos.isin(s, s[0:2])
          expected = np.zeros(len(s), dtype=bool)
          expected[0] = True
          expected[1] = True
          tm.assert_numpy_array_equal(result, expected)

      @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]", "period[D]"])
      def test_isin_datetimelike_all_nat(self, dtype):
          """A ``[NaT]`` values list matches the NaT entry of a datetimelike array (GH#56427)."""
          dta = date_range("2013-01-01", periods=3)._values
          arr = Series(dta.view("i8")).array.view(dtype)

          arr[0] = NaT
          result = algos.isin(arr, [NaT])
          expected = np.array([True, False, False], dtype=bool)
          tm.assert_numpy_array_equal(result, expected)

      @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]"])
      def test_isin_datetimelike_strings_deprecated(self, dtype):
          """String values against datetimelike data warn but still match (GH#53111)."""
          dta = date_range("2013-01-01", periods=3)._values
          arr = Series(dta.view("i8")).array.view(dtype)

          vals = [str(x) for x in arr]
          msg = "The behavior of 'isin' with dtype=.* is deprecated"
          with tm.assert_produces_warning(FutureWarning, match=msg):
              res = algos.isin(arr, vals)
          assert res.all()

          vals2 = np.array(vals, dtype=str)
          with tm.assert_produces_warning(FutureWarning, match=msg):
              res2 = algos.isin(arr, vals2)
          assert res2.all()

      def test_isin_dt64tz_with_nat(self):
          """``[NaT]`` matches the NaT entry of tz-aware data (GH#56427)."""
          # the all-NaT values used to get inferred to tznaive, which was evaluated
          # as non-matching GH#56427
          dti = date_range("2016-01-01", periods=3, tz="UTC")
          ser = Series(dti)
          ser[0] = NaT

          res = algos.isin(ser._values, [NaT])
          exp = np.array([True, False, False], dtype=bool)
          tm.assert_numpy_array_equal(res, exp)

      def test_categorical_from_codes(self):
          """``isin`` between Series holding ``from_codes`` categoricals (GH 16639)."""
          vals = np.array([0, 1, 2, 0])
          cats = ["a", "b", "c"]
          Sd = Series(Categorical([1]).from_codes(vals, cats))
          St = Series(Categorical([1]).from_codes(np.array([0, 1]), cats))
          expected = np.array([True, True, False, True])
          result = algos.isin(Sd, St)
          tm.assert_numpy_array_equal(expected, result)

      def test_categorical_isin(self):
          """``isin`` between two Categoricals built with ``from_codes``."""
          vals = np.array([0, 1, 2, 0])
          cats = ["a", "b", "c"]
          cat = Categorical([1]).from_codes(vals, cats)
          other = Categorical([1]).from_codes(np.array([0, 1]), cats)

          expected = np.array([True, True, False, True])
          result = algos.isin(cat, other)
          tm.assert_numpy_array_equal(expected, result)

      def test_same_nan_is_in(self):
          """The same NaN object is found in a list containing it (GH 22160)."""
          # nan is special, because from " a is b" doesn't follow "a == b"
          # at least, isin() should follow python's "np.nan in [nan] == True"
          # casting to -> np.float64 -> another float-object somewhere on
          # the way could lead jeopardize this behavior
          comps = [np.nan]  # could be casted to float64
          values = [np.nan]
          expected = np.array([True])
          msg = "isin with argument that is not not a Series"
          with tm.assert_produces_warning(FutureWarning, match=msg):
              result = algos.isin(comps, values)
          tm.assert_numpy_array_equal(expected, result)

      def test_same_nan_is_in_large(self):
          """NaN is matched in an array of just over one million floats."""
          # https://github.com/pandas-dev/pandas/issues/22205
          s = np.tile(1.0, 1_000_001)
          s[0] = np.nan
          result = algos.isin(s, np.array([np.nan, 1]))
          expected = np.ones(len(s), dtype=bool)
          tm.assert_numpy_array_equal(result, expected)

      def test_same_nan_is_in_large_series(self):
          """NaN is matched by ``Series.isin`` on just over one million floats."""
          # https://github.com/pandas-dev/pandas/issues/22205
          s = np.tile(1.0, 1_000_001)
          # Write the NaN *before* wrapping the array: if the Series holds a
          # copy (e.g. under Copy-on-Write), a later write to ``s`` would
          # never reach it and the NaN case would silently go untested.
          s[0] = np.nan
          series = Series(s)
          result = series.isin(np.array([np.nan, 1]))
          expected = Series(np.ones(len(s), dtype=bool))
          tm.assert_series_equal(result, expected)

      def test_same_object_is_in(self):
          """Identity is enough for a match when ``__eq__`` is always False (GH 22160)."""
          # there could be special treatment for nans
          # the user however could define a custom class
          # with similar behavior, then we at least should
          # fall back to usual python's behavior: "a in [a] == True"
          class LikeNan:
              def __eq__(self, other) -> bool:
                  return False

              def __hash__(self):
                  return 0

          a, b = LikeNan(), LikeNan()

          msg = "isin with argument that is not not a Series"
          with tm.assert_produces_warning(FutureWarning, match=msg):
              # same object -> True
              tm.assert_numpy_array_equal(algos.isin([a], [a]), np.array([True]))
              # different objects -> False
              tm.assert_numpy_array_equal(algos.isin([a], [b]), np.array([False]))

      def test_different_nans(self):
          """Distinct NaN objects are treated as equivalent (GH 22160)."""
          # all nans are handled as equivalent

          comps = [float("nan")]
          values = [float("nan")]
          assert comps[0] is not values[0]  # different nan-objects

          # ndarray built from the list, against a list of python objects:
          result = algos.isin(np.array(comps), values)
          tm.assert_numpy_array_equal(np.array([True]), result)

          # as object-array:
          result = algos.isin(
              np.asarray(comps, dtype=object), np.asarray(values, dtype=object)
          )
          tm.assert_numpy_array_equal(np.array([True]), result)

          # as float64-array:
          result = algos.isin(
              np.asarray(comps, dtype=np.float64), np.asarray(values, dtype=np.float64)
          )
          tm.assert_numpy_array_equal(np.array([True]), result)

      def test_no_cast(self):
          """The integer 42 does not match the string ``"42"`` (GH 22160)."""
          # ensure 42 is not casted to a string
          comps = ["ss", 42]
          values = ["42"]
          expected = np.array([False, False])
          msg = "isin with argument that is not not a Series, Index"
          with tm.assert_produces_warning(FutureWarning, match=msg):
              result = algos.isin(comps, values)
          tm.assert_numpy_array_equal(expected, result)

      @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])])
      def test_empty(self, empty):
          """Empty ``values`` give an all-False result (gh-16991)."""
          vals = Index(["a", "b"])
          expected = np.array([False, False])

          result = algos.isin(vals, empty)
          tm.assert_numpy_array_equal(expected, result)

      def test_different_nan_objects(self):
          """Only the float NaN matches a float NaN value in object arrays (GH 22119)."""
          comps = np.array(["nan", np.nan * 1j, float("nan")], dtype=object)
          vals = np.array([float("nan")], dtype=object)
          expected = np.array([False, False, True])
          result = algos.isin(comps, vals)
          tm.assert_numpy_array_equal(expected, result)

      def test_different_nans_as_float64(self):
          """NaNs with different bit patterns match each other as float64 (GH 21866)."""
          # create different nans from bit-patterns,
          # these nans will land in different buckets in the hash-table
          # if no special care is taken
          NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
          NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
          assert NAN1 != NAN1
          assert NAN2 != NAN2

          # check that NAN1 and NAN2 are equivalent:
          arr = np.array([NAN1, NAN2], dtype=np.float64)
          lookup1 = np.array([NAN1], dtype=np.float64)
          result = algos.isin(arr, lookup1)
          expected = np.array([True, True])
          tm.assert_numpy_array_equal(result, expected)

          lookup2 = np.array([NAN2], dtype=np.float64)
          result = algos.isin(arr, lookup2)
          expected = np.array([True, True])
          tm.assert_numpy_array_equal(result, expected)

      def test_isin_int_df_string_search(self):
          """Comparing df with int`s (1,2) with a string at isin() ("1")
          -> should not match values because int 1 is not equal str 1"""
          df = DataFrame({"values": [1, 2]})
          result = df.isin(["1"])
          expected_false = DataFrame({"values": [False, False]})
          tm.assert_frame_equal(result, expected_false)

      def test_isin_nan_df_string_search(self):
          """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN")
          -> should not match values because np.nan is not equal str NaN"""
          df = DataFrame({"values": [np.nan, 2]})
          result = df.isin(np.array(["NaN"], dtype=object))
          expected_false = DataFrame({"values": [False, False]})
          tm.assert_frame_equal(result, expected_false)

      def test_isin_float_df_string_search(self):
          """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245")
          -> should not match values because float 1.4245 is not equal str 1.4245"""
          df = DataFrame({"values": [1.4245, 2.32441]})
          result = df.isin(np.array(["1.4245"], dtype=object))
          expected_false = DataFrame({"values": [False, False]})
          tm.assert_frame_equal(result, expected_false)

      def test_isin_unsigned_dtype(self):
          """Two nearby but different large uint64 values do not match (GH#46485)."""
          ser = Series([1378774140726870442], dtype=np.uint64)
          result = ser.isin([1378774140726870528])
          expected = Series(False)
          tm.assert_series_equal(result, expected)
  class TestValueCounts:
      """Tests for ``algos.value_counts`` (deprecated) and ``Series.value_counts``."""

      def test_value_counts(self):
          """Counts over the categorical produced by ``cut`` on four random values."""
          samples = np.random.default_rng(1234).standard_normal(4)
          factor = cut(samples, 4)

          msg = "pandas.value_counts is deprecated"
          with tm.assert_produces_warning(FutureWarning, match=msg):
              result = algos.value_counts(factor)

          breaks = [-1.606, -1.018, -0.431, 0.155, 0.741]
          ordered_cat = CategoricalDtype(ordered=True)
          index = IntervalIndex.from_breaks(breaks).astype(ordered_cat)
          expected = Series([1, 0, 2, 1], index=index, name="count")
          tm.assert_series_equal(result.sort_index(), expected.sort_index())

      def test_value_counts_bins(self):
          """``bins=`` groups a list of ints into interval buckets."""
          data = [1, 2, 3, 4]
          msg = "pandas.value_counts is deprecated"

          with tm.assert_produces_warning(FutureWarning, match=msg):
              result = algos.value_counts(data, bins=1)
          one_bin = IntervalIndex.from_tuples([(0.996, 4.0)])
          expected = Series([4], index=one_bin, name="count")
          tm.assert_series_equal(result, expected)

          with tm.assert_produces_warning(FutureWarning, match=msg):
              result = algos.value_counts(data, bins=2, sort=False)
          two_bins = IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)])
          expected = Series([2, 2], index=two_bins, name="count")
          tm.assert_series_equal(result, expected)

      def test_value_counts_dtypes(self):
          """Result length for mixed inputs; ``bins`` rejects non-numeric data."""
          msg2 = "pandas.value_counts is deprecated"

          with tm.assert_produces_warning(FutureWarning, match=msg2):
              result = algos.value_counts(np.array([1, 1.0]))
          assert len(result) == 1

          with tm.assert_produces_warning(FutureWarning, match=msg2):
              result = algos.value_counts(np.array([1, 1.0]), bins=1)
          assert len(result) == 1

          with tm.assert_produces_warning(FutureWarning, match=msg2):
              result = algos.value_counts(Series([1, 1.0, "1"]))  # object
          assert len(result) == 2

          msg = "bins argument only works with numeric data"
          with pytest.raises(TypeError, match=msg):
              with tm.assert_produces_warning(FutureWarning, match=msg2):
                  algos.value_counts(np.array(["1", 1], dtype=object), bins=1)

      def test_value_counts_nat(self):
          """NaT is dropped by default and counted with ``dropna=False``."""
          td = Series([np.timedelta64(10000), NaT], dtype="timedelta64[ns]")
          dt = to_datetime(["NaT", "2014-01-01"])

          msg = "pandas.value_counts is deprecated"

          for values in (td, dt):
              with tm.assert_produces_warning(FutureWarning, match=msg):
                  counts = algos.value_counts(values)
                  counts_with_na = algos.value_counts(values, dropna=False)
              assert len(counts) == 1
              assert len(counts_with_na) == 2

          expected_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}, name="count")
          with tm.assert_produces_warning(FutureWarning, match=msg):
              result_dt = algos.value_counts(dt)
          tm.assert_series_equal(result_dt, expected_dt)

          expected_td = Series([1], index=[np.timedelta64(10000)], name="count")
          with tm.assert_produces_warning(FutureWarning, match=msg):
              result_td = algos.value_counts(td)
          tm.assert_series_equal(result_td, expected_td)

      @pytest.mark.parametrize("dtype", [object, "M8[us]"])
      def test_value_counts_datetime_outofbounds(self, dtype):
          """Datetimes in years 3000-6000 are counted correctly (GH 13663)."""
          y3000 = datetime(3000, 1, 1)
          y5000 = datetime(5000, 1, 1)
          y6000 = datetime(6000, 1, 1)

          ser = Series([y3000, y5000, y5000, y6000, y3000, y3000], dtype=dtype)
          result = ser.value_counts()

          expected_index = Index([y3000, y5000, y6000], dtype=dtype)
          expected = Series([3, 2, 1], index=expected_index, name="count")
          tm.assert_series_equal(result, expected)

      def test_categorical(self):
          """Counts on a categorical Series are indexed by a CategoricalIndex."""
          ser = Series(Categorical(list("aaabbc")))
          expected = Series(
              [3, 2, 1], index=CategoricalIndex(["a", "b", "c"]), name="count"
          )
          tm.assert_series_equal(ser.value_counts(), expected, check_index_type=True)

          # same check after marking both sides as ordered
          ser = ser.cat.as_ordered()
          result = ser.value_counts()
          expected.index = expected.index.as_ordered()
          tm.assert_series_equal(result, expected, check_index_type=True)

      def test_categorical_nans(self):
          """Categorical counts with a NaN entry, with and without ``dropna``."""
          ser = Series(Categorical(list("aaaaabbbcc")))  # 4,3,2,1 (nan)
          ser.iloc[1] = np.nan

          expected = Series(
              [4, 3, 2],
              index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]),
              name="count",
          )
          tm.assert_series_equal(ser.value_counts(), expected, check_index_type=True)

          expected = Series(
              [4, 3, 2, 1],
              index=CategoricalIndex(["a", "b", "c", np.nan]),
              name="count",
          )
          tm.assert_series_equal(
              ser.value_counts(dropna=False), expected, check_index_type=True
          )

          # out of order
          shuffled_categories = ["b", "a", "c"]
          ser = Series(
              Categorical(
                  list("aaaaabbbcc"), ordered=True, categories=shuffled_categories
              )
          )
          ser.iloc[1] = np.nan

          expected = Series(
              [4, 3, 2],
              index=CategoricalIndex(
                  ["a", "b", "c"], categories=shuffled_categories, ordered=True
              ),
              name="count",
          )
          tm.assert_series_equal(ser.value_counts(), expected, check_index_type=True)

          expected = Series(
              [4, 3, 2, 1],
              index=CategoricalIndex(
                  ["a", "b", "c", np.nan],
                  categories=shuffled_categories,
                  ordered=True,
              ),
              name="count",
          )
          tm.assert_series_equal(
              ser.value_counts(dropna=False), expected, check_index_type=True
          )

      def test_categorical_zeroes(self):
          """An unused category still appears, with a count of 0."""
          # keep the `d` category with 0
          categories = list("abcd")
          ser = Series(Categorical(list("bbbaac"), categories=categories, ordered=True))
          expected_index = Categorical(
              ["b", "a", "c", "d"], categories=categories, ordered=True
          )
          expected = Series([3, 2, 1, 0], index=expected_index, name="count")
          tm.assert_series_equal(ser.value_counts(), expected, check_index_type=True)

      def test_value_counts_dropna(self):
          """``dropna`` only changes the result when nulls are present."""
          # https://github.com/pandas-dev/pandas/issues/9443#issuecomment-73719328

          # booleans, no nulls: dropna makes no difference
          expected = Series([2, 1], index=[True, False], name="count")
          for dropna in (True, False):
              result = Series([True, True, False]).value_counts(dropna=dropna)
              tm.assert_series_equal(result, expected)

          # booleans mixed with None
          result = Series([True] * 3 + [False] * 2 + [None] * 5).value_counts(
              dropna=True
          )
          expected = Series(
              [3, 2], index=Index([True, False], dtype=object), name="count"
          )
          tm.assert_series_equal(result, expected)

          result = Series([True] * 5 + [False] * 3 + [None] * 2).value_counts(
              dropna=False
          )
          expected = Series([5, 3, 2], index=[True, False, None], name="count")
          tm.assert_series_equal(result, expected)

          # floats, no nulls: dropna makes no difference
          expected = Series([2, 1], index=[5.0, 10.3], name="count")
          for dropna in (True, False):
              result = Series([10.3, 5.0, 5.0]).value_counts(dropna=dropna)
              tm.assert_series_equal(result, expected)

          # floats with a null
          result = Series([10.3, 5.0, 5.0, None]).value_counts(dropna=True)
          expected = Series([2, 1], index=[5.0, 10.3], name="count")
          tm.assert_series_equal(result, expected)

          result = Series([10.3, 10.3, 5.0, 5.0, 5.0, None]).value_counts(dropna=False)
          expected = Series([3, 2, 1], index=[5.0, 10.3, None], name="count")
          tm.assert_series_equal(result, expected)

      @pytest.mark.parametrize("dtype", (np.float64, object, "M8[ns]"))
      def test_value_counts_normalized(self, dtype):
          """``normalize=True`` returns proportions, with and without nulls (GH12558)."""
          raw = Series([1] * 2 + [2] * 3 + [np.nan] * 5)
          typed = raw.astype(dtype)

          result = typed.value_counts(normalize=True, dropna=False)
          expected = Series(
              [0.5, 0.3, 0.2],
              index=Series([np.nan, 2.0, 1.0], dtype=dtype),
              name="proportion",
          )
          tm.assert_series_equal(result, expected)

          result = typed.value_counts(normalize=True, dropna=True)
          expected = Series(
              [0.6, 0.4],
              index=Series([2.0, 1.0], dtype=dtype),
              name="proportion",
          )
          tm.assert_series_equal(result, expected)

      def test_value_counts_uint64(self):
          """``2**63`` is counted in uint64 and in object arrays."""
          msg = "pandas.value_counts is deprecated"
          big = 2**63

          values = np.array([big], dtype=np.uint64)
          expected = Series([1], index=[big], name="count")
          with tm.assert_produces_warning(FutureWarning, match=msg):
              result = algos.value_counts(values)
          tm.assert_series_equal(result, expected)

          values = np.array([-1, big], dtype=object)
          expected = Series([1, 1], index=[-1, big], name="count")
          with tm.assert_produces_warning(FutureWarning, match=msg):
              result = algos.value_counts(values)
          tm.assert_series_equal(result, expected)

      def test_value_counts_series(self):
          """``Series.value_counts(bins=3)`` on floats containing a NaN (GH#54857)."""
          values = np.array([3, 1, 2, 3, 4, np.nan])
          result = Series(values).value_counts(bins=3)

          bin_edges = [(0.996, 2.0), (2.0, 3.0), (3.0, 4.0)]
          expected_index = IntervalIndex.from_tuples(
              bin_edges, dtype="interval[float64, right]"
          )
          expected = Series([2, 2, 1], index=expected_index, name="count")
          tm.assert_series_equal(result, expected)
  1245. class TestDuplicated:
  1246. def test_duplicated_with_nas(self):
  1247. keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object)
  1248. result = algos.duplicated(keys)
  1249. expected = np.array([False, False, False, True, False, True])
  1250. tm.assert_numpy_array_equal(result, expected)
  1251. result = algos.duplicated(keys, keep="first")
  1252. expected = np.array([False, False, False, True, False, True])
  1253. tm.assert_numpy_array_equal(result, expected)
  1254. result = algos.duplicated(keys, keep="last")
  1255. expected = np.array([True, False, True, False, False, False])
  1256. tm.assert_numpy_array_equal(result, expected)
  1257. result = algos.duplicated(keys, keep=False)
  1258. expected = np.array([True, False, True, True, False, True])
  1259. tm.assert_numpy_array_equal(result, expected)
  1260. keys = np.empty(8, dtype=object)
  1261. for i, t in enumerate(
  1262. zip([0, 0, np.nan, np.nan] * 2, [0, np.nan, 0, np.nan] * 2)
  1263. ):
  1264. keys[i] = t
  1265. result = algos.duplicated(keys)
  1266. falses = [False] * 4
  1267. trues = [True] * 4
  1268. expected = np.array(falses + trues)
  1269. tm.assert_numpy_array_equal(result, expected)
  1270. result = algos.duplicated(keys, keep="last")
  1271. expected = np.array(trues + falses)
  1272. tm.assert_numpy_array_equal(result, expected)
  1273. result = algos.duplicated(keys, keep=False)
  1274. expected = np.array(trues + trues)
  1275. tm.assert_numpy_array_equal(result, expected)
  @pytest.mark.parametrize(
      "case",
      [
          np.array([1, 2, 1, 5, 3, 2, 4, 1, 5, 6]),
          np.array([1.1, 2.2, 1.1, np.nan, 3.3, 2.2, 4.4, 1.1, np.nan, 6.6]),
          np.array(
              [
                  1 + 1j,
                  2 + 2j,
                  1 + 1j,
                  5 + 5j,
                  3 + 3j,
                  2 + 2j,
                  4 + 4j,
                  1 + 1j,
                  5 + 5j,
                  6 + 6j,
              ]
          ),
          np.array(["a", "b", "a", "e", "c", "b", "d", "a", "e", "f"], dtype=object),
          np.array(
              [1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7], dtype=np.uint64
          ),
      ],
  )
  def test_numeric_object_likes(self, case):
      """``duplicated`` gives the same masks for ndarray, Index and Series inputs.

      Every parametrized ``case`` has the same duplication pattern, so one
      set of expected masks covers all of them.
      """
      exp_first = np.array(
          [False, False, True, False, False, True, False, True, True, False]
      )
      exp_last = np.array(
          [True, True, True, True, False, False, False, False, False, False]
      )
      exp_false = exp_first | exp_last

      # raw array through the algos entry point
      tm.assert_numpy_array_equal(algos.duplicated(case, keep="first"), exp_first)
      tm.assert_numpy_array_equal(algos.duplicated(case, keep="last"), exp_last)
      tm.assert_numpy_array_equal(algos.duplicated(case, keep=False), exp_false)

      # index
      for idx in [Index(case), Index(case, dtype="category")]:
          tm.assert_numpy_array_equal(idx.duplicated(keep="first"), exp_first)
          tm.assert_numpy_array_equal(idx.duplicated(keep="last"), exp_last)
          tm.assert_numpy_array_equal(idx.duplicated(keep=False), exp_false)

      # series
      for ser in [Series(case), Series(case, dtype="category")]:
          tm.assert_series_equal(ser.duplicated(keep="first"), Series(exp_first))
          tm.assert_series_equal(ser.duplicated(keep="last"), Series(exp_last))
          tm.assert_series_equal(ser.duplicated(keep=False), Series(exp_false))
  1331. def test_datetime_likes(self):
  1332. dt = [
  1333. "2011-01-01",
  1334. "2011-01-02",
  1335. "2011-01-01",
  1336. "NaT",
  1337. "2011-01-03",
  1338. "2011-01-02",
  1339. "2011-01-04",
  1340. "2011-01-01",
  1341. "NaT",
  1342. "2011-01-06",
  1343. ]
  1344. td = [
  1345. "1 days",
  1346. "2 days",
  1347. "1 days",
  1348. "NaT",
  1349. "3 days",
  1350. "2 days",
  1351. "4 days",
  1352. "1 days",
  1353. "NaT",
  1354. "6 days",
  1355. ]
  1356. cases = [
  1357. np.array([Timestamp(d) for d in dt]),
  1358. np.array([Timestamp(d, tz="US/Eastern") for d in dt]),
  1359. np.array([Period(d, freq="D") for d in dt]),
  1360. np.array([np.datetime64(d) for d in dt]),
  1361. np.array([Timedelta(d) for d in td]),
  1362. ]
  1363. exp_first = np.array(
  1364. [False, False, True, False, False, True, False, True, True, False]
  1365. )
  1366. exp_last = np.array(
  1367. [True, True, True, True, False, False, False, False, False, False]
  1368. )
  1369. exp_false = exp_first | exp_last
  1370. for case in cases:
  1371. res_first = algos.duplicated(case, keep="first")
  1372. tm.assert_numpy_array_equal(res_first, exp_first)
  1373. res_last = algos.duplicated(case, keep="last")
  1374. tm.assert_numpy_array_equal(res_last, exp_last)
  1375. res_false = algos.duplicated(case, keep=False)
  1376. tm.assert_numpy_array_equal(res_false, exp_false)
  1377. # index
  1378. for idx in [
  1379. Index(case),
  1380. Index(case, dtype="category"),
  1381. Index(case, dtype=object),
  1382. ]:
  1383. res_first = idx.duplicated(keep="first")
  1384. tm.assert_numpy_array_equal(res_first, exp_first)
  1385. res_last = idx.duplicated(keep="last")
  1386. tm.assert_numpy_array_equal(res_last, exp_last)
  1387. res_false = idx.duplicated(keep=False)
  1388. tm.assert_numpy_array_equal(res_false, exp_false)
  1389. # series
  1390. for s in [
  1391. Series(case),
  1392. Series(case, dtype="category"),
  1393. Series(case, dtype=object),
  1394. ]:
  1395. res_first = s.duplicated(keep="first")
  1396. tm.assert_series_equal(res_first, Series(exp_first))
  1397. res_last = s.duplicated(keep="last")
  1398. tm.assert_series_equal(res_last, Series(exp_last))
  1399. res_false = s.duplicated(keep=False)
  1400. tm.assert_series_equal(res_false, Series(exp_false))
  1401. @pytest.mark.parametrize("case", [Index([1, 2, 3]), pd.RangeIndex(0, 3)])
  1402. def test_unique_index(self, case):
  1403. assert case.is_unique is True
  1404. tm.assert_numpy_array_equal(case.duplicated(), np.array([False, False, False]))
  1405. @pytest.mark.parametrize(
  1406. "arr, uniques",
  1407. [
  1408. (
  1409. [(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)],
  1410. [(0, 0), (0, 1), (1, 0), (1, 1)],
  1411. ),
  1412. (
  1413. [("b", "c"), ("a", "b"), ("a", "b"), ("b", "c")],
  1414. [("b", "c"), ("a", "b")],
  1415. ),
  1416. ([("a", 1), ("b", 2), ("a", 3), ("a", 1)], [("a", 1), ("b", 2), ("a", 3)]),
  1417. ],
  1418. )
  1419. def test_unique_tuples(self, arr, uniques):
  1420. # https://github.com/pandas-dev/pandas/issues/16519
  1421. expected = np.empty(len(uniques), dtype=object)
  1422. expected[:] = uniques
  1423. msg = "unique with argument that is not not a Series"
  1424. with tm.assert_produces_warning(FutureWarning, match=msg):
  1425. result = pd.unique(arr)
  1426. tm.assert_numpy_array_equal(result, expected)
  1427. @pytest.mark.parametrize(
  1428. "array,expected",
  1429. [
  1430. (
  1431. [1 + 1j, 0, 1, 1j, 1 + 2j, 1 + 2j],
  1432. # Should return a complex dtype in the future
  1433. np.array([(1 + 1j), 0j, (1 + 0j), 1j, (1 + 2j)], dtype=object),
  1434. )
  1435. ],
  1436. )
  1437. def test_unique_complex_numbers(self, array, expected):
  1438. # GH 17927
  1439. msg = "unique with argument that is not not a Series"
  1440. with tm.assert_produces_warning(FutureWarning, match=msg):
  1441. result = pd.unique(array)
  1442. tm.assert_numpy_array_equal(result, expected)
  1443. class TestHashTable:
  1444. @pytest.mark.parametrize(
  1445. "htable, data",
  1446. [
  1447. (
  1448. ht.PyObjectHashTable,
  1449. np.array([f"foo_{i}" for i in range(1000)], dtype=object),
  1450. ),
  1451. (
  1452. ht.StringHashTable,
  1453. np.array([f"foo_{i}" for i in range(1000)], dtype=object),
  1454. ),
  1455. (ht.Float64HashTable, np.arange(1000, dtype=np.float64)),
  1456. (ht.Int64HashTable, np.arange(1000, dtype=np.int64)),
  1457. (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)),
  1458. ],
  1459. )
  1460. def test_hashtable_unique(self, htable, data, writable):
  1461. # output of maker has guaranteed unique elements
  1462. s = Series(data, dtype=data.dtype)
  1463. if htable == ht.Float64HashTable:
  1464. # add NaN for float column
  1465. s.loc[500] = np.nan
  1466. elif htable == ht.PyObjectHashTable:
  1467. # use different NaN types for object column
  1468. s.loc[500:502] = [np.nan, None, NaT]
  1469. # create duplicated selection
  1470. s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
  1471. s_duplicated.values.setflags(write=writable)
  1472. # drop_duplicates has own cython code (hash_table_func_helper.pxi)
  1473. # and is tested separately; keeps first occurrence like ht.unique()
  1474. expected_unique = s_duplicated.drop_duplicates(keep="first").values
  1475. result_unique = htable().unique(s_duplicated.values)
  1476. tm.assert_numpy_array_equal(result_unique, expected_unique)
  1477. # test return_inverse=True
  1478. # reconstruction can only succeed if the inverse is correct
  1479. result_unique, result_inverse = htable().unique(
  1480. s_duplicated.values, return_inverse=True
  1481. )
  1482. tm.assert_numpy_array_equal(result_unique, expected_unique)
  1483. reconstr = result_unique[result_inverse]
  1484. tm.assert_numpy_array_equal(reconstr, s_duplicated.values)
  1485. @pytest.mark.parametrize(
  1486. "htable, data",
  1487. [
  1488. (
  1489. ht.PyObjectHashTable,
  1490. np.array([f"foo_{i}" for i in range(1000)], dtype=object),
  1491. ),
  1492. (
  1493. ht.StringHashTable,
  1494. np.array([f"foo_{i}" for i in range(1000)], dtype=object),
  1495. ),
  1496. (ht.Float64HashTable, np.arange(1000, dtype=np.float64)),
  1497. (ht.Int64HashTable, np.arange(1000, dtype=np.int64)),
  1498. (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)),
  1499. ],
  1500. )
  1501. def test_hashtable_factorize(self, htable, writable, data):
  1502. # output of maker has guaranteed unique elements
  1503. s = Series(data, dtype=data.dtype)
  1504. if htable == ht.Float64HashTable:
  1505. # add NaN for float column
  1506. s.loc[500] = np.nan
  1507. elif htable == ht.PyObjectHashTable:
  1508. # use different NaN types for object column
  1509. s.loc[500:502] = [np.nan, None, NaT]
  1510. # create duplicated selection
  1511. s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
  1512. s_duplicated.values.setflags(write=writable)
  1513. na_mask = s_duplicated.isna().values
  1514. result_unique, result_inverse = htable().factorize(s_duplicated.values)
  1515. # drop_duplicates has own cython code (hash_table_func_helper.pxi)
  1516. # and is tested separately; keeps first occurrence like ht.factorize()
  1517. # since factorize removes all NaNs, we do the same here
  1518. expected_unique = s_duplicated.dropna().drop_duplicates().values
  1519. tm.assert_numpy_array_equal(result_unique, expected_unique)
  1520. # reconstruction can only succeed if the inverse is correct. Since
  1521. # factorize removes the NaNs, those have to be excluded here as well
  1522. result_reconstruct = result_unique[result_inverse[~na_mask]]
  1523. expected_reconstruct = s_duplicated.dropna().values
  1524. tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct)
  1525. class TestRank:
  1526. @pytest.mark.parametrize(
  1527. "arr",
  1528. [
  1529. [np.nan, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 3, np.nan],
  1530. [4.0, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 4.0, np.nan],
  1531. ],
  1532. )
  1533. def test_scipy_compat(self, arr):
  1534. sp_stats = pytest.importorskip("scipy.stats")
  1535. arr = np.array(arr)
  1536. mask = ~np.isfinite(arr)
  1537. arr = arr.copy()
  1538. result = libalgos.rank_1d(arr)
  1539. arr[mask] = np.inf
  1540. exp = sp_stats.rankdata(arr)
  1541. exp[mask] = np.nan
  1542. tm.assert_almost_equal(result, exp)
  1543. @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"])
  1544. def test_basic(self, writable, dtype):
  1545. exp = np.array([1, 2], dtype=np.float64)
  1546. data = np.array([1, 100], dtype=dtype)
  1547. data.setflags(write=writable)
  1548. ser = Series(data)
  1549. result = algos.rank(ser)
  1550. tm.assert_numpy_array_equal(result, exp)
  1551. @pytest.mark.parametrize("dtype", [np.float64, np.uint64])
  1552. def test_uint64_overflow(self, dtype):
  1553. exp = np.array([1, 2], dtype=np.float64)
  1554. s = Series([1, 2**63], dtype=dtype)
  1555. tm.assert_numpy_array_equal(algos.rank(s), exp)
  1556. def test_too_many_ndims(self):
  1557. arr = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
  1558. msg = "Array with ndim > 2 are not supported"
  1559. with pytest.raises(TypeError, match=msg):
  1560. algos.rank(arr)
  1561. @pytest.mark.single_cpu
  1562. def test_pct_max_many_rows(self):
  1563. # GH 18271
  1564. values = np.arange(2**24 + 1)
  1565. result = algos.rank(values, pct=True).max()
  1566. assert result == 1
  1567. values = np.arange(2**25 + 2).reshape(2**24 + 1, 2)
  1568. result = algos.rank(values, pct=True).max()
  1569. assert result == 1
  1570. class TestMode:
  1571. def test_no_mode(self):
  1572. exp = Series([], dtype=np.float64, index=Index([], dtype=int))
  1573. tm.assert_numpy_array_equal(algos.mode(np.array([])), exp.values)
  1574. @pytest.mark.parametrize("dt", np.typecodes["AllInteger"] + np.typecodes["Float"])
  1575. def test_mode_single(self, dt):
  1576. # GH 15714
  1577. exp_single = [1]
  1578. data_single = [1]
  1579. exp_multi = [1]
  1580. data_multi = [1, 1]
  1581. ser = Series(data_single, dtype=dt)
  1582. exp = Series(exp_single, dtype=dt)
  1583. tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
  1584. tm.assert_series_equal(ser.mode(), exp)
  1585. ser = Series(data_multi, dtype=dt)
  1586. exp = Series(exp_multi, dtype=dt)
  1587. tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
  1588. tm.assert_series_equal(ser.mode(), exp)
  1589. def test_mode_obj_int(self):
  1590. exp = Series([1], dtype=int)
  1591. tm.assert_numpy_array_equal(algos.mode(exp.values), exp.values)
  1592. exp = Series(["a", "b", "c"], dtype=object)
  1593. tm.assert_numpy_array_equal(algos.mode(exp.values), exp.values)
  1594. @pytest.mark.parametrize("dt", np.typecodes["AllInteger"] + np.typecodes["Float"])
  1595. def test_number_mode(self, dt):
  1596. exp_single = [1]
  1597. data_single = [1] * 5 + [2] * 3
  1598. exp_multi = [1, 3]
  1599. data_multi = [1] * 5 + [2] * 3 + [3] * 5
  1600. ser = Series(data_single, dtype=dt)
  1601. exp = Series(exp_single, dtype=dt)
  1602. tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
  1603. tm.assert_series_equal(ser.mode(), exp)
  1604. ser = Series(data_multi, dtype=dt)
  1605. exp = Series(exp_multi, dtype=dt)
  1606. tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
  1607. tm.assert_series_equal(ser.mode(), exp)
  1608. def test_strobj_mode(self):
  1609. exp = ["b"]
  1610. data = ["a"] * 2 + ["b"] * 3
  1611. ser = Series(data, dtype="c")
  1612. exp = Series(exp, dtype="c")
  1613. tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
  1614. tm.assert_series_equal(ser.mode(), exp)
  1615. @pytest.mark.parametrize("dt", [str, object])
  1616. def test_strobj_multi_char(self, dt, using_infer_string):
  1617. exp = ["bar"]
  1618. data = ["foo"] * 2 + ["bar"] * 3
  1619. ser = Series(data, dtype=dt)
  1620. exp = Series(exp, dtype=dt)
  1621. if using_infer_string and dt is str:
  1622. tm.assert_extension_array_equal(algos.mode(ser.values), exp.values)
  1623. else:
  1624. tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
  1625. tm.assert_series_equal(ser.mode(), exp)
  1626. def test_datelike_mode(self):
  1627. exp = Series(["1900-05-03", "2011-01-03", "2013-01-02"], dtype="M8[ns]")
  1628. ser = Series(["2011-01-03", "2013-01-02", "1900-05-03"], dtype="M8[ns]")
  1629. tm.assert_extension_array_equal(algos.mode(ser.values), exp._values)
  1630. tm.assert_series_equal(ser.mode(), exp)
  1631. exp = Series(["2011-01-03", "2013-01-02"], dtype="M8[ns]")
  1632. ser = Series(
  1633. ["2011-01-03", "2013-01-02", "1900-05-03", "2011-01-03", "2013-01-02"],
  1634. dtype="M8[ns]",
  1635. )
  1636. tm.assert_extension_array_equal(algos.mode(ser.values), exp._values)
  1637. tm.assert_series_equal(ser.mode(), exp)
  1638. def test_timedelta_mode(self):
  1639. exp = Series(["-1 days", "0 days", "1 days"], dtype="timedelta64[ns]")
  1640. ser = Series(["1 days", "-1 days", "0 days"], dtype="timedelta64[ns]")
  1641. tm.assert_extension_array_equal(algos.mode(ser.values), exp._values)
  1642. tm.assert_series_equal(ser.mode(), exp)
  1643. exp = Series(["2 min", "1 day"], dtype="timedelta64[ns]")
  1644. ser = Series(
  1645. ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],
  1646. dtype="timedelta64[ns]",
  1647. )
  1648. tm.assert_extension_array_equal(algos.mode(ser.values), exp._values)
  1649. tm.assert_series_equal(ser.mode(), exp)
  1650. def test_mixed_dtype(self):
  1651. exp = Series(["foo"], dtype=object)
  1652. ser = Series([1, "foo", "foo"])
  1653. tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
  1654. tm.assert_series_equal(ser.mode(), exp)
  1655. def test_uint64_overflow(self):
  1656. exp = Series([2**63], dtype=np.uint64)
  1657. ser = Series([1, 2**63, 2**63], dtype=np.uint64)
  1658. tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
  1659. tm.assert_series_equal(ser.mode(), exp)
  1660. exp = Series([1, 2**63], dtype=np.uint64)
  1661. ser = Series([1, 2**63], dtype=np.uint64)
  1662. tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
  1663. tm.assert_series_equal(ser.mode(), exp)
  1664. def test_categorical(self):
  1665. c = Categorical([1, 2])
  1666. exp = c
  1667. res = Series(c).mode()._values
  1668. tm.assert_categorical_equal(res, exp)
  1669. c = Categorical([1, "a", "a"])
  1670. exp = Categorical(["a"], categories=[1, "a"])
  1671. res = Series(c).mode()._values
  1672. tm.assert_categorical_equal(res, exp)
  1673. c = Categorical([1, 1, 2, 3, 3])
  1674. exp = Categorical([1, 3], categories=[1, 2, 3])
  1675. res = Series(c).mode()._values
  1676. tm.assert_categorical_equal(res, exp)
  1677. def test_index(self):
  1678. idx = Index([1, 2, 3])
  1679. exp = Series([1, 2, 3], dtype=np.int64)
  1680. tm.assert_numpy_array_equal(algos.mode(idx), exp.values)
  1681. idx = Index([1, "a", "a"])
  1682. exp = Series(["a"], dtype=object)
  1683. tm.assert_numpy_array_equal(algos.mode(idx), exp.values)
  1684. idx = Index([1, 1, 2, 3, 3])
  1685. exp = Series([1, 3], dtype=np.int64)
  1686. tm.assert_numpy_array_equal(algos.mode(idx), exp.values)
  1687. idx = Index(
  1688. ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],
  1689. dtype="timedelta64[ns]",
  1690. )
  1691. with pytest.raises(AttributeError, match="TimedeltaIndex"):
  1692. # algos.mode expects Arraylike, does *not* unwrap TimedeltaIndex
  1693. algos.mode(idx)
  1694. def test_ser_mode_with_name(self):
  1695. # GH 46737
  1696. ser = Series([1, 1, 3], name="foo")
  1697. result = ser.mode()
  1698. expected = Series([1], name="foo")
  1699. tm.assert_series_equal(result, expected)
  1700. class TestDiff:
  1701. @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
  1702. def test_diff_datetimelike_nat(self, dtype):
  1703. # NaT - NaT is NaT, not 0
  1704. arr = np.arange(12).astype(np.int64).view(dtype).reshape(3, 4)
  1705. arr[:, 2] = arr.dtype.type("NaT", "ns")
  1706. result = algos.diff(arr, 1, axis=0)
  1707. expected = np.ones(arr.shape, dtype="timedelta64[ns]") * 4
  1708. expected[:, 2] = np.timedelta64("NaT", "ns")
  1709. expected[0, :] = np.timedelta64("NaT", "ns")
  1710. tm.assert_numpy_array_equal(result, expected)
  1711. result = algos.diff(arr.T, 1, axis=1)
  1712. tm.assert_numpy_array_equal(result, expected.T)
  1713. def test_diff_ea_axis(self):
  1714. dta = date_range("2016-01-01", periods=3, tz="US/Pacific")._data
  1715. msg = "cannot diff DatetimeArray on axis=1"
  1716. with pytest.raises(ValueError, match=msg):
  1717. algos.diff(dta, 1, axis=1)
  1718. @pytest.mark.parametrize("dtype", ["int8", "int16"])
  1719. def test_diff_low_precision_int(self, dtype):
  1720. arr = np.array([0, 1, 1, 0, 0], dtype=dtype)
  1721. result = algos.diff(arr, 1)
  1722. expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32")
  1723. tm.assert_numpy_array_equal(result, expected)
  1724. @pytest.mark.parametrize("op", [np.array, pd.array])
  1725. def test_union_with_duplicates(op):
  1726. # GH#36289
  1727. lvals = op([3, 1, 3, 4])
  1728. rvals = op([2, 3, 1, 1])
  1729. expected = op([3, 3, 1, 1, 4, 2])
  1730. if isinstance(expected, np.ndarray):
  1731. result = algos.union_with_duplicates(lvals, rvals)
  1732. tm.assert_numpy_array_equal(result, expected)
  1733. else:
  1734. result = algos.union_with_duplicates(lvals, rvals)
  1735. tm.assert_extension_array_equal(result, expected)