test_algos.py 77 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083
  1. from datetime import datetime
  2. import struct
  3. import numpy as np
  4. import pytest
  5. from pandas._libs import (
  6. algos as libalgos,
  7. hashtable as ht,
  8. )
  9. from pandas.core.dtypes.common import (
  10. is_bool_dtype,
  11. is_complex_dtype,
  12. is_float_dtype,
  13. is_integer_dtype,
  14. is_object_dtype,
  15. )
  16. from pandas.core.dtypes.dtypes import (
  17. CategoricalDtype,
  18. DatetimeTZDtype,
  19. )
  20. import pandas as pd
  21. from pandas import (
  22. Categorical,
  23. CategoricalIndex,
  24. DataFrame,
  25. DatetimeIndex,
  26. Index,
  27. IntervalIndex,
  28. MultiIndex,
  29. NaT,
  30. Period,
  31. PeriodIndex,
  32. Series,
  33. Timedelta,
  34. Timestamp,
  35. cut,
  36. date_range,
  37. timedelta_range,
  38. to_datetime,
  39. to_timedelta,
  40. )
  41. import pandas._testing as tm
  42. import pandas.core.algorithms as algos
  43. from pandas.core.arrays import (
  44. DatetimeArray,
  45. TimedeltaArray,
  46. )
  47. import pandas.core.common as com
  48. class TestFactorize:
  49. def test_factorize_complex(self):
  50. # GH#17927
  51. array = np.array([1, 2, 2 + 1j], dtype=complex)
  52. labels, uniques = algos.factorize(array)
  53. expected_labels = np.array([0, 1, 2], dtype=np.intp)
  54. tm.assert_numpy_array_equal(labels, expected_labels)
  55. expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=complex)
  56. tm.assert_numpy_array_equal(uniques, expected_uniques)
  57. def test_factorize(self, index_or_series_obj, sort):
  58. obj = index_or_series_obj
  59. result_codes, result_uniques = obj.factorize(sort=sort)
  60. constructor = Index
  61. if isinstance(obj, MultiIndex):
  62. constructor = MultiIndex.from_tuples
  63. expected_arr = obj.unique()
  64. if expected_arr.dtype == np.float16:
  65. expected_arr = expected_arr.astype(np.float32)
  66. expected_uniques = constructor(expected_arr)
  67. if (
  68. isinstance(obj, Index)
  69. and expected_uniques.dtype == bool
  70. and obj.dtype == object
  71. ):
  72. expected_uniques = expected_uniques.astype(object)
  73. if sort:
  74. expected_uniques = expected_uniques.sort_values()
  75. # construct an integer ndarray so that
  76. # `expected_uniques.take(expected_codes)` is equal to `obj`
  77. expected_uniques_list = list(expected_uniques)
  78. expected_codes = [expected_uniques_list.index(val) for val in obj]
  79. expected_codes = np.asarray(expected_codes, dtype=np.intp)
  80. tm.assert_numpy_array_equal(result_codes, expected_codes)
  81. tm.assert_index_equal(result_uniques, expected_uniques, exact=True)
  82. def test_series_factorize_use_na_sentinel_false(self):
  83. # GH#35667
  84. values = np.array([1, 2, 1, np.nan])
  85. ser = Series(values)
  86. codes, uniques = ser.factorize(use_na_sentinel=False)
  87. expected_codes = np.array([0, 1, 0, 2], dtype=np.intp)
  88. expected_uniques = Index([1.0, 2.0, np.nan])
  89. tm.assert_numpy_array_equal(codes, expected_codes)
  90. tm.assert_index_equal(uniques, expected_uniques)
  91. def test_basic(self):
  92. items = np.array(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object)
  93. codes, uniques = algos.factorize(items)
  94. tm.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object))
  95. codes, uniques = algos.factorize(items, sort=True)
  96. exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp)
  97. tm.assert_numpy_array_equal(codes, exp)
  98. exp = np.array(["a", "b", "c"], dtype=object)
  99. tm.assert_numpy_array_equal(uniques, exp)
  100. arr = np.arange(5, dtype=np.intp)[::-1]
  101. codes, uniques = algos.factorize(arr)
  102. exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
  103. tm.assert_numpy_array_equal(codes, exp)
  104. exp = np.array([4, 3, 2, 1, 0], dtype=arr.dtype)
  105. tm.assert_numpy_array_equal(uniques, exp)
  106. codes, uniques = algos.factorize(arr, sort=True)
  107. exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
  108. tm.assert_numpy_array_equal(codes, exp)
  109. exp = np.array([0, 1, 2, 3, 4], dtype=arr.dtype)
  110. tm.assert_numpy_array_equal(uniques, exp)
  111. arr = np.arange(5.0)[::-1]
  112. codes, uniques = algos.factorize(arr)
  113. exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
  114. tm.assert_numpy_array_equal(codes, exp)
  115. exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=arr.dtype)
  116. tm.assert_numpy_array_equal(uniques, exp)
  117. codes, uniques = algos.factorize(arr, sort=True)
  118. exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
  119. tm.assert_numpy_array_equal(codes, exp)
  120. exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=arr.dtype)
  121. tm.assert_numpy_array_equal(uniques, exp)
  122. def test_mixed(self):
  123. # doc example reshaping.rst
  124. x = Series(["A", "A", np.nan, "B", 3.14, np.inf])
  125. codes, uniques = algos.factorize(x)
  126. exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
  127. tm.assert_numpy_array_equal(codes, exp)
  128. exp = Index(["A", "B", 3.14, np.inf])
  129. tm.assert_index_equal(uniques, exp)
  130. codes, uniques = algos.factorize(x, sort=True)
  131. exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
  132. tm.assert_numpy_array_equal(codes, exp)
  133. exp = Index([3.14, np.inf, "A", "B"])
  134. tm.assert_index_equal(uniques, exp)
  135. def test_factorize_datetime64(self):
  136. # M8
  137. v1 = Timestamp("20130101 09:00:00.00004")
  138. v2 = Timestamp("20130101")
  139. x = Series([v1, v1, v1, v2, v2, v1])
  140. codes, uniques = algos.factorize(x)
  141. exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
  142. tm.assert_numpy_array_equal(codes, exp)
  143. exp = DatetimeIndex([v1, v2])
  144. tm.assert_index_equal(uniques, exp)
  145. codes, uniques = algos.factorize(x, sort=True)
  146. exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp)
  147. tm.assert_numpy_array_equal(codes, exp)
  148. exp = DatetimeIndex([v2, v1])
  149. tm.assert_index_equal(uniques, exp)
  150. def test_factorize_period(self):
  151. # period
  152. v1 = Period("201302", freq="M")
  153. v2 = Period("201303", freq="M")
  154. x = Series([v1, v1, v1, v2, v2, v1])
  155. # periods are not 'sorted' as they are converted back into an index
  156. codes, uniques = algos.factorize(x)
  157. exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
  158. tm.assert_numpy_array_equal(codes, exp)
  159. tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))
  160. codes, uniques = algos.factorize(x, sort=True)
  161. exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
  162. tm.assert_numpy_array_equal(codes, exp)
  163. tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))
  164. def test_factorize_timedelta(self):
  165. # GH 5986
  166. v1 = to_timedelta("1 day 1 min")
  167. v2 = to_timedelta("1 day")
  168. x = Series([v1, v2, v1, v1, v2, v2, v1])
  169. codes, uniques = algos.factorize(x)
  170. exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp)
  171. tm.assert_numpy_array_equal(codes, exp)
  172. tm.assert_index_equal(uniques, to_timedelta([v1, v2]))
  173. codes, uniques = algos.factorize(x, sort=True)
  174. exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp)
  175. tm.assert_numpy_array_equal(codes, exp)
  176. tm.assert_index_equal(uniques, to_timedelta([v2, v1]))
  177. def test_factorize_nan(self):
  178. # nan should map to na_sentinel, not reverse_indexer[na_sentinel]
  179. # rizer.factorize should not raise an exception if na_sentinel indexes
  180. # outside of reverse_indexer
  181. key = np.array([1, 2, 1, np.nan], dtype="O")
  182. rizer = ht.ObjectFactorizer(len(key))
  183. for na_sentinel in (-1, 20):
  184. ids = rizer.factorize(key, na_sentinel=na_sentinel)
  185. expected = np.array([0, 1, 0, na_sentinel], dtype=np.intp)
  186. assert len(set(key)) == len(set(expected))
  187. tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel)
  188. tm.assert_numpy_array_equal(ids, expected)
  189. def test_factorizer_with_mask(self):
  190. # GH#49549
  191. data = np.array([1, 2, 3, 1, 1, 0], dtype="int64")
  192. mask = np.array([False, False, False, False, False, True])
  193. rizer = ht.Int64Factorizer(len(data))
  194. result = rizer.factorize(data, mask=mask)
  195. expected = np.array([0, 1, 2, 0, 0, -1], dtype=np.intp)
  196. tm.assert_numpy_array_equal(result, expected)
  197. expected_uniques = np.array([1, 2, 3], dtype="int64")
  198. tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques)
  199. def test_factorizer_object_with_nan(self):
  200. # GH#49549
  201. data = np.array([1, 2, 3, 1, np.nan])
  202. rizer = ht.ObjectFactorizer(len(data))
  203. result = rizer.factorize(data.astype(object))
  204. expected = np.array([0, 1, 2, 0, -1], dtype=np.intp)
  205. tm.assert_numpy_array_equal(result, expected)
  206. expected_uniques = np.array([1, 2, 3], dtype=object)
  207. tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques)
  208. @pytest.mark.parametrize(
  209. "data, expected_codes, expected_uniques",
  210. [
  211. (
  212. [(1, 1), (1, 2), (0, 0), (1, 2), "nonsense"],
  213. [0, 1, 2, 1, 3],
  214. [(1, 1), (1, 2), (0, 0), "nonsense"],
  215. ),
  216. (
  217. [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)],
  218. [0, 1, 2, 1, 3],
  219. [(1, 1), (1, 2), (0, 0), (1, 2, 3)],
  220. ),
  221. ([(1, 1), (1, 2), (0, 0), (1, 2)], [0, 1, 2, 1], [(1, 1), (1, 2), (0, 0)]),
  222. ],
  223. )
  224. def test_factorize_tuple_list(self, data, expected_codes, expected_uniques):
  225. # GH9454
  226. data = com.asarray_tuplesafe(data, dtype=object)
  227. codes, uniques = pd.factorize(data)
  228. tm.assert_numpy_array_equal(codes, np.array(expected_codes, dtype=np.intp))
  229. expected_uniques_array = com.asarray_tuplesafe(expected_uniques, dtype=object)
  230. tm.assert_numpy_array_equal(uniques, expected_uniques_array)
  231. def test_complex_sorting(self):
  232. # gh 12666 - check no segfault
  233. x17 = np.array([complex(i) for i in range(17)], dtype=object)
  234. msg = "'[<>]' not supported between instances of .*"
  235. with pytest.raises(TypeError, match=msg):
  236. algos.factorize(x17[::-1], sort=True)
  237. def test_numeric_dtype_factorize(self, any_real_numpy_dtype):
  238. # GH41132
  239. dtype = any_real_numpy_dtype
  240. data = np.array([1, 2, 2, 1], dtype=dtype)
  241. expected_codes = np.array([0, 1, 1, 0], dtype=np.intp)
  242. expected_uniques = np.array([1, 2], dtype=dtype)
  243. codes, uniques = algos.factorize(data)
  244. tm.assert_numpy_array_equal(codes, expected_codes)
  245. tm.assert_numpy_array_equal(uniques, expected_uniques)
  246. def test_float64_factorize(self, writable):
  247. data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
  248. data.setflags(write=writable)
  249. expected_codes = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp)
  250. expected_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64)
  251. codes, uniques = algos.factorize(data)
  252. tm.assert_numpy_array_equal(codes, expected_codes)
  253. tm.assert_numpy_array_equal(uniques, expected_uniques)
  254. def test_uint64_factorize(self, writable):
  255. data = np.array([2**64 - 1, 1, 2**64 - 1], dtype=np.uint64)
  256. data.setflags(write=writable)
  257. expected_codes = np.array([0, 1, 0], dtype=np.intp)
  258. expected_uniques = np.array([2**64 - 1, 1], dtype=np.uint64)
  259. codes, uniques = algos.factorize(data)
  260. tm.assert_numpy_array_equal(codes, expected_codes)
  261. tm.assert_numpy_array_equal(uniques, expected_uniques)
  262. def test_int64_factorize(self, writable):
  263. data = np.array([2**63 - 1, -(2**63), 2**63 - 1], dtype=np.int64)
  264. data.setflags(write=writable)
  265. expected_codes = np.array([0, 1, 0], dtype=np.intp)
  266. expected_uniques = np.array([2**63 - 1, -(2**63)], dtype=np.int64)
  267. codes, uniques = algos.factorize(data)
  268. tm.assert_numpy_array_equal(codes, expected_codes)
  269. tm.assert_numpy_array_equal(uniques, expected_uniques)
  270. def test_string_factorize(self, writable):
  271. data = np.array(["a", "c", "a", "b", "c"], dtype=object)
  272. data.setflags(write=writable)
  273. expected_codes = np.array([0, 1, 0, 2, 1], dtype=np.intp)
  274. expected_uniques = np.array(["a", "c", "b"], dtype=object)
  275. codes, uniques = algos.factorize(data)
  276. tm.assert_numpy_array_equal(codes, expected_codes)
  277. tm.assert_numpy_array_equal(uniques, expected_uniques)
  278. def test_object_factorize(self, writable):
  279. data = np.array(["a", "c", None, np.nan, "a", "b", NaT, "c"], dtype=object)
  280. data.setflags(write=writable)
  281. expected_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
  282. expected_uniques = np.array(["a", "c", "b"], dtype=object)
  283. codes, uniques = algos.factorize(data)
  284. tm.assert_numpy_array_equal(codes, expected_codes)
  285. tm.assert_numpy_array_equal(uniques, expected_uniques)
  286. def test_datetime64_factorize(self, writable):
  287. # GH35650 Verify whether read-only datetime64 array can be factorized
  288. data = np.array([np.datetime64("2020-01-01T00:00:00.000")], dtype="M8[ns]")
  289. data.setflags(write=writable)
  290. expected_codes = np.array([0], dtype=np.intp)
  291. expected_uniques = np.array(
  292. ["2020-01-01T00:00:00.000000000"], dtype="datetime64[ns]"
  293. )
  294. codes, uniques = pd.factorize(data)
  295. tm.assert_numpy_array_equal(codes, expected_codes)
  296. tm.assert_numpy_array_equal(uniques, expected_uniques)
  297. def test_factorize_rangeindex(self, sort):
  298. # increasing -> sort doesn't matter
  299. ri = pd.RangeIndex.from_range(range(10))
  300. expected = np.arange(10, dtype=np.intp), ri
  301. result = algos.factorize(ri, sort=sort)
  302. tm.assert_numpy_array_equal(result[0], expected[0])
  303. tm.assert_index_equal(result[1], expected[1], exact=True)
  304. result = ri.factorize(sort=sort)
  305. tm.assert_numpy_array_equal(result[0], expected[0])
  306. tm.assert_index_equal(result[1], expected[1], exact=True)
  307. def test_factorize_rangeindex_decreasing(self, sort):
  308. # decreasing -> sort matters
  309. ri = pd.RangeIndex.from_range(range(10))
  310. expected = np.arange(10, dtype=np.intp), ri
  311. ri2 = ri[::-1]
  312. expected = expected[0], ri2
  313. if sort:
  314. expected = expected[0][::-1], expected[1][::-1]
  315. result = algos.factorize(ri2, sort=sort)
  316. tm.assert_numpy_array_equal(result[0], expected[0])
  317. tm.assert_index_equal(result[1], expected[1], exact=True)
  318. result = ri2.factorize(sort=sort)
  319. tm.assert_numpy_array_equal(result[0], expected[0])
  320. tm.assert_index_equal(result[1], expected[1], exact=True)
  321. def test_deprecate_order(self):
  322. # gh 19727 - check warning is raised for deprecated keyword, order.
  323. # Test not valid once order keyword is removed.
  324. data = np.array([2**63, 1, 2**63], dtype=np.uint64)
  325. with pytest.raises(TypeError, match="got an unexpected keyword"):
  326. algos.factorize(data, order=True)
  327. with tm.assert_produces_warning(False):
  328. algos.factorize(data)
  329. @pytest.mark.parametrize(
  330. "data",
  331. [
  332. np.array([0, 1, 0], dtype="u8"),
  333. np.array([-(2**63), 1, -(2**63)], dtype="i8"),
  334. np.array(["__nan__", "foo", "__nan__"], dtype="object"),
  335. ],
  336. )
  337. def test_parametrized_factorize_na_value_default(self, data):
  338. # arrays that include the NA default for that type, but isn't used.
  339. codes, uniques = algos.factorize(data)
  340. expected_uniques = data[[0, 1]]
  341. expected_codes = np.array([0, 1, 0], dtype=np.intp)
  342. tm.assert_numpy_array_equal(codes, expected_codes)
  343. tm.assert_numpy_array_equal(uniques, expected_uniques)
  344. @pytest.mark.parametrize(
  345. "data, na_value",
  346. [
  347. (np.array([0, 1, 0, 2], dtype="u8"), 0),
  348. (np.array([1, 0, 1, 2], dtype="u8"), 1),
  349. (np.array([-(2**63), 1, -(2**63), 0], dtype="i8"), -(2**63)),
  350. (np.array([1, -(2**63), 1, 0], dtype="i8"), 1),
  351. (np.array(["a", "", "a", "b"], dtype=object), "a"),
  352. (np.array([(), ("a", 1), (), ("a", 2)], dtype=object), ()),
  353. (np.array([("a", 1), (), ("a", 1), ("a", 2)], dtype=object), ("a", 1)),
  354. ],
  355. )
  356. def test_parametrized_factorize_na_value(self, data, na_value):
  357. codes, uniques = algos.factorize_array(data, na_value=na_value)
  358. expected_uniques = data[[1, 3]]
  359. expected_codes = np.array([-1, 0, -1, 1], dtype=np.intp)
  360. tm.assert_numpy_array_equal(codes, expected_codes)
  361. tm.assert_numpy_array_equal(uniques, expected_uniques)
  362. @pytest.mark.parametrize(
  363. "data, uniques",
  364. [
  365. (
  366. np.array(["b", "a", None, "b"], dtype=object),
  367. np.array(["b", "a"], dtype=object),
  368. ),
  369. (
  370. pd.array([2, 1, pd.NA, 2], dtype="Int64"),
  371. pd.array([2, 1], dtype="Int64"),
  372. ),
  373. ],
  374. ids=["numpy_array", "extension_array"],
  375. )
  376. def test_factorize_use_na_sentinel(self, sort, data, uniques):
  377. codes, uniques = algos.factorize(data, sort=sort, use_na_sentinel=True)
  378. if sort:
  379. expected_codes = np.array([1, 0, -1, 1], dtype=np.intp)
  380. expected_uniques = algos.safe_sort(uniques)
  381. else:
  382. expected_codes = np.array([0, 1, -1, 0], dtype=np.intp)
  383. expected_uniques = uniques
  384. tm.assert_numpy_array_equal(codes, expected_codes)
  385. if isinstance(data, np.ndarray):
  386. tm.assert_numpy_array_equal(uniques, expected_uniques)
  387. else:
  388. tm.assert_extension_array_equal(uniques, expected_uniques)
  389. @pytest.mark.parametrize(
  390. "data, expected_codes, expected_uniques",
  391. [
  392. (
  393. ["a", None, "b", "a"],
  394. np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
  395. np.array(["a", np.nan, "b"], dtype=object),
  396. ),
  397. (
  398. ["a", np.nan, "b", "a"],
  399. np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
  400. np.array(["a", np.nan, "b"], dtype=object),
  401. ),
  402. ],
  403. )
  404. def test_object_factorize_use_na_sentinel_false(
  405. self, data, expected_codes, expected_uniques
  406. ):
  407. codes, uniques = algos.factorize(
  408. np.array(data, dtype=object), use_na_sentinel=False
  409. )
  410. tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
  411. tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)
  412. @pytest.mark.parametrize(
  413. "data, expected_codes, expected_uniques",
  414. [
  415. (
  416. np.array([1, None, 1, 2], dtype=object),
  417. np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
  418. np.array([1, np.nan, 2], dtype="O"),
  419. ),
  420. (
  421. np.array([1, np.nan, 1, 2], dtype=np.float64),
  422. np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
  423. np.array([1, np.nan, 2], dtype=np.float64),
  424. ),
  425. ],
  426. )
  427. def test_int_factorize_use_na_sentinel_false(
  428. self, data, expected_codes, expected_uniques
  429. ):
  430. codes, uniques = algos.factorize(data, use_na_sentinel=False)
  431. tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
  432. tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)
  433. @pytest.mark.parametrize(
  434. "data, expected_codes, expected_uniques",
  435. [
  436. (
  437. Index(Categorical(["a", "a", "b"])),
  438. np.array([0, 0, 1], dtype=np.intp),
  439. CategoricalIndex(["a", "b"], categories=["a", "b"], dtype="category"),
  440. ),
  441. (
  442. Series(Categorical(["a", "a", "b"])),
  443. np.array([0, 0, 1], dtype=np.intp),
  444. CategoricalIndex(["a", "b"], categories=["a", "b"], dtype="category"),
  445. ),
  446. (
  447. Series(DatetimeIndex(["2017", "2017"], tz="US/Eastern")),
  448. np.array([0, 0], dtype=np.intp),
  449. DatetimeIndex(["2017"], tz="US/Eastern"),
  450. ),
  451. ],
  452. )
  453. def test_factorize_mixed_values(self, data, expected_codes, expected_uniques):
  454. # GH 19721
  455. codes, uniques = algos.factorize(data)
  456. tm.assert_numpy_array_equal(codes, expected_codes)
  457. tm.assert_index_equal(uniques, expected_uniques)
  458. def test_factorize_interval_non_nano(self, unit):
  459. # GH#56099
  460. left = DatetimeIndex(["2016-01-01", np.nan, "2015-10-11"]).as_unit(unit)
  461. right = DatetimeIndex(["2016-01-02", np.nan, "2015-10-15"]).as_unit(unit)
  462. idx = IntervalIndex.from_arrays(left, right)
  463. codes, cats = idx.factorize()
  464. assert cats.dtype == f"interval[datetime64[{unit}], right]"
  465. ts = Timestamp(0).as_unit(unit)
  466. idx2 = IntervalIndex.from_arrays(left - ts, right - ts)
  467. codes2, cats2 = idx2.factorize()
  468. assert cats2.dtype == f"interval[timedelta64[{unit}], right]"
  469. idx3 = IntervalIndex.from_arrays(
  470. left.tz_localize("US/Pacific"), right.tz_localize("US/Pacific")
  471. )
  472. codes3, cats3 = idx3.factorize()
  473. assert cats3.dtype == f"interval[datetime64[{unit}, US/Pacific], right]"
  474. class TestUnique:
  475. def test_ints(self):
  476. arr = np.random.default_rng(2).integers(0, 100, size=50)
  477. result = algos.unique(arr)
  478. assert isinstance(result, np.ndarray)
  479. def test_objects(self):
  480. arr = np.random.default_rng(2).integers(0, 100, size=50).astype("O")
  481. result = algos.unique(arr)
  482. assert isinstance(result, np.ndarray)
  483. def test_object_refcount_bug(self):
  484. lst = np.array(["A", "B", "C", "D", "E"], dtype=object)
  485. for i in range(1000):
  486. len(algos.unique(lst))
  487. def test_index_returned(self, index):
  488. # GH#57043
  489. index = index.repeat(2)
  490. result = algos.unique(index)
  491. # dict.fromkeys preserves the order
  492. unique_values = list(dict.fromkeys(index.values))
  493. if isinstance(index, MultiIndex):
  494. expected = MultiIndex.from_tuples(unique_values, names=index.names)
  495. else:
  496. expected = Index(unique_values, dtype=index.dtype)
  497. if isinstance(index.dtype, DatetimeTZDtype):
  498. expected = expected.normalize()
  499. tm.assert_index_equal(result, expected, exact=True)
  500. def test_factorize_multiindex_empty(self):
  501. # GH#57517
  502. mi = MultiIndex.from_product(
  503. [Index([], name="a", dtype=object), Index([], name="i", dtype="f4")]
  504. )
  505. codes, uniques = mi.factorize()
  506. exp_codes = np.array([], dtype=np.intp)
  507. tm.assert_numpy_array_equal(codes, exp_codes)
  508. tm.assert_index_equal(uniques, mi[:0])
  509. def test_dtype_preservation(self, any_numpy_dtype):
  510. # GH 15442
  511. if any_numpy_dtype in (tm.BYTES_DTYPES + tm.STRING_DTYPES):
  512. data = [1, 2, 2]
  513. uniques = [1, 2]
  514. elif is_integer_dtype(any_numpy_dtype):
  515. data = [1, 2, 2]
  516. uniques = [1, 2]
  517. elif is_float_dtype(any_numpy_dtype):
  518. data = [1, 2, 2]
  519. uniques = [1.0, 2.0]
  520. elif is_complex_dtype(any_numpy_dtype):
  521. data = [complex(1, 0), complex(2, 0), complex(2, 0)]
  522. uniques = [complex(1, 0), complex(2, 0)]
  523. elif is_bool_dtype(any_numpy_dtype):
  524. data = [True, True, False]
  525. uniques = [True, False]
  526. elif is_object_dtype(any_numpy_dtype):
  527. data = ["A", "B", "B"]
  528. uniques = ["A", "B"]
  529. else:
  530. # datetime64[ns]/M8[ns]/timedelta64[ns]/m8[ns] tested elsewhere
  531. data = [1, 2, 2]
  532. uniques = [1, 2]
  533. result = Series(data, dtype=any_numpy_dtype).unique()
  534. expected = np.array(uniques, dtype=any_numpy_dtype)
  535. if any_numpy_dtype in tm.STRING_DTYPES:
  536. expected = expected.astype(object)
  537. if expected.dtype.kind in ["m", "M"]:
  538. # We get TimedeltaArray/DatetimeArray
  539. assert isinstance(result, (DatetimeArray, TimedeltaArray))
  540. result = np.array(result)
  541. tm.assert_numpy_array_equal(result, expected)
  542. def test_datetime64_dtype_array_returned(self):
  543. # GH 9431
  544. dt_arr = np.array(
  545. [
  546. "2015-01-03T00:00:00.000000000",
  547. "2015-01-01T00:00:00.000000000",
  548. ],
  549. dtype="M8[ns]",
  550. )
  551. dt_index = to_datetime(
  552. [
  553. "2015-01-03T00:00:00.000000000",
  554. "2015-01-01T00:00:00.000000000",
  555. "2015-01-01T00:00:00.000000000",
  556. ]
  557. )
  558. result = algos.unique(dt_index)
  559. expected = to_datetime(dt_arr)
  560. tm.assert_index_equal(result, expected, exact=True)
  561. s = Series(dt_index)
  562. result = algos.unique(s)
  563. tm.assert_numpy_array_equal(result, dt_arr)
  564. assert result.dtype == dt_arr.dtype
  565. arr = s.values
  566. result = algos.unique(arr)
  567. tm.assert_numpy_array_equal(result, dt_arr)
  568. assert result.dtype == dt_arr.dtype
  569. def test_datetime_non_ns(self):
  570. a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]")
  571. result = pd.unique(a)
  572. expected = np.array(["2000", "2001"], dtype="datetime64[s]")
  573. tm.assert_numpy_array_equal(result, expected)
  574. def test_timedelta_non_ns(self):
  575. a = np.array(["2000", "2000", "2001"], dtype="timedelta64[s]")
  576. result = pd.unique(a)
  577. expected = np.array([2000, 2001], dtype="timedelta64[s]")
  578. tm.assert_numpy_array_equal(result, expected)
  579. def test_timedelta64_dtype_array_returned(self):
  580. # GH 9431
  581. td_arr = np.array([31200, 45678, 10000], dtype="m8[ns]")
  582. td_index = to_timedelta([31200, 45678, 31200, 10000, 45678])
  583. result = algos.unique(td_index)
  584. expected = to_timedelta(td_arr)
  585. tm.assert_index_equal(result, expected)
  586. assert result.dtype == expected.dtype
  587. s = Series(td_index)
  588. result = algos.unique(s)
  589. tm.assert_numpy_array_equal(result, td_arr)
  590. assert result.dtype == td_arr.dtype
  591. arr = s.values
  592. result = algos.unique(arr)
  593. tm.assert_numpy_array_equal(result, td_arr)
  594. assert result.dtype == td_arr.dtype
  595. def test_uint64_overflow(self):
  596. s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
  597. exp = np.array([1, 2, 2**63], dtype=np.uint64)
  598. tm.assert_numpy_array_equal(algos.unique(s), exp)
  599. def test_nan_in_object_array(self):
  600. duplicated_items = ["a", np.nan, "c", "c"]
  601. result = pd.unique(np.array(duplicated_items, dtype=object))
  602. expected = np.array(["a", np.nan, "c"], dtype=object)
  603. tm.assert_numpy_array_equal(result, expected)
  604. def test_categorical(self):
  605. # we are expecting to return in the order
  606. # of appearance
  607. expected = Categorical(list("bac"))
  608. # we are expecting to return in the order
  609. # of the categories
  610. expected_o = Categorical(list("bac"), categories=list("abc"), ordered=True)
  611. # GH 15939
  612. c = Categorical(list("baabc"))
  613. result = c.unique()
  614. tm.assert_categorical_equal(result, expected)
  615. result = algos.unique(c)
  616. tm.assert_categorical_equal(result, expected)
  617. c = Categorical(list("baabc"), ordered=True)
  618. result = c.unique()
  619. tm.assert_categorical_equal(result, expected_o)
  620. result = algos.unique(c)
  621. tm.assert_categorical_equal(result, expected_o)
  622. # Series of categorical dtype
  623. s = Series(Categorical(list("baabc")), name="foo")
  624. result = s.unique()
  625. tm.assert_categorical_equal(result, expected)
  626. result = pd.unique(s)
  627. tm.assert_categorical_equal(result, expected)
  628. # CI -> return CI
  629. ci = CategoricalIndex(Categorical(list("baabc"), categories=list("abc")))
  630. expected = CategoricalIndex(expected)
  631. result = ci.unique()
  632. tm.assert_index_equal(result, expected)
  633. result = pd.unique(ci)
  634. tm.assert_index_equal(result, expected)
  635. def test_datetime64tz_aware(self, unit):
  636. # GH 15939
  637. dti = Index(
  638. [
  639. Timestamp("20160101", tz="US/Eastern"),
  640. Timestamp("20160101", tz="US/Eastern"),
  641. ]
  642. ).as_unit(unit)
  643. ser = Series(dti)
  644. result = ser.unique()
  645. expected = dti[:1]._data
  646. tm.assert_extension_array_equal(result, expected)
  647. result = dti.unique()
  648. expected = dti[:1]
  649. tm.assert_index_equal(result, expected)
  650. result = pd.unique(ser)
  651. expected = dti[:1]._data
  652. tm.assert_extension_array_equal(result, expected)
  653. result = pd.unique(dti)
  654. expected = dti[:1]
  655. tm.assert_index_equal(result, expected)
  656. def test_order_of_appearance(self):
  657. # 9346
  658. # light testing of guarantee of order of appearance
  659. # these also are the doc-examples
  660. result = pd.unique(Series([2, 1, 3, 3]))
  661. tm.assert_numpy_array_equal(result, np.array([2, 1, 3], dtype="int64"))
  662. result = pd.unique(Series([2] + [1] * 5))
  663. tm.assert_numpy_array_equal(result, np.array([2, 1], dtype="int64"))
  664. data = np.array(["a", "a", "b", "c"], dtype=object)
  665. result = pd.unique(data)
  666. expected = np.array(["a", "b", "c"], dtype=object)
  667. tm.assert_numpy_array_equal(result, expected)
  668. result = pd.unique(Series(Categorical(list("aabc"))))
  669. expected = Categorical(list("abc"))
  670. tm.assert_categorical_equal(result, expected)
  671. def test_order_of_appearance_dt64(self, unit):
  672. ser = Series([Timestamp("20160101"), Timestamp("20160101")]).dt.as_unit(unit)
  673. result = pd.unique(ser)
  674. expected = np.array(["2016-01-01T00:00:00.000000000"], dtype=f"M8[{unit}]")
  675. tm.assert_numpy_array_equal(result, expected)
  676. def test_order_of_appearance_dt64tz(self, unit):
  677. dti = DatetimeIndex(
  678. [
  679. Timestamp("20160101", tz="US/Eastern"),
  680. Timestamp("20160101", tz="US/Eastern"),
  681. ]
  682. ).as_unit(unit)
  683. result = pd.unique(dti)
  684. expected = DatetimeIndex(
  685. ["2016-01-01 00:00:00"], dtype=f"datetime64[{unit}, US/Eastern]", freq=None
  686. )
  687. tm.assert_index_equal(result, expected)
  688. @pytest.mark.parametrize(
  689. "arg ,expected",
  690. [
  691. (("1", "1", "2"), np.array(["1", "2"], dtype=object)),
  692. (("foo",), np.array(["foo"], dtype=object)),
  693. ],
  694. )
  695. def test_tuple_with_strings(self, arg, expected):
  696. # see GH 17108
  697. arg = com.asarray_tuplesafe(arg, dtype=object)
  698. result = pd.unique(arg)
  699. tm.assert_numpy_array_equal(result, expected)
  700. def test_obj_none_preservation(self):
  701. # GH 20866
  702. arr = np.array(["foo", None], dtype=object)
  703. result = pd.unique(arr)
  704. expected = np.array(["foo", None], dtype=object)
  705. tm.assert_numpy_array_equal(result, expected, strict_nan=True)
  706. def test_signed_zero(self):
  707. # GH 21866
  708. a = np.array([-0.0, 0.0])
  709. result = pd.unique(a)
  710. expected = np.array([-0.0]) # 0.0 and -0.0 are equivalent
  711. tm.assert_numpy_array_equal(result, expected)
  712. def test_different_nans(self):
  713. # GH 21866
  714. # create different nans from bit-patterns:
  715. NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
  716. NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
  717. assert NAN1 != NAN1
  718. assert NAN2 != NAN2
  719. a = np.array([NAN1, NAN2]) # NAN1 and NAN2 are equivalent
  720. result = pd.unique(a)
  721. expected = np.array([np.nan])
  722. tm.assert_numpy_array_equal(result, expected)
  723. @pytest.mark.parametrize("el_type", [np.float64, object])
  724. def test_first_nan_kept(self, el_type):
  725. # GH 22295
  726. # create different nans from bit-patterns:
  727. bits_for_nan1 = 0xFFF8000000000001
  728. bits_for_nan2 = 0x7FF8000000000001
  729. NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0]
  730. NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0]
  731. assert NAN1 != NAN1
  732. assert NAN2 != NAN2
  733. a = np.array([NAN1, NAN2], dtype=el_type)
  734. result = pd.unique(a)
  735. assert result.size == 1
  736. # use bit patterns to identify which nan was kept:
  737. result_nan_bits = struct.unpack("=Q", struct.pack("d", result[0]))[0]
  738. assert result_nan_bits == bits_for_nan1
  739. def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixture2):
  740. # GH 22295
  741. if unique_nulls_fixture is unique_nulls_fixture2:
  742. return # skip it, values not unique
  743. a = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=object)
  744. result = pd.unique(a)
  745. assert result.size == 2
  746. assert a[0] is unique_nulls_fixture
  747. assert a[1] is unique_nulls_fixture2
  748. def test_unique_masked(self, any_numeric_ea_dtype):
  749. # GH#48019
  750. ser = Series([1, pd.NA, 2] * 3, dtype=any_numeric_ea_dtype)
  751. result = pd.unique(ser)
  752. expected = pd.array([1, pd.NA, 2], dtype=any_numeric_ea_dtype)
  753. tm.assert_extension_array_equal(result, expected)
  754. def test_unique_NumpyExtensionArray(self):
  755. arr_complex = pd.array(
  756. [1 + 1j, 2, 3]
  757. ) # NumpyEADtype('complex128') => NumpyExtensionArray
  758. result = pd.unique(arr_complex)
  759. expected = pd.array([1 + 1j, 2 + 0j, 3 + 0j])
  760. tm.assert_extension_array_equal(result, expected)
  761. def test_nunique_ints(index_or_series_or_array):
  762. # GH#36327
  763. values = index_or_series_or_array(np.random.default_rng(2).integers(0, 20, 30))
  764. result = algos.nunique_ints(values)
  765. expected = len(algos.unique(values))
  766. assert result == expected
  767. class TestIsin:
  768. def test_invalid(self):
  769. msg = (
  770. r"only list-like objects are allowed to be passed to isin\(\), "
  771. r"you passed a `int`"
  772. )
  773. with pytest.raises(TypeError, match=msg):
  774. algos.isin(1, 1)
  775. with pytest.raises(TypeError, match=msg):
  776. algos.isin(1, [1])
  777. with pytest.raises(TypeError, match=msg):
  778. algos.isin([1], 1)
  779. def test_basic(self):
  780. result = algos.isin(np.array([1, 2]), [1])
  781. expected = np.array([True, False])
  782. tm.assert_numpy_array_equal(result, expected)
  783. result = algos.isin(Series([1, 2]), [1])
  784. expected = np.array([True, False])
  785. tm.assert_numpy_array_equal(result, expected)
  786. result = algos.isin(Series([1, 2]), Series([1]))
  787. expected = np.array([True, False])
  788. tm.assert_numpy_array_equal(result, expected)
  789. result = algos.isin(Series([1, 2]), {1})
  790. expected = np.array([True, False])
  791. tm.assert_numpy_array_equal(result, expected)
  792. arg = np.array(["a", "b"], dtype=object)
  793. result = algos.isin(arg, ["a"])
  794. expected = np.array([True, False])
  795. tm.assert_numpy_array_equal(result, expected)
  796. result = algos.isin(Series(arg), Series(["a"]))
  797. expected = np.array([True, False])
  798. tm.assert_numpy_array_equal(result, expected)
  799. result = algos.isin(Series(arg), {"a"})
  800. expected = np.array([True, False])
  801. tm.assert_numpy_array_equal(result, expected)
  802. result = algos.isin(arg, [1])
  803. expected = np.array([False, False])
  804. tm.assert_numpy_array_equal(result, expected)
  805. def test_i8(self):
  806. arr = date_range("20130101", periods=3).values
  807. result = algos.isin(arr, [arr[0]])
  808. expected = np.array([True, False, False])
  809. tm.assert_numpy_array_equal(result, expected)
  810. result = algos.isin(arr, arr[0:2])
  811. expected = np.array([True, True, False])
  812. tm.assert_numpy_array_equal(result, expected)
  813. result = algos.isin(arr, set(arr[0:2]))
  814. expected = np.array([True, True, False])
  815. tm.assert_numpy_array_equal(result, expected)
  816. arr = timedelta_range("1 day", periods=3).values
  817. result = algos.isin(arr, [arr[0]])
  818. expected = np.array([True, False, False])
  819. tm.assert_numpy_array_equal(result, expected)
  820. result = algos.isin(arr, arr[0:2])
  821. expected = np.array([True, True, False])
  822. tm.assert_numpy_array_equal(result, expected)
  823. result = algos.isin(arr, set(arr[0:2]))
  824. expected = np.array([True, True, False])
  825. tm.assert_numpy_array_equal(result, expected)
  826. @pytest.mark.parametrize("dtype1", ["m8[ns]", "M8[ns]", "M8[ns, UTC]", "period[D]"])
  827. @pytest.mark.parametrize("dtype", ["i8", "f8", "u8"])
  828. def test_isin_datetimelike_values_numeric_comps(self, dtype, dtype1):
  829. # Anything but object and we get all-False shortcut
  830. dta = date_range("2013-01-01", periods=3)._values
  831. arr = Series(dta.view("i8")).array.view(dtype1)
  832. comps = arr.view("i8").astype(dtype)
  833. result = algos.isin(comps, arr)
  834. expected = np.zeros(comps.shape, dtype=bool)
  835. tm.assert_numpy_array_equal(result, expected)
  836. def test_large(self):
  837. s = date_range("20000101", periods=2000000, freq="s").values
  838. result = algos.isin(s, s[0:2])
  839. expected = np.zeros(len(s), dtype=bool)
  840. expected[0] = True
  841. expected[1] = True
  842. tm.assert_numpy_array_equal(result, expected)
  843. @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]", "period[D]"])
  844. def test_isin_datetimelike_all_nat(self, dtype):
  845. # GH#56427
  846. dta = date_range("2013-01-01", periods=3)._values
  847. arr = Series(dta.view("i8")).array.view(dtype)
  848. arr[0] = NaT
  849. result = algos.isin(arr, [NaT])
  850. expected = np.array([True, False, False], dtype=bool)
  851. tm.assert_numpy_array_equal(result, expected)
  852. @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]"])
  853. def test_isin_datetimelike_strings_returns_false(self, dtype):
  854. # GH#53111
  855. dta = date_range("2013-01-01", periods=3)._values
  856. arr = Series(dta.view("i8")).array.view(dtype)
  857. vals = [str(x) for x in arr]
  858. res = algos.isin(arr, vals)
  859. assert not res.any()
  860. vals2 = np.array(vals, dtype=str)
  861. res2 = algos.isin(arr, vals2)
  862. assert not res2.any()
  863. def test_isin_dt64tz_with_nat(self):
  864. # the all-NaT values used to get inferred to tznaive, which was evaluated
  865. # as non-matching GH#56427
  866. dti = date_range("2016-01-01", periods=3, tz="UTC")
  867. ser = Series(dti)
  868. ser[0] = NaT
  869. res = algos.isin(ser._values, [NaT])
  870. exp = np.array([True, False, False], dtype=bool)
  871. tm.assert_numpy_array_equal(res, exp)
  872. def test_categorical_from_codes(self):
  873. # GH 16639
  874. vals = np.array([0, 1, 2, 0])
  875. cats = ["a", "b", "c"]
  876. Sd = Series(Categorical([1]).from_codes(vals, cats))
  877. St = Series(Categorical([1]).from_codes(np.array([0, 1]), cats))
  878. expected = np.array([True, True, False, True])
  879. result = algos.isin(Sd, St)
  880. tm.assert_numpy_array_equal(expected, result)
  881. def test_categorical_isin(self):
  882. vals = np.array([0, 1, 2, 0])
  883. cats = ["a", "b", "c"]
  884. cat = Categorical([1]).from_codes(vals, cats)
  885. other = Categorical([1]).from_codes(np.array([0, 1]), cats)
  886. expected = np.array([True, True, False, True])
  887. result = algos.isin(cat, other)
  888. tm.assert_numpy_array_equal(expected, result)
  889. def test_same_nan_is_in(self):
  890. # GH 22160
  891. # nan is special, because from " a is b" doesn't follow "a == b"
  892. # at least, isin() should follow python's "np.nan in [nan] == True"
  893. # casting to -> np.float64 -> another float-object somewhere on
  894. # the way could lead jeopardize this behavior
  895. comps = np.array([np.nan], dtype=object) # could be casted to float64
  896. values = [np.nan]
  897. expected = np.array([True])
  898. result = algos.isin(comps, values)
  899. tm.assert_numpy_array_equal(expected, result)
  900. def test_same_nan_is_in_large(self):
  901. # https://github.com/pandas-dev/pandas/issues/22205
  902. s = np.tile(1.0, 1_000_001)
  903. s[0] = np.nan
  904. result = algos.isin(s, np.array([np.nan, 1]))
  905. expected = np.ones(len(s), dtype=bool)
  906. tm.assert_numpy_array_equal(result, expected)
  907. def test_same_nan_is_in_large_series(self):
  908. # https://github.com/pandas-dev/pandas/issues/22205
  909. s = np.tile(1.0, 1_000_001)
  910. series = Series(s)
  911. s[0] = np.nan
  912. result = series.isin(np.array([np.nan, 1]))
  913. expected = Series(np.ones(len(s), dtype=bool))
  914. tm.assert_series_equal(result, expected)
  915. def test_same_object_is_in(self):
  916. # GH 22160
  917. # there could be special treatment for nans
  918. # the user however could define a custom class
  919. # with similar behavior, then we at least should
  920. # fall back to usual python's behavior: "a in [a] == True"
  921. class LikeNan:
  922. def __eq__(self, other) -> bool:
  923. return False
  924. def __hash__(self):
  925. return 0
  926. a, b = LikeNan(), LikeNan()
  927. arg = np.array([a], dtype=object)
  928. # same object -> True
  929. tm.assert_numpy_array_equal(algos.isin(arg, [a]), np.array([True]))
  930. # different objects -> False
  931. tm.assert_numpy_array_equal(algos.isin(arg, [b]), np.array([False]))
  932. def test_different_nans(self):
  933. # GH 22160
  934. # all nans are handled as equivalent
  935. comps = [float("nan")]
  936. values = [float("nan")]
  937. assert comps[0] is not values[0] # different nan-objects
  938. # as list of python-objects:
  939. result = algos.isin(np.array(comps), values)
  940. tm.assert_numpy_array_equal(np.array([True]), result)
  941. # as object-array:
  942. result = algos.isin(
  943. np.asarray(comps, dtype=object), np.asarray(values, dtype=object)
  944. )
  945. tm.assert_numpy_array_equal(np.array([True]), result)
  946. # as float64-array:
  947. result = algos.isin(
  948. np.asarray(comps, dtype=np.float64), np.asarray(values, dtype=np.float64)
  949. )
  950. tm.assert_numpy_array_equal(np.array([True]), result)
  951. def test_no_cast(self):
  952. # GH 22160
  953. # ensure 42 is not casted to a string
  954. comps = np.array(["ss", 42], dtype=object)
  955. values = ["42"]
  956. expected = np.array([False, False])
  957. result = algos.isin(comps, values)
  958. tm.assert_numpy_array_equal(expected, result)
  959. @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])])
  960. def test_empty(self, empty):
  961. # see gh-16991
  962. vals = Index(["a", "b"])
  963. expected = np.array([False, False])
  964. result = algos.isin(vals, empty)
  965. tm.assert_numpy_array_equal(expected, result)
  966. def test_different_nan_objects(self):
  967. # GH 22119
  968. comps = np.array(["nan", np.nan * 1j, float("nan")], dtype=object)
  969. vals = np.array([float("nan")], dtype=object)
  970. expected = np.array([False, False, True])
  971. result = algos.isin(comps, vals)
  972. tm.assert_numpy_array_equal(expected, result)
  973. def test_different_nans_as_float64(self):
  974. # GH 21866
  975. # create different nans from bit-patterns,
  976. # these nans will land in different buckets in the hash-table
  977. # if no special care is taken
  978. NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
  979. NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
  980. assert NAN1 != NAN1
  981. assert NAN2 != NAN2
  982. # check that NAN1 and NAN2 are equivalent:
  983. arr = np.array([NAN1, NAN2], dtype=np.float64)
  984. lookup1 = np.array([NAN1], dtype=np.float64)
  985. result = algos.isin(arr, lookup1)
  986. expected = np.array([True, True])
  987. tm.assert_numpy_array_equal(result, expected)
  988. lookup2 = np.array([NAN2], dtype=np.float64)
  989. result = algos.isin(arr, lookup2)
  990. expected = np.array([True, True])
  991. tm.assert_numpy_array_equal(result, expected)
  992. def test_isin_int_df_string_search(self):
  993. """Comparing df with int`s (1,2) with a string at isin() ("1")
  994. -> should not match values because int 1 is not equal str 1"""
  995. df = DataFrame({"values": [1, 2]})
  996. result = df.isin(["1"])
  997. expected_false = DataFrame({"values": [False, False]})
  998. tm.assert_frame_equal(result, expected_false)
  999. def test_isin_nan_df_string_search(self):
  1000. """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN")
  1001. -> should not match values because np.nan is not equal str NaN"""
  1002. df = DataFrame({"values": [np.nan, 2]})
  1003. result = df.isin(np.array(["NaN"], dtype=object))
  1004. expected_false = DataFrame({"values": [False, False]})
  1005. tm.assert_frame_equal(result, expected_false)
  1006. def test_isin_float_df_string_search(self):
  1007. """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245")
  1008. -> should not match values because float 1.4245 is not equal str 1.4245"""
  1009. df = DataFrame({"values": [1.4245, 2.32441]})
  1010. result = df.isin(np.array(["1.4245"], dtype=object))
  1011. expected_false = DataFrame({"values": [False, False]})
  1012. tm.assert_frame_equal(result, expected_false)
  1013. def test_isin_unsigned_dtype(self):
  1014. # GH#46485
  1015. ser = Series([1378774140726870442], dtype=np.uint64)
  1016. result = ser.isin([1378774140726870528])
  1017. expected = Series(False)
  1018. tm.assert_series_equal(result, expected)
  1019. class TestValueCounts:
  1020. def test_value_counts(self):
  1021. arr = np.random.default_rng(1234).standard_normal(4)
  1022. factor = cut(arr, 4)
  1023. # assert isinstance(factor, n)
  1024. result = algos.value_counts_internal(factor)
  1025. breaks = [-1.606, -1.018, -0.431, 0.155, 0.741]
  1026. index = IntervalIndex.from_breaks(breaks).astype(CategoricalDtype(ordered=True))
  1027. expected = Series([1, 0, 2, 1], index=index, name="count")
  1028. tm.assert_series_equal(result.sort_index(), expected.sort_index())
  1029. def test_value_counts_bins(self):
  1030. s = [1, 2, 3, 4]
  1031. result = algos.value_counts_internal(s, bins=1)
  1032. expected = Series(
  1033. [4], index=IntervalIndex.from_tuples([(0.996, 4.0)]), name="count"
  1034. )
  1035. tm.assert_series_equal(result, expected)
  1036. result = algos.value_counts_internal(s, bins=2, sort=False)
  1037. expected = Series(
  1038. [2, 2],
  1039. index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]),
  1040. name="count",
  1041. )
  1042. tm.assert_series_equal(result, expected)
  1043. def test_value_counts_dtypes(self):
  1044. result = algos.value_counts_internal(np.array([1, 1.0]))
  1045. assert len(result) == 1
  1046. result = algos.value_counts_internal(np.array([1, 1.0]), bins=1)
  1047. assert len(result) == 1
  1048. result = algos.value_counts_internal(Series([1, 1.0, "1"])) # object
  1049. assert len(result) == 2
  1050. msg = "bins argument only works with numeric data"
  1051. with pytest.raises(TypeError, match=msg):
  1052. algos.value_counts_internal(np.array(["1", 1], dtype=object), bins=1)
  1053. def test_value_counts_nat(self):
  1054. td = Series([np.timedelta64(10000), NaT], dtype="timedelta64[ns]")
  1055. dt = to_datetime(["NaT", "2014-01-01"])
  1056. for ser in [td, dt]:
  1057. vc = algos.value_counts_internal(ser)
  1058. vc_with_na = algos.value_counts_internal(ser, dropna=False)
  1059. assert len(vc) == 1
  1060. assert len(vc_with_na) == 2
  1061. exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}, name="count")
  1062. result_dt = algos.value_counts_internal(dt)
  1063. tm.assert_series_equal(result_dt, exp_dt)
  1064. exp_td = Series([1], index=[np.timedelta64(10000)], name="count")
  1065. result_td = algos.value_counts_internal(td)
  1066. tm.assert_series_equal(result_td, exp_td)
  1067. @pytest.mark.parametrize("dtype", [object, "M8[us]"])
  1068. def test_value_counts_datetime_outofbounds(self, dtype):
  1069. # GH 13663
  1070. ser = Series(
  1071. [
  1072. datetime(3000, 1, 1),
  1073. datetime(5000, 1, 1),
  1074. datetime(5000, 1, 1),
  1075. datetime(6000, 1, 1),
  1076. datetime(3000, 1, 1),
  1077. datetime(3000, 1, 1),
  1078. ],
  1079. dtype=dtype,
  1080. )
  1081. res = ser.value_counts()
  1082. exp_index = Index(
  1083. [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)],
  1084. dtype=dtype,
  1085. )
  1086. exp = Series([3, 2, 1], index=exp_index, name="count")
  1087. tm.assert_series_equal(res, exp)
  1088. def test_categorical(self):
  1089. s = Series(Categorical(list("aaabbc")))
  1090. result = s.value_counts()
  1091. expected = Series(
  1092. [3, 2, 1], index=CategoricalIndex(["a", "b", "c"]), name="count"
  1093. )
  1094. tm.assert_series_equal(result, expected, check_index_type=True)
  1095. # preserve order?
  1096. s = s.cat.as_ordered()
  1097. result = s.value_counts()
  1098. expected.index = expected.index.as_ordered()
  1099. tm.assert_series_equal(result, expected, check_index_type=True)
  1100. def test_categorical_nans(self):
  1101. s = Series(Categorical(list("aaaaabbbcc"))) # 4,3,2,1 (nan)
  1102. s.iloc[1] = np.nan
  1103. result = s.value_counts()
  1104. expected = Series(
  1105. [4, 3, 2],
  1106. index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]),
  1107. name="count",
  1108. )
  1109. tm.assert_series_equal(result, expected, check_index_type=True)
  1110. result = s.value_counts(dropna=False)
  1111. expected = Series(
  1112. [4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan]), name="count"
  1113. )
  1114. tm.assert_series_equal(result, expected, check_index_type=True)
  1115. # out of order
  1116. s = Series(
  1117. Categorical(list("aaaaabbbcc"), ordered=True, categories=["b", "a", "c"])
  1118. )
  1119. s.iloc[1] = np.nan
  1120. result = s.value_counts()
  1121. expected = Series(
  1122. [4, 3, 2],
  1123. index=CategoricalIndex(
  1124. ["a", "b", "c"],
  1125. categories=["b", "a", "c"],
  1126. ordered=True,
  1127. ),
  1128. name="count",
  1129. )
  1130. tm.assert_series_equal(result, expected, check_index_type=True)
  1131. result = s.value_counts(dropna=False)
  1132. expected = Series(
  1133. [4, 3, 2, 1],
  1134. index=CategoricalIndex(
  1135. ["a", "b", "c", np.nan], categories=["b", "a", "c"], ordered=True
  1136. ),
  1137. name="count",
  1138. )
  1139. tm.assert_series_equal(result, expected, check_index_type=True)
  1140. def test_categorical_zeroes(self):
  1141. # keep the `d` category with 0
  1142. s = Series(Categorical(list("bbbaac"), categories=list("abcd"), ordered=True))
  1143. result = s.value_counts()
  1144. expected = Series(
  1145. [3, 2, 1, 0],
  1146. index=Categorical(
  1147. ["b", "a", "c", "d"], categories=list("abcd"), ordered=True
  1148. ),
  1149. name="count",
  1150. )
  1151. tm.assert_series_equal(result, expected, check_index_type=True)
  1152. def test_value_counts_dropna(self):
  1153. # https://github.com/pandas-dev/pandas/issues/9443#issuecomment-73719328
  1154. tm.assert_series_equal(
  1155. Series([True, True, False]).value_counts(dropna=True),
  1156. Series([2, 1], index=[True, False], name="count"),
  1157. )
  1158. tm.assert_series_equal(
  1159. Series([True, True, False]).value_counts(dropna=False),
  1160. Series([2, 1], index=[True, False], name="count"),
  1161. )
  1162. tm.assert_series_equal(
  1163. Series([True] * 3 + [False] * 2 + [None] * 5).value_counts(dropna=True),
  1164. Series([3, 2], index=Index([True, False], dtype=object), name="count"),
  1165. )
  1166. tm.assert_series_equal(
  1167. Series([True] * 5 + [False] * 3 + [None] * 2).value_counts(dropna=False),
  1168. Series([5, 3, 2], index=[True, False, None], name="count"),
  1169. )
  1170. tm.assert_series_equal(
  1171. Series([10.3, 5.0, 5.0]).value_counts(dropna=True),
  1172. Series([2, 1], index=[5.0, 10.3], name="count"),
  1173. )
  1174. tm.assert_series_equal(
  1175. Series([10.3, 5.0, 5.0]).value_counts(dropna=False),
  1176. Series([2, 1], index=[5.0, 10.3], name="count"),
  1177. )
  1178. tm.assert_series_equal(
  1179. Series([10.3, 5.0, 5.0, None]).value_counts(dropna=True),
  1180. Series([2, 1], index=[5.0, 10.3], name="count"),
  1181. )
  1182. result = Series([10.3, 10.3, 5.0, 5.0, 5.0, None]).value_counts(dropna=False)
  1183. expected = Series([3, 2, 1], index=[5.0, 10.3, None], name="count")
  1184. tm.assert_series_equal(result, expected)
  1185. @pytest.mark.parametrize("dtype", (np.float64, object, "M8[ns]"))
  1186. def test_value_counts_normalized(self, dtype):
  1187. # GH12558
  1188. s = Series([1] * 2 + [2] * 3 + [np.nan] * 5)
  1189. s_typed = s.astype(dtype)
  1190. result = s_typed.value_counts(normalize=True, dropna=False)
  1191. expected = Series(
  1192. [0.5, 0.3, 0.2],
  1193. index=Series([np.nan, 2.0, 1.0], dtype=dtype),
  1194. name="proportion",
  1195. )
  1196. tm.assert_series_equal(result, expected)
  1197. result = s_typed.value_counts(normalize=True, dropna=True)
  1198. expected = Series(
  1199. [0.6, 0.4], index=Series([2.0, 1.0], dtype=dtype), name="proportion"
  1200. )
  1201. tm.assert_series_equal(result, expected)
  1202. def test_value_counts_uint64(self):
  1203. arr = np.array([2**63], dtype=np.uint64)
  1204. expected = Series([1], index=[2**63], name="count")
  1205. result = algos.value_counts_internal(arr)
  1206. tm.assert_series_equal(result, expected)
  1207. arr = np.array([-1, 2**63], dtype=object)
  1208. expected = Series([1, 1], index=[-1, 2**63], name="count")
  1209. result = algos.value_counts_internal(arr)
  1210. tm.assert_series_equal(result, expected)
  1211. def test_value_counts_series(self):
  1212. # GH#54857
  1213. values = np.array([3, 1, 2, 3, 4, np.nan])
  1214. result = Series(values).value_counts(bins=3)
  1215. expected = Series(
  1216. [2, 2, 1],
  1217. index=IntervalIndex.from_tuples(
  1218. [(0.996, 2.0), (2.0, 3.0), (3.0, 4.0)], dtype="interval[float64, right]"
  1219. ),
  1220. name="count",
  1221. )
  1222. tm.assert_series_equal(result, expected)
  1223. def test_value_counts_stability(self):
  1224. # GH 63155
  1225. arr = np.random.default_rng(2).integers(0, 32, 64)
  1226. result = algos.value_counts_internal(arr, sort=True)
  1227. value_counts = Series(arr).value_counts(sort=False)
  1228. expected = value_counts.sort_values(ascending=False, kind="stable")
  1229. tm.assert_series_equal(result, expected)
  1230. unstable_sorted = value_counts.sort_values(ascending=False, kind="quicksort")
  1231. with pytest.raises(AssertionError):
  1232. tm.assert_series_equal(result, unstable_sorted)
  1233. class TestDuplicated:
  1234. def test_duplicated_with_nas(self):
  1235. keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object)
  1236. result = algos.duplicated(keys)
  1237. expected = np.array([False, False, False, True, False, True])
  1238. tm.assert_numpy_array_equal(result, expected)
  1239. result = algos.duplicated(keys, keep="first")
  1240. expected = np.array([False, False, False, True, False, True])
  1241. tm.assert_numpy_array_equal(result, expected)
  1242. result = algos.duplicated(keys, keep="last")
  1243. expected = np.array([True, False, True, False, False, False])
  1244. tm.assert_numpy_array_equal(result, expected)
  1245. result = algos.duplicated(keys, keep=False)
  1246. expected = np.array([True, False, True, True, False, True])
  1247. tm.assert_numpy_array_equal(result, expected)
  1248. keys = np.empty(8, dtype=object)
  1249. for i, t in enumerate(
  1250. zip([0, 0, np.nan, np.nan] * 2, [0, np.nan, 0, np.nan] * 2, strict=True)
  1251. ):
  1252. keys[i] = t
  1253. result = algos.duplicated(keys)
  1254. falses = [False] * 4
  1255. trues = [True] * 4
  1256. expected = np.array(falses + trues)
  1257. tm.assert_numpy_array_equal(result, expected)
  1258. result = algos.duplicated(keys, keep="last")
  1259. expected = np.array(trues + falses)
  1260. tm.assert_numpy_array_equal(result, expected)
  1261. result = algos.duplicated(keys, keep=False)
  1262. expected = np.array(trues + trues)
  1263. tm.assert_numpy_array_equal(result, expected)
  @pytest.mark.parametrize(
      "case",
      [
          np.array([1, 2, 1, 5, 3, 2, 4, 1, 5, 6]),
          np.array([1.1, 2.2, 1.1, np.nan, 3.3, 2.2, 4.4, 1.1, np.nan, 6.6]),
          np.array(
              [
                  1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j,
                  2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j,
              ]
          ),
          np.array(["a", "b", "a", "e", "c", "b", "d", "a", "e", "f"], dtype=object),
          np.array([1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7], dtype=np.uint64),
      ],
  )
  def test_numeric_object_likes(self, case):
      # Every parametrized array repeats values at the same positions, so one
      # set of expected masks covers the int, float, complex, object and
      # uint64 inputs alike.
      dup_first = np.array(
          [False, False, True, False, False, True, False, True, True, False]
      )
      dup_last = np.array(
          [True, True, True, True, False, False, False, False, False, False]
      )
      # the keep=False expectation is the union of the two masks above
      expected_by_keep = {
          "first": dup_first,
          "last": dup_last,
          False: dup_first | dup_last,
      }

      # plain array through the algos entry point
      for keep, mask in expected_by_keep.items():
          tm.assert_numpy_array_equal(algos.duplicated(case, keep=keep), mask)

      # index: inferred dtype and categorical
      indexes = [Index(case), Index(case, dtype="category")]
      for index in indexes:
          for keep, mask in expected_by_keep.items():
              tm.assert_numpy_array_equal(index.duplicated(keep=keep), mask)

      # series: inferred dtype and categorical
      all_series = [Series(case), Series(case, dtype="category")]
      for ser in all_series:
          for keep, mask in expected_by_keep.items():
              tm.assert_series_equal(ser.duplicated(keep=keep), Series(mask))
  1317. def test_datetime_likes(self):
  1318. dt = [
  1319. "2011-01-01",
  1320. "2011-01-02",
  1321. "2011-01-01",
  1322. "NaT",
  1323. "2011-01-03",
  1324. "2011-01-02",
  1325. "2011-01-04",
  1326. "2011-01-01",
  1327. "NaT",
  1328. "2011-01-06",
  1329. ]
  1330. td = [
  1331. "1 days",
  1332. "2 days",
  1333. "1 days",
  1334. "NaT",
  1335. "3 days",
  1336. "2 days",
  1337. "4 days",
  1338. "1 days",
  1339. "NaT",
  1340. "6 days",
  1341. ]
  1342. cases = [
  1343. np.array([Timestamp(d) for d in dt]),
  1344. np.array([Timestamp(d, tz="US/Eastern") for d in dt]),
  1345. np.array([Period(d, freq="D") for d in dt]),
  1346. np.array([np.datetime64(d) for d in dt]),
  1347. np.array([Timedelta(d) for d in td]),
  1348. ]
  1349. exp_first = np.array(
  1350. [False, False, True, False, False, True, False, True, True, False]
  1351. )
  1352. exp_last = np.array(
  1353. [True, True, True, True, False, False, False, False, False, False]
  1354. )
  1355. exp_false = exp_first | exp_last
  1356. for case in cases:
  1357. res_first = algos.duplicated(case, keep="first")
  1358. tm.assert_numpy_array_equal(res_first, exp_first)
  1359. res_last = algos.duplicated(case, keep="last")
  1360. tm.assert_numpy_array_equal(res_last, exp_last)
  1361. res_false = algos.duplicated(case, keep=False)
  1362. tm.assert_numpy_array_equal(res_false, exp_false)
  1363. # index
  1364. for idx in [
  1365. Index(case),
  1366. Index(case, dtype="category"),
  1367. Index(case, dtype=object),
  1368. ]:
  1369. res_first = idx.duplicated(keep="first")
  1370. tm.assert_numpy_array_equal(res_first, exp_first)
  1371. res_last = idx.duplicated(keep="last")
  1372. tm.assert_numpy_array_equal(res_last, exp_last)
  1373. res_false = idx.duplicated(keep=False)
  1374. tm.assert_numpy_array_equal(res_false, exp_false)
  1375. # series
  1376. for s in [
  1377. Series(case),
  1378. Series(case, dtype="category"),
  1379. Series(case, dtype=object),
  1380. ]:
  1381. res_first = s.duplicated(keep="first")
  1382. tm.assert_series_equal(res_first, Series(exp_first))
  1383. res_last = s.duplicated(keep="last")
  1384. tm.assert_series_equal(res_last, Series(exp_last))
  1385. res_false = s.duplicated(keep=False)
  1386. tm.assert_series_equal(res_false, Series(exp_false))
  @pytest.mark.parametrize(
      "case",
      [
          Index([1, 2, 3]),
          pd.RangeIndex(0, 3),
      ],
  )
  def test_unique_index(self, case):
      # A three-element index without repeats reports itself as unique and
      # flags no position as duplicated.
      assert case.is_unique is True
      flagged = case.duplicated()
      tm.assert_numpy_array_equal(flagged, np.array([False] * 3))
  @pytest.mark.parametrize(
      "arr, uniques",
      [
          (
              [(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)],
              [(0, 0), (0, 1), (1, 0), (1, 1)],
          ),
          (
              [("b", "c"), ("a", "b"), ("a", "b"), ("b", "c")],
              [("b", "c"), ("a", "b")],
          ),
          (
              [("a", 1), ("b", 2), ("a", 3), ("a", 1)],
              [("a", 1), ("b", 2), ("a", 3)],
          ),
      ],
  )
  def test_unique_tuples(self, arr, uniques):
      # https://github.com/pandas-dev/pandas/issues/16519
      # Tuples are treated as single elements: the expected result is a 1-D
      # object array holding one tuple per slot.
      expected = np.empty(len(uniques), dtype=object)
      expected[:] = uniques

      # GH#52986: a bare list is rejected outright
      list_rejected = (
          r"unique requires a Series, Index, ExtensionArray, np.ndarray "
          r"or NumpyExtensionArray got list"
      )
      with pytest.raises(TypeError, match=list_rejected):
          pd.unique(arr)

      # the same data wrapped as an object array of tuples is accepted
      as_object_array = com.asarray_tuplesafe(arr, dtype=object)
      tm.assert_numpy_array_equal(pd.unique(as_object_array), expected)
  @pytest.mark.parametrize(
      "array,expected",
      [
          (
              [1 + 1j, 0, 1, 1j, 1 + 2j, 1 + 2j],
              np.array([(1 + 1j), 0j, (1 + 0j), 1j, (1 + 2j)], dtype=complex),
          )
      ],
  )
  def test_unique_complex_numbers(self, array, expected):
      # GH 17927
      # GH#52986: a bare list is rejected outright
      list_rejected = (
          r"unique requires a Series, Index, ExtensionArray, np.ndarray "
          r"or NumpyExtensionArray got list"
      )
      with pytest.raises(TypeError, match=list_rejected):
          pd.unique(array)

      # once converted to an ndarray, the repeated 1 + 2j collapses to one entry
      as_ndarray = np.array(array)
      tm.assert_numpy_array_equal(pd.unique(as_ndarray), expected)
  class TestHashTable:
      # Checks ``unique`` and ``factorize`` of the hashtable classes against
      # the Series-level ``drop_duplicates`` / ``dropna`` results.

      @staticmethod
      def _resampled_series(htable, data, writable):
          # Shared setup for both tests. ``data`` holds 1000 distinct values.
          base = Series(data, dtype=data.dtype)
          if htable == ht.Float64HashTable:
              # add NaN for float column
              base.loc[500] = np.nan
          elif htable == ht.PyObjectHashTable:
              # use different NaN types for object column
              base.loc[500:502] = [np.nan, None, NaT]

          # sampling three times the length with replacement forces repeats
          resampled = base.sample(frac=3, replace=True).reset_index(drop=True)
          resampled.values.setflags(write=writable)
          return resampled

      @pytest.mark.parametrize(
          "htable, data",
          [
              (
                  ht.PyObjectHashTable,
                  np.array([f"foo_{i}" for i in range(1000)], dtype=object),
              ),
              (
                  ht.StringHashTable,
                  np.array([f"foo_{i}" for i in range(1000)], dtype=object),
              ),
              (ht.Float64HashTable, np.arange(1000, dtype=np.float64)),
              (ht.Int64HashTable, np.arange(1000, dtype=np.int64)),
              (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)),
          ],
      )
      def test_hashtable_unique(self, htable, data, writable):
          resampled = self._resampled_series(htable, data, writable)

          # drop_duplicates has own cython code (hash_table_func_helper.pxi)
          # and is tested separately; keeps first occurrence like ht.unique()
          want_uniques = resampled.drop_duplicates(keep="first").values
          got_uniques = htable().unique(resampled.values)
          tm.assert_numpy_array_equal(got_uniques, want_uniques)

          # with return_inverse=True the uniques must be unchanged ...
          got_uniques, got_inverse = htable().unique(
              resampled.values, return_inverse=True
          )
          tm.assert_numpy_array_equal(got_uniques, want_uniques)
          # ... and indexing them by the inverse must rebuild the input,
          # which can only succeed if the inverse is correct
          rebuilt = got_uniques[got_inverse]
          tm.assert_numpy_array_equal(rebuilt, resampled.values)

      @pytest.mark.parametrize(
          "htable, data",
          [
              (
                  ht.PyObjectHashTable,
                  np.array([f"foo_{i}" for i in range(1000)], dtype=object),
              ),
              (
                  ht.StringHashTable,
                  np.array([f"foo_{i}" for i in range(1000)], dtype=object),
              ),
              (ht.Float64HashTable, np.arange(1000, dtype=np.float64)),
              (ht.Int64HashTable, np.arange(1000, dtype=np.int64)),
              (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)),
          ],
      )
      def test_hashtable_factorize(self, htable, writable, data):
          resampled = self._resampled_series(htable, data, writable)
          missing = resampled.isna().values

          got_uniques, got_inverse = htable().factorize(resampled.values)

          # drop_duplicates has own cython code (hash_table_func_helper.pxi)
          # and is tested separately; keeps first occurrence like ht.factorize()
          # since factorize removes all NaNs, we do the same here
          want_uniques = resampled.dropna().drop_duplicates().values
          tm.assert_numpy_array_equal(got_uniques, want_uniques)

          # reconstruction can only succeed if the inverse is correct. Since
          # factorize removes the NaNs, those have to be excluded here as well
          rebuilt = got_uniques[got_inverse[~missing]]
          want_rebuilt = resampled.dropna().values
          tm.assert_numpy_array_equal(rebuilt, want_rebuilt)
  class TestRank:
      # Tests for ``algos.rank`` and the low-level ``libalgos.rank_1d``.

      @pytest.mark.parametrize(
          "arr",
          [
              [np.nan, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 3, np.nan],
              [4.0, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 4.0, np.nan],
          ],
      )
      def test_scipy_compat(self, arr):
          # skipped when scipy is not installed
          sp_stats = pytest.importorskip("scipy.stats")

          values = np.array(arr)
          non_finite = ~np.isfinite(values)

          # rank first: ``values`` is overwritten in place right below
          ranked = libalgos.rank_1d(values)

          # build the reference from scipy: rank with the non-finite slots
          # set to inf, then blank those slots out as NaN
          values[non_finite] = np.inf
          reference = sp_stats.rankdata(values)
          reference[non_finite] = np.nan

          tm.assert_almost_equal(ranked, reference)

      def test_basic(self, writable, any_int_numpy_dtype):
          # two increasing integers rank as 1.0 and 2.0, whatever the integer
          # dtype and whether or not the buffer is writable
          raw = np.array([1, 100], dtype=any_int_numpy_dtype)
          raw.setflags(write=writable)

          ranked = algos.rank(Series(raw))
          tm.assert_numpy_array_equal(ranked, np.array([1, 2], dtype=np.float64))

      @pytest.mark.parametrize("dtype", [np.float64, np.uint64])
      def test_uint64_overflow(self, dtype):
          # 2**63 does not fit in int64 but must still rank above 1
          ser = Series([1, 2**63], dtype=dtype)
          expected = np.array([1, 2], dtype=np.float64)
          tm.assert_numpy_array_equal(algos.rank(ser), expected)

      @pytest.mark.parametrize("method", ["average", "min", "max"])
      def test_rank_tiny_values(self, method):
          # GH62036: regression test for ranking with tiny float values
          ser = Series(
              [5.4954145e29, -9.791984e-21, 9.3715776e-26, pd.NA, 1.8790257e-28],
              dtype="Float64",
          ).astype(object)

          ranked = algos.rank(ser, method=method)

          # the pd.NA slot is expected to come back as NaN
          expected = np.array([4.0, 1.0, 3.0, np.nan, 2.0], dtype=np.float64)
          tm.assert_numpy_array_equal(ranked, expected)

      def test_too_many_ndims(self):
          # a 3-D input must be rejected
          cube = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
          with pytest.raises(TypeError, match="Array with ndim > 2 are not supported"):
              algos.rank(cube)

      @pytest.mark.single_cpu
      def test_pct_max_many_rows(self):
          # GH 18271
          n_rows = 2**24 + 1

          # 1-D input with more than 2**24 rows
          values = np.arange(n_rows)
          top = algos.rank(values, pct=True).max()
          assert top == 1

          # 2-D input with the same number of rows and two columns
          values = np.arange(2 * n_rows).reshape(n_rows, 2)
          top = algos.rank(values, pct=True).max()
          assert top == 1
  class TestMode:
      # Tests for ``algos.mode`` and ``Series.mode``.
      #
      # For the non-datetimelike inputs below ``algos.mode`` is unpacked as a
      # 2-tuple and only the first item is checked; for the datetime64 and
      # timedelta64 inputs its return value is compared directly.

      def test_no_mode(self):
          # empty input gives an empty float64 result
          empty = Series([], dtype=np.float64, index=Index([], dtype=int))
          modes, _ = algos.mode(np.array([]))
          tm.assert_numpy_array_equal(modes, empty.values)

      def test_mode_single(self, any_real_numpy_dtype):
          # GH 15714
          # (input data, expected modes)
          scenarios = [
              ([1], [1]),
              ([1, 1], [1]),
          ]
          for data, wanted in scenarios:
              ser = Series(data, dtype=any_real_numpy_dtype)
              expected = Series(wanted, dtype=any_real_numpy_dtype)

              modes, _ = algos.mode(ser.values)
              tm.assert_numpy_array_equal(modes, expected.values)
              tm.assert_series_equal(ser.mode(), expected)

      def test_mode_obj_int(self):
          # every value occurs exactly once, so the input is its own mode
          for values, dtype in (([1], int), (["a", "b", "c"], object)):
              expected = Series(values, dtype=dtype)
              modes, _ = algos.mode(expected.values)
              tm.assert_numpy_array_equal(modes, expected.values)

      def test_number_mode(self, any_real_numpy_dtype):
          # (input data, expected modes): one clear winner, then a two-way tie
          scenarios = [
              ([1] * 5 + [2] * 3, [1]),
              ([1] * 5 + [2] * 3 + [3] * 5, [1, 3]),
          ]
          for data, wanted in scenarios:
              ser = Series(data, dtype=any_real_numpy_dtype)
              expected = Series(wanted, dtype=any_real_numpy_dtype)

              modes, _ = algos.mode(ser.values)
              tm.assert_numpy_array_equal(modes, expected.values)
              tm.assert_series_equal(ser.mode(), expected)

      def test_strobj_mode(self):
          # single-character strings stored with dtype "c"
          ser = Series(["a"] * 2 + ["b"] * 3, dtype="c")
          expected = Series(["b"], dtype="c")

          modes, _ = algos.mode(ser.values)
          tm.assert_numpy_array_equal(modes, expected.values)
          tm.assert_series_equal(ser.mode(), expected)

      @pytest.mark.parametrize("dt", [str, object])
      def test_strobj_multi_char(self, dt, using_infer_string):
          ser = Series(["foo"] * 2 + ["bar"] * 3, dtype=dt)
          expected = Series(["bar"], dtype=dt)

          modes, _ = algos.mode(ser.values)
          # with string inference on, the str-dtype result is compared as an
          # extension array; every other combination as a numpy array
          if using_infer_string and dt is str:
              tm.assert_extension_array_equal(modes, expected.values)
          else:
              tm.assert_numpy_array_equal(modes, expected.values)
          tm.assert_series_equal(ser.mode(), expected)

      def test_datelike_mode(self):
          # all values distinct: every value is a mode, in sorted order
          ser = Series(["2011-01-03", "2013-01-02", "1900-05-03"], dtype="M8[ns]")
          expected = Series(
              ["1900-05-03", "2011-01-03", "2013-01-02"], dtype="M8[ns]"
          )
          tm.assert_extension_array_equal(algos.mode(ser.values), expected._values)
          tm.assert_series_equal(ser.mode(), expected)

          # two values tied at two occurrences each
          ser = Series(
              ["2011-01-03", "2013-01-02", "1900-05-03", "2011-01-03", "2013-01-02"],
              dtype="M8[ns]",
          )
          expected = Series(["2011-01-03", "2013-01-02"], dtype="M8[ns]")
          tm.assert_extension_array_equal(algos.mode(ser.values), expected._values)
          tm.assert_series_equal(ser.mode(), expected)

      def test_timedelta_mode(self):
          # all values distinct: every value is a mode, in sorted order
          ser = Series(["1 days", "-1 days", "0 days"], dtype="timedelta64[ns]")
          expected = Series(
              ["-1 days", "0 days", "1 days"], dtype="timedelta64[ns]"
          )
          tm.assert_extension_array_equal(algos.mode(ser.values), expected._values)
          tm.assert_series_equal(ser.mode(), expected)

          # "1 day" and "2 min" are the only values that occur twice
          ser = Series(
              ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],
              dtype="timedelta64[ns]",
          )
          expected = Series(["2 min", "1 day"], dtype="timedelta64[ns]")
          tm.assert_extension_array_equal(algos.mode(ser.values), expected._values)
          tm.assert_series_equal(ser.mode(), expected)

      def test_mixed_dtype(self):
          # ints and strings mixed in one Series
          ser = Series([1, "foo", "foo"])
          expected = Series(["foo"], dtype=object)

          modes, _ = algos.mode(ser.values)
          tm.assert_numpy_array_equal(modes, expected.values)
          tm.assert_series_equal(ser.mode(), expected)

      def test_uint64_overflow(self):
          # 2**63 does not fit in int64
          # (input data, expected modes)
          scenarios = [
              ([1, 2**63, 2**63], [2**63]),
              ([1, 2**63], [1, 2**63]),
          ]
          for data, wanted in scenarios:
              expected = Series(wanted, dtype=np.uint64)
              ser = Series(data, dtype=np.uint64)

              modes, _ = algos.mode(ser.values)
              tm.assert_numpy_array_equal(modes, expected.values)
              tm.assert_series_equal(ser.mode(), expected)

      def test_categorical(self):
          # no repeats: the categorical is its own mode
          cat = Categorical([1, 2])
          tm.assert_categorical_equal(Series(cat).mode()._values, cat)

          # the expected result keeps the full category set
          cat = Categorical([1, "a", "a"])
          expected = Categorical(["a"], categories=[1, "a"])
          tm.assert_categorical_equal(Series(cat).mode()._values, expected)

          cat = Categorical([1, 1, 2, 3, 3])
          expected = Categorical([1, 3], categories=[1, 2, 3])
          tm.assert_categorical_equal(Series(cat).mode()._values, expected)

      def test_index(self):
          # (index labels, expected modes, expected dtype)
          scenarios = [
              ([1, 2, 3], [1, 2, 3], np.int64),
              ([1, "a", "a"], ["a"], object),
              ([1, 1, 2, 3, 3], [1, 3], np.int64),
          ]
          for labels, wanted, dtype in scenarios:
              idx = Index(labels)
              expected = Series(wanted, dtype=dtype)
              modes, _ = algos.mode(idx)
              tm.assert_numpy_array_equal(modes, expected.values)

          idx = Index(
              ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],
              dtype="timedelta64[ns]",
          )
          with pytest.raises(AttributeError, match="TimedeltaIndex"):
              # algos.mode expects Arraylike, does *not* unwrap TimedeltaIndex
              algos.mode(idx)

      def test_ser_mode_with_name(self):
          # GH 46737
          # the Series name is carried over to the result
          named = Series([1, 1, 3], name="foo")
          tm.assert_series_equal(named.mode(), Series([1], name="foo"))
  class TestDiff:
      # Tests for ``algos.diff``.

      @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
      def test_diff_datetimelike_nat(self, dtype):
          # NaT - NaT is NaT, not 0
          values = np.arange(12).astype(np.int64).view(dtype).reshape(3, 4)
          values[:, 2] = values.dtype.type("NaT", "ns")

          nat = np.timedelta64("NaT", "ns")

          diffed = algos.diff(values, 1, axis=0)

          # consecutive rows of the 3x4 grid are 4 apart; the all-NaT column
          # and the first row (nothing to subtract from) stay NaT
          expected = np.ones(values.shape, dtype="timedelta64[ns]") * 4
          expected[:, 2] = nat
          expected[0, :] = nat
          tm.assert_numpy_array_equal(diffed, expected)

          # same check on the transposed data along the other axis
          diffed = algos.diff(values.T, 1, axis=1)
          tm.assert_numpy_array_equal(diffed, expected.T)

      def test_diff_ea_axis(self):
          # the DatetimeArray built here is 1-D, so axis=1 must be refused
          dta = date_range("2016-01-01", periods=3, tz="US/Pacific")._data

          with pytest.raises(ValueError, match="cannot diff DatetimeArray on axis=1"):
              algos.diff(dta, 1, axis=1)

      @pytest.mark.parametrize("dtype", ["int8", "int16"])
      def test_diff_low_precision_int(self, dtype):
          # int8 / int16 input is expected back as float32, NaN in first slot
          values = np.array([0, 1, 1, 0, 0], dtype=dtype)
          diffed = algos.diff(values, 1)
          expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32")
          tm.assert_numpy_array_equal(diffed, expected)
  @pytest.mark.parametrize("op", [np.array, pd.array])
  def test_union_with_duplicates(op):
      # GH#36289
      # ``op`` builds either a numpy array or a pandas array from the same
      # values; the union is computed the same way for both and only the
      # comparison helper depends on the container type.
      left = op([3, 1, 3, 4])
      right = op([2, 3, 1, 1])
      expected = op([3, 3, 1, 1, 4, 2])

      result = algos.union_with_duplicates(left, right)

      if isinstance(expected, np.ndarray):
          assert_equal = tm.assert_numpy_array_equal
      else:
          assert_equal = tm.assert_extension_array_equal
      assert_equal(result, expected)