test_arrow.py 115 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425
  1. """
  2. This file contains a minimal set of tests for compliance with the extension
  3. array interface test suite, and should contain no other tests.
  4. The test suite for the full functionality of the array is located in
  5. `pandas/tests/arrays/`.
  6. The tests in this file are inherited from the BaseExtensionTests, and only
  7. minimal tweaks should be applied to get the tests passing (by overwriting a
  8. parent method).
  9. Additional tests should either be added to one of the BaseExtensionTests
  10. classes (if they are relevant for the extension interface for all dtypes), or
  11. be added to the array-specific tests in `pandas/tests/arrays/`.
  12. """
  13. from __future__ import annotations
  14. from datetime import (
  15. date,
  16. datetime,
  17. time,
  18. timedelta,
  19. )
  20. from decimal import Decimal
  21. from io import (
  22. BytesIO,
  23. StringIO,
  24. )
  25. import operator
  26. import pickle
  27. import re
  28. import numpy as np
  29. import pytest
  30. from pandas._libs import lib
  31. from pandas._libs.tslibs import timezones
  32. from pandas.compat import (
  33. PY311,
  34. PY312,
  35. is_ci_environment,
  36. is_platform_windows,
  37. pa_version_under11p0,
  38. pa_version_under13p0,
  39. pa_version_under14p0,
  40. pa_version_under20p0,
  41. pa_version_under21p0,
  42. )
  43. from pandas.core.dtypes.dtypes import (
  44. ArrowDtype,
  45. CategoricalDtypeType,
  46. )
  47. import pandas as pd
  48. import pandas._testing as tm
  49. from pandas.api.extensions import no_default
  50. from pandas.api.types import (
  51. is_bool_dtype,
  52. is_float_dtype,
  53. is_integer_dtype,
  54. is_numeric_dtype,
  55. is_signed_integer_dtype,
  56. is_string_dtype,
  57. is_unsigned_integer_dtype,
  58. )
  59. from pandas.tests.extension import base
  60. pa = pytest.importorskip("pyarrow")
  61. from pandas.core.arrays.arrow.array import ArrowExtensionArray
  62. from pandas.core.arrays.arrow.extension_types import ArrowPeriodType
  def _require_timezone_database(request):
      """
      Apply an xfail marker to the requesting test on Windows CI runs.

      The marker is only added when both ``is_platform_windows()`` and
      ``is_ci_environment()`` are true. It expects ``pa.ArrowInvalid``; the
      reason text records the outstanding TODO about pointing
      ``ARROW_TIMEZONE_DATABASE`` at the tzdata used by pyarrow.

      Parameters
      ----------
      request : pytest.FixtureRequest
          Request object of the test that should receive the marker.
      """
      if not (is_platform_windows() and is_ci_environment()):
          return
      reason = (
          "TODO: Set ARROW_TIMEZONE_DATABASE environment variable "
          "on CI to path to the tzdata for pyarrow."
      )
      request.applymarker(pytest.mark.xfail(raises=pa.ArrowInvalid, reason=reason))
  @pytest.fixture(params=tm.ALL_PYARROW_DTYPES, ids=str)
  def dtype(request):
      """ArrowDtype built from each pyarrow type in ``tm.ALL_PYARROW_DTYPES``."""
      pyarrow_type = request.param
      return ArrowDtype(pyarrow_dtype=pyarrow_type)
  @pytest.fixture
  def data(dtype):
      """
      Length-100 array for the parametrized pyarrow-backed ``dtype``.

      Every supported type uses the same layout: a pair repeated 4 times, a
      missing value, a second pair repeated 44 times, another missing value
      and a final pair (8 + 1 + 88 + 1 + 2 = 100 elements). The first two
      elements are always valid and differ from each other.

      Raises
      ------
      NotImplementedError
          If the pyarrow type matches none of the branches below.
      """

      def _layout(head, body, tail):
          # head x4, NA, body x44, NA, tail
          return head * 4 + [None] + body * 44 + [None] + tail

      pa_dtype = dtype.pyarrow_dtype
      if pa.types.is_boolean(pa_dtype):
          values = _layout([True, False], [True, False], [True, False])
      elif pa.types.is_floating(pa_dtype):
          values = _layout([1.0, 0.0], [-2.0, -1.0], [0.5, 99.5])
      elif pa.types.is_signed_integer(pa_dtype):
          values = _layout([1, 0], [-2, -1], [1, 99])
      elif pa.types.is_unsigned_integer(pa_dtype):
          values = _layout([1, 0], [2, 1], [1, 99])
      elif pa.types.is_decimal(pa_dtype):
          values = _layout(
              [Decimal("1"), Decimal("0.0")],
              [Decimal("-2.0"), Decimal("-1.0")],
              [Decimal("0.5"), Decimal("33.123")],
          )
      elif pa.types.is_date(pa_dtype):
          values = _layout(
              [date(2022, 1, 1), date(1999, 12, 31)],
              [date(2022, 1, 1), date(2022, 1, 1)],
              [date(1999, 12, 31), date(1999, 12, 31)],
          )
      elif pa.types.is_timestamp(pa_dtype):
          values = _layout(
              [datetime(2020, 1, 1, 1, 1, 1, 1), datetime(1999, 1, 1, 1, 1, 1, 1)],
              [datetime(2020, 1, 1, 1), datetime(1999, 1, 1, 1)],
              [datetime(2020, 1, 1), datetime(1999, 1, 1)],
          )
      elif pa.types.is_duration(pa_dtype):
          values = _layout(
              [timedelta(1), timedelta(1, 1)],
              [timedelta(-1), timedelta(0)],
              [timedelta(-10), timedelta(10)],
          )
      elif pa.types.is_time(pa_dtype):
          values = _layout(
              [time(12, 0), time(0, 12)],
              [time(0, 0), time(1, 1)],
              [time(0, 5), time(5, 0)],
          )
      elif pa.types.is_string(pa_dtype):
          values = _layout(["a", "b"], ["1", "2"], ["!", ">"])
      elif pa.types.is_binary(pa_dtype):
          values = _layout([b"a", b"b"], [b"1", b"2"], [b"!", b">"])
      else:
          raise NotImplementedError
      return pd.array(values, dtype=dtype)
  @pytest.fixture
  def data_missing(data):
      """Length-2 array ``[NA, data[0]]`` with the same array type and dtype as ``data``."""
      array_cls = type(data)
      first_value = data[0]
      return array_cls._from_sequence([None, first_value], dtype=data.dtype)
  @pytest.fixture(params=["data", "data_missing"])
  def all_data(request, data, data_missing):
      """
      Parametrized fixture returning either the 'data' or 'data_missing' array.

      Lets a test run once against the full array and once against the
      short array that starts with a missing value. Any other parameter
      value yields ``None``.
      """
      selected = request.param
      if selected == "data":
          return data
      if selected == "data_missing":
          return data_missing
      return None
  @pytest.fixture
  def data_for_grouping(dtype):
      """
      Data for factorization, grouping, and unique tests.

      The result is laid out as [B, B, NA, NA, A, A, B, C], where NA is
      missing and the scalars are meant to satisfy A < B < C. For boolean
      types only two distinct values exist, so B and C are both ``True``.

      Raises
      ------
      NotImplementedError
          If the pyarrow type matches none of the branches below.
      """
      pa_dtype = dtype.pyarrow_dtype
      if pa.types.is_boolean(pa_dtype):
          low, mid, high = False, True, True
      elif pa.types.is_floating(pa_dtype):
          low, mid, high = -1.1, 0.0, 1.1
      elif pa.types.is_signed_integer(pa_dtype):
          low, mid, high = -1, 0, 1
      elif pa.types.is_unsigned_integer(pa_dtype):
          low, mid, high = 0, 1, 10
      elif pa.types.is_date(pa_dtype):
          low, mid, high = date(1999, 12, 31), date(2010, 1, 1), date(2022, 1, 1)
      elif pa.types.is_timestamp(pa_dtype):
          low = datetime(1999, 1, 1, 1, 1, 1, 1)
          mid = datetime(2020, 1, 1)
          high = datetime(2020, 1, 1, 1)
      elif pa.types.is_duration(pa_dtype):
          low, mid, high = timedelta(-1), timedelta(0), timedelta(1, 4)
      elif pa.types.is_time(pa_dtype):
          low, mid, high = time(0, 0), time(0, 12), time(12, 12)
      elif pa.types.is_string(pa_dtype):
          low, mid, high = "a", "b", "c"
      elif pa.types.is_binary(pa_dtype):
          low, mid, high = b"a", b"b", b"c"
      elif pa.types.is_decimal(pa_dtype):
          low, mid, high = Decimal("-1.1"), Decimal("0.0"), Decimal("1.1")
      else:
          raise NotImplementedError
      layout = [mid, mid, None, None, low, low, mid, high]
      return pd.array(layout, dtype=dtype)
  @pytest.fixture
  def data_for_sorting(data_for_grouping):
      """
      Length-3 array with a known sort order.

      Built from positions 0, 7 and 4 of ``data_for_grouping``, i.e. the
      three items [B, C, A] with A < B < C.
      """
      grouped = data_for_grouping
      b_item, c_item, a_item = grouped[0], grouped[7], grouped[4]
      return type(grouped)._from_sequence(
          [b_item, c_item, a_item], dtype=grouped.dtype
      )
  @pytest.fixture
  def data_missing_for_sorting(data_for_grouping):
      """
      Length-3 array with a known sort order and one missing value.

      Built from positions 0, 2 and 4 of ``data_for_grouping``, i.e. the
      three items [B, NA, A] with A < B and NA missing.
      """
      grouped = data_for_grouping
      b_item, na_item, a_item = grouped[0], grouped[2], grouped[4]
      return type(grouped)._from_sequence(
          [b_item, na_item, a_item], dtype=grouped.dtype
      )
  @pytest.fixture
  def data_for_twos(data):
      """
      Length-100 array in which all the elements are two.

      Only integer, floating, decimal and duration types get the array of
      twos; for every other type ``data`` is handed back unchanged.
      """
      pa_dtype = data.dtype.pyarrow_dtype
      type_checks = (
          pa.types.is_integer,
          pa.types.is_floating,
          pa.types.is_decimal,
          pa.types.is_duration,
      )
      if not any(check(pa_dtype) for check in type_checks):
          # tests will be xfailed where 2 is not a valid scalar for pa_dtype
          # TODO: skip otherwise?
          return data
      return pd.array([2] * 100, dtype=data.dtype)
  238. class TestArrowArray(base.ExtensionTests):
  def test_compare_scalar(self, data, comparison_op):
      """Compare a Series of ``data`` against its own first element."""
      series = pd.Series(data)
      scalar = data[0]
      self._compare_other(series, data, comparison_op, scalar)
  @pytest.mark.parametrize("na_action", [None, "ignore"])
  def test_map(self, data_missing, na_action):
      """Mapping the identity function must match the ``to_numpy`` conversion."""
      is_datetimelike = data_missing.dtype.kind in "mM"
      result = data_missing.map(lambda x: x, na_action=na_action)
      if is_datetimelike:
          expected = data_missing.to_numpy(dtype=object)
      elif data_missing.dtype == "float32[pyarrow]":
          # map roundtrips through objects, which converts to float64
          expected = data_missing.to_numpy(dtype="float64", na_value=np.nan)
      else:
          expected = data_missing.to_numpy()
      tm.assert_numpy_array_equal(result, expected)
  def test_astype_str(self, data, request, using_infer_string):
      """
      Run the base ``astype(str)`` test, xfailing the known mismatches.

      Binary types are always marked xfail. Timezone-naive timestamps and
      durations are marked xfail only when ``using_infer_string`` is false.
      """
      pa_dtype = data.dtype.pyarrow_dtype
      xfail_reason = None
      if pa.types.is_binary(pa_dtype):
          xfail_reason = f"For {pa_dtype} .astype(str) decodes."
      elif not using_infer_string:
          naive_timestamp = pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None
          if naive_timestamp or pa.types.is_duration(pa_dtype):
              xfail_reason = "pd.Timestamp/pd.Timedelta repr different from numpy repr"
      if xfail_reason is not None:
          request.applymarker(pytest.mark.xfail(reason=xfail_reason))
      super().test_astype_str(data)
  def test_from_dtype(self, data, request):
      """Run the base ``from_dtype`` test, xfailing string and decimal types."""
      pa_dtype = data.dtype.pyarrow_dtype
      xfail_reason = None
      if pa.types.is_string(pa_dtype):
          xfail_reason = "ArrowDtype(pa.string()) != StringDtype('pyarrow')"
      elif pa.types.is_decimal(pa_dtype):
          xfail_reason = f"pyarrow.type_for_alias cannot infer {pa_dtype}"
      if xfail_reason is not None:
          request.applymarker(pytest.mark.xfail(reason=xfail_reason))
      super().test_from_dtype(data)
  def test_from_sequence_pa_array(self, data):
      """
      ``_from_sequence`` accepts pyarrow input and stores a ChunkedArray.

      Checked for both the array's own ``_pa_array`` and the result of
      ``combine_chunks()`` on it.
      """
      # https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784
      # data._pa_array = pa.ChunkedArray
      array_cls = type(data)
      chunked = data._pa_array
      from_chunked = array_cls._from_sequence(chunked, dtype=data.dtype)
      tm.assert_extension_array_equal(from_chunked, data)
      assert isinstance(from_chunked._pa_array, pa.ChunkedArray)

      combined = chunked.combine_chunks()
      from_combined = array_cls._from_sequence(combined, dtype=data.dtype)
      tm.assert_extension_array_equal(from_combined, data)
      assert isinstance(from_combined._pa_array, pa.ChunkedArray)
  def test_from_sequence_pa_array_notimplemented(self, request):
      """Parsing strings into ``month_day_nano_interval`` raises NotImplementedError."""
      expected_msg = "Converting strings to"
      strings = ["12-1"]
      with pytest.raises(NotImplementedError, match=expected_msg):
          ArrowExtensionArray._from_sequence_of_strings(
              strings, dtype=pa.month_day_nano_interval()
          )
  def test_from_sequence_of_strings_pa_array(self, data, request):
      """
      Round-trip ``data`` through pyarrow strings and back.

      The array is cast to ``pa.string()`` and parsed again with
      ``_from_sequence_of_strings``, once as returned by the cast and once
      after ``combine_chunks()``. Known parsing gaps are marked xfail first.
      """
      pa_dtype = data.dtype.pyarrow_dtype
      xfail = pytest.mark.xfail
      if pa.types.is_time64(pa_dtype) and pa_dtype.equals("time64[ns]") and not PY311:
          request.applymarker(xfail(reason="Nanosecond time parsing not supported."))
      elif pa_version_under11p0 and (
          pa.types.is_duration(pa_dtype) or pa.types.is_decimal(pa_dtype)
      ):
          request.applymarker(
              xfail(
                  raises=pa.ArrowNotImplementedError,
                  reason=f"pyarrow doesn't support parsing {pa_dtype}",
              )
          )
      elif pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None:
          _require_timezone_database(request)

      array_cls = type(data)
      as_strings = data._pa_array.cast(pa.string())
      parsed = array_cls._from_sequence_of_strings(as_strings, dtype=data.dtype)
      tm.assert_extension_array_equal(parsed, data)

      combined_strings = as_strings.combine_chunks()
      parsed = array_cls._from_sequence_of_strings(combined_strings, dtype=data.dtype)
      tm.assert_extension_array_equal(parsed, data)
  def check_accumulate(self, ser, op_name, skipna):
      """
      Compare an accumulation on ``ser`` with the same op on a Float64 copy.

      Temporal inputs (and their results) are first cast to the pyarrow
      integer type of matching bit width, so the check is against the
      integer behavior. Dtypes are not compared.
      """
      result = getattr(ser, op_name)(skipna=skipna)
      baseline = ser

      pa_type = ser.dtype.pyarrow_dtype
      if pa.types.is_temporal(pa_type):
          # Just check that we match the integer behavior.
          int_type = "int32[pyarrow]" if pa_type.bit_width == 32 else "int64[pyarrow]"
          baseline = baseline.astype(int_type)
          result = result.astype(int_type)

      result = result.astype("Float64")
      float_baseline = baseline.astype("Float64")
      expected = getattr(float_baseline, op_name)(skipna=skipna)
      tm.assert_series_equal(result, expected, check_dtype=False)
  def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
      """
      Return whether ``op_name`` is expected to work for ``ser``'s pyarrow type.

      Each type family has a fixed set of unsupported accumulations; types
      that match no family support everything.
      """
      # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no
      # attribute "pyarrow_dtype"
      pa_type = ser.dtype.pyarrow_dtype  # type: ignore[union-attr]

      unsupported: tuple[str, ...]
      if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type):
          unsupported = ("cumsum", "cumprod", "cummax", "cummin")
      elif pa.types.is_string(pa_type):
          unsupported = ("cumprod",)
      elif pa.types.is_boolean(pa_type):
          unsupported = ("cumprod", "cummax", "cummin")
      elif pa.types.is_temporal(pa_type):
          # durations are the only temporal type that can be cumulatively summed
          if pa.types.is_duration(pa_type):
              unsupported = ("cumprod",)
          else:
              unsupported = ("cumsum", "cumprod")
      else:
          unsupported = ()
      return op_name not in unsupported
  @pytest.mark.parametrize("skipna", [True, False])
  def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request):
      """
      Check cumulative operations on a Series backed by ``data``.

      Flow:
      * string cumsum/cummin/cummax return early (covered elsewhere);
      * combinations rejected by ``_supports_accumulation`` are delegated
        to the base class test, which checks that they raise;
      * non-cumsum operations on pyarrow older than 13 are skipped or
        xfailed;
      * cumsum on boolean/decimal is xfailed with ``TypeError``;
      * anything left is verified through ``check_accumulate``.
      """
      pa_type = data.dtype.pyarrow_dtype
      op_name = all_numeric_accumulations

      if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]:
          # https://github.com/pandas-dev/pandas/pull/60633
          # Doesn't fit test structure, tested in series/test_cumulative.py instead.
          return

      ser = pd.Series(data)

      if not self._supports_accumulation(ser, op_name):
          # The base class test will check that we raise
          return super().test_accumulate_series(data, op_name, skipna)

      if pa_version_under13p0 and op_name != "cumsum":
          # The guard above is pa_version_under13p0, so the message names
          # that version (it previously claimed "pyarrow < 9").
          reason = f"{op_name} not implemented for pyarrow < 13"
          # xfailing takes a long time to run because pytest
          # renders the exception messages even when not showing them
          opt = request.config.option
          if opt.markexpr and "not slow" in opt.markexpr:
              pytest.skip(reason)
          request.applymarker(pytest.mark.xfail(reason=reason))
      elif op_name == "cumsum" and (
          pa.types.is_boolean(pa_type) or pa.types.is_decimal(pa_type)
      ):
          request.applymarker(
              pytest.mark.xfail(
                  reason=f"{op_name} not implemented for {pa_type}",
                  raises=TypeError,
              )
          )

      self.check_accumulate(ser, op_name, skipna)
  def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
      """
      Return whether reduction ``op_name`` is expected to work for ``ser``.

      ``kurt`` is never supported, and ``skew`` is unsupported when
      ``pa_version_under20p0`` is set. Otherwise the answer depends on the
      pyarrow type family of the Series.
      """
      if op_name == "kurt":
          return False
      if pa_version_under20p0 and op_name == "skew":
          return False

      # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has
      # no attribute "pyarrow_dtype"
      pa_dtype = ser.dtype.pyarrow_dtype  # type: ignore[union-attr]
      temporal = pa.types.is_temporal(pa_dtype)

      if temporal and op_name in ("sum", "var", "skew", "kurt", "prod"):
          # summing timedeltas is one case that *is* well-defined
          summing_durations = op_name in ("sum",) and pa.types.is_duration(pa_dtype)
          if not summing_durations:
              return False
      elif pa.types.is_binary(pa_dtype) and op_name in ("sum", "skew"):
          return False
      elif (
          pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
      ) and op_name in (
          "mean",
          "median",
          "prod",
          "std",
          "sem",
          "var",
          "skew",
          "kurt",
      ):
          return False

      # xref GH#34479 we support this in our non-pyarrow datetime64 dtypes,
      # but it isn't obvious we _should_. For now, we keep the pyarrow
      # behavior which does not support this.
      rejects_any_all = (
          temporal
          and not pa.types.is_duration(pa_dtype)
          and op_name in ("any", "all")
      )
      return not rejects_any_all
  def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
      """
      Compare a reduction on ``ser`` against a reference Series.

      Integer and floating inputs use a ``Float64`` copy as the reference;
      every other type is compared against itself. ``count`` is called
      without arguments, all other reductions receive ``skipna``.
      """
      # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no
      # attribute "pyarrow_dtype"
      pa_dtype = ser.dtype.pyarrow_dtype  # type: ignore[union-attr]
      is_numeric = pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype)
      # TODO: in the non-numeric case, aren't we testing... nothing? For
      # e.g. date/time dtypes trying to calculate 'expected' by converting
      # to object will raise for mean, std etc
      alt = ser.astype("Float64") if is_numeric else ser

      kwargs = {} if op_name == "count" else {"skipna": skipna}
      result = getattr(ser, op_name)(**kwargs)
      expected = getattr(alt, op_name)(**kwargs)
      tm.assert_almost_equal(result, expected)
  @pytest.mark.parametrize("skipna", [True, False])
  def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request):
      """
      Run the base numeric-reduction test, xfailing the known gaps.

      Boolean sem/std/var/median are xfailed with ``TypeError``. When
      ``pa_version_under20p0`` is false, ``skew`` is xfailed for booleans
      and, with ``skipna``, for integer and floating types.
      """
      op_name = all_numeric_reductions
      pa_dtype = data.dtype.pyarrow_dtype
      not_implemented = pytest.mark.xfail(
          raises=TypeError,
          reason=(
              f"{op_name} is not implemented in "
              f"pyarrow={pa.__version__} for {pa_dtype}"
          ),
      )
      is_bool = pa.types.is_boolean(pa_dtype)
      if is_bool and op_name in {"sem", "std", "var", "median"}:
          request.applymarker(not_implemented)
      elif not pa_version_under20p0 and op_name == "skew":
          skipna_numeric = skipna and (
              pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype)
          )
          if is_bool or skipna_numeric:
              request.applymarker(
                  pytest.mark.xfail(
                      reason="https://github.com/apache/arrow/issues/45733",
                  )
              )
      super().test_reduce_series_numeric(data, op_name, skipna)
  @pytest.mark.parametrize("skipna", [True, False])
  def test_reduce_series_boolean(
      self, data, all_boolean_reductions, skipna, na_value, request
  ):
      """Run the base boolean-reduction test; string and binary types are xfailed."""
      op_name = all_boolean_reductions
      pa_dtype = data.dtype.pyarrow_dtype
      not_implemented = pytest.mark.xfail(
          raises=TypeError,
          reason=(
              f"{op_name} is not implemented in "
              f"pyarrow={pa.__version__} for {pa_dtype}"
          ),
      )
      is_text_like = pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
      if is_text_like:
          # We *might* want to make this behave like the non-pyarrow cases,
          # but have not yet decided.
          request.applymarker(not_implemented)

      return super().test_reduce_series_boolean(data, op_name, skipna)
  def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
      """
      Return the dtype a reduction ``op_name`` on ``arr`` is expected to have.

      ``skipna`` is accepted but does not affect the result.
      """
      pa_type = arr._pa_array.type

      if op_name in ["max", "min"]:
          return arr.dtype

      if arr.dtype.name == "decimal128(7, 3)[pyarrow]":
          if op_name == "sum" and not pa_version_under21p0:
              # https://github.com/apache/arrow/pull/44184
              return ArrowDtype(pa.decimal128(38, 3))
          if op_name in ["median", "var", "std", "skew"]:
              return "float64[pyarrow]"
          return arr.dtype

      if op_name in ["median", "var", "std", "mean", "skew"]:
          return "float64[pyarrow]"

      if op_name == "sum" and pa.types.is_string(pa_type):
          return arr.dtype

      # remaining reductions widen to the 64-bit type of the same kind
      widened_by_kind = {
          "i": "int64[pyarrow]",
          "u": "uint64[pyarrow]",
          "f": "float64[pyarrow]",
      }
      return widened_by_kind[arr.dtype.kind]
  537. @pytest.mark.parametrize("skipna", [True, False])
  538. def test_reduce_frame(self, data, all_numeric_reductions, skipna, request):
  539. op_name = all_numeric_reductions
  540. if op_name == "skew" and pa_version_under20p0:
  541. if data.dtype._is_numeric:
  542. mark = pytest.mark.xfail(reason="skew not implemented")
  543. request.applymarker(mark)
  544. return super().test_reduce_frame(data, all_numeric_reductions, skipna)
  545. @pytest.mark.parametrize("typ", ["int64", "uint64", "float64"])
  546. def test_median_not_approximate(self, typ):
  547. # GH 52679
  548. result = pd.Series([1, 2], dtype=f"{typ}[pyarrow]").median()
  549. assert result == 1.5
  550. def test_construct_from_string_own_name(self, dtype, request):
  551. pa_dtype = dtype.pyarrow_dtype
  552. if pa.types.is_decimal(pa_dtype):
  553. request.applymarker(
  554. pytest.mark.xfail(
  555. raises=NotImplementedError,
  556. reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}",
  557. )
  558. )
  559. if pa.types.is_string(pa_dtype):
  560. # We still support StringDtype('pyarrow') over ArrowDtype(pa.string())
  561. msg = r"string\[pyarrow\] should be constructed by StringDtype"
  562. with pytest.raises(TypeError, match=msg):
  563. dtype.construct_from_string(dtype.name)
  564. return
  565. super().test_construct_from_string_own_name(dtype)
  566. def test_is_dtype_from_name(self, dtype, request):
  567. pa_dtype = dtype.pyarrow_dtype
  568. if pa.types.is_string(pa_dtype):
  569. # We still support StringDtype('pyarrow') over ArrowDtype(pa.string())
  570. assert not type(dtype).is_dtype(dtype.name)
  571. else:
  572. if pa.types.is_decimal(pa_dtype):
  573. request.applymarker(
  574. pytest.mark.xfail(
  575. raises=NotImplementedError,
  576. reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}",
  577. )
  578. )
  579. super().test_is_dtype_from_name(dtype)
  580. def test_construct_from_string_another_type_raises(self, dtype):
  581. msg = r"'another_type' must end with '\[pyarrow\]'"
  582. with pytest.raises(TypeError, match=msg):
  583. type(dtype).construct_from_string("another_type")
  584. def test_get_common_dtype(self, dtype, request):
  585. pa_dtype = dtype.pyarrow_dtype
  586. if (
  587. pa.types.is_date(pa_dtype)
  588. or pa.types.is_time(pa_dtype)
  589. or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None)
  590. or pa.types.is_binary(pa_dtype)
  591. or pa.types.is_decimal(pa_dtype)
  592. ):
  593. request.applymarker(
  594. pytest.mark.xfail(
  595. reason=(
  596. f"{pa_dtype} does not have associated numpy "
  597. f"dtype findable by find_common_type"
  598. )
  599. )
  600. )
  601. super().test_get_common_dtype(dtype)
  602. def test_is_not_string_type(self, dtype):
  603. pa_dtype = dtype.pyarrow_dtype
  604. if pa.types.is_string(pa_dtype):
  605. assert is_string_dtype(dtype)
  606. else:
  607. super().test_is_not_string_type(dtype)
  608. @pytest.mark.xfail(
  609. reason="GH 45419: pyarrow.ChunkedArray does not support views.", run=False
  610. )
  611. def test_view(self, data):
  612. super().test_view(data)
  613. def test_fillna_no_op_returns_copy(self, data):
  614. data = data[~data.isna()]
  615. valid = data[0]
  616. result = data.fillna(valid)
  617. assert result is not data
  618. tm.assert_extension_array_equal(result, data)
  619. result = data.fillna(method="backfill")
  620. assert result is not data
  621. tm.assert_extension_array_equal(result, data)
  622. @pytest.mark.xfail(
  623. reason="GH 45419: pyarrow.ChunkedArray does not support views", run=False
  624. )
  625. def test_transpose(self, data):
  626. super().test_transpose(data)
  627. @pytest.mark.xfail(
  628. reason="GH 45419: pyarrow.ChunkedArray does not support views", run=False
  629. )
  630. def test_setitem_preserves_views(self, data):
  631. super().test_setitem_preserves_views(data)
  632. @pytest.mark.parametrize("dtype_backend", ["pyarrow", no_default])
  633. @pytest.mark.parametrize("engine", ["c", "python"])
  634. def test_EA_types(self, engine, data, dtype_backend, request):
  635. pa_dtype = data.dtype.pyarrow_dtype
  636. if pa.types.is_decimal(pa_dtype):
  637. request.applymarker(
  638. pytest.mark.xfail(
  639. raises=NotImplementedError,
  640. reason=f"Parameterized types {pa_dtype} not supported.",
  641. )
  642. )
  643. elif pa.types.is_timestamp(pa_dtype) and pa_dtype.unit in ("us", "ns"):
  644. request.applymarker(
  645. pytest.mark.xfail(
  646. raises=ValueError,
  647. reason="https://github.com/pandas-dev/pandas/issues/49767",
  648. )
  649. )
  650. elif pa.types.is_binary(pa_dtype):
  651. request.applymarker(
  652. pytest.mark.xfail(reason="CSV parsers don't correctly handle binary")
  653. )
  654. df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))})
  655. csv_output = df.to_csv(index=False, na_rep=np.nan)
  656. if pa.types.is_binary(pa_dtype):
  657. csv_output = BytesIO(csv_output)
  658. else:
  659. csv_output = StringIO(csv_output)
  660. result = pd.read_csv(
  661. csv_output,
  662. dtype={"with_dtype": str(data.dtype)},
  663. engine=engine,
  664. dtype_backend=dtype_backend,
  665. )
  666. expected = df
  667. tm.assert_frame_equal(result, expected)
  668. def test_invert(self, data, request):
  669. pa_dtype = data.dtype.pyarrow_dtype
  670. if not (
  671. pa.types.is_boolean(pa_dtype)
  672. or pa.types.is_integer(pa_dtype)
  673. or pa.types.is_string(pa_dtype)
  674. ):
  675. request.applymarker(
  676. pytest.mark.xfail(
  677. raises=pa.ArrowNotImplementedError,
  678. reason=f"pyarrow.compute.invert does support {pa_dtype}",
  679. )
  680. )
  681. if PY312 and pa.types.is_boolean(pa_dtype):
  682. with tm.assert_produces_warning(
  683. DeprecationWarning, match="Bitwise inversion", check_stacklevel=False
  684. ):
  685. super().test_invert(data)
  686. else:
  687. super().test_invert(data)
  688. @pytest.mark.parametrize("periods", [1, -2])
  689. def test_diff(self, data, periods, request):
  690. pa_dtype = data.dtype.pyarrow_dtype
  691. if pa.types.is_unsigned_integer(pa_dtype) and periods == 1:
  692. request.applymarker(
  693. pytest.mark.xfail(
  694. raises=pa.ArrowInvalid,
  695. reason=(
  696. f"diff with {pa_dtype} and periods={periods} will overflow"
  697. ),
  698. )
  699. )
  700. super().test_diff(data, periods)
  701. def test_value_counts_returns_pyarrow_int64(self, data):
  702. # GH 51462
  703. data = data[:10]
  704. result = data.value_counts()
  705. assert result.dtype == ArrowDtype(pa.int64())
# Dtype name expected for "combine with <=" results.
# NOTE(review): presumably read by the inherited extension tests - confirm
# against the base class.
_combine_le_expected_dtype = "bool[pyarrow]"
  707. def get_op_from_name(self, op_name):
  708. short_opname = op_name.strip("_")
  709. if short_opname == "rtruediv":
  710. # use the numpy version that won't raise on division by zero
  711. def rtruediv(x, y):
  712. return np.divide(y, x)
  713. return rtruediv
  714. elif short_opname == "rfloordiv":
  715. return lambda x, y: np.floor_divide(y, x)
  716. return tm.get_op_from_name(op_name)
  717. def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
  718. # BaseOpsUtil._combine can upcast expected dtype
  719. # (because it generates expected on python scalars)
  720. # while ArrowExtensionArray maintains original type
  721. expected = pointwise_result
  722. if op_name in ["eq", "ne", "lt", "le", "gt", "ge"]:
  723. return pointwise_result.astype("boolean[pyarrow]")
  724. was_frame = False
  725. if isinstance(expected, pd.DataFrame):
  726. was_frame = True
  727. expected_data = expected.iloc[:, 0]
  728. original_dtype = obj.iloc[:, 0].dtype
  729. else:
  730. expected_data = expected
  731. original_dtype = obj.dtype
  732. orig_pa_type = original_dtype.pyarrow_dtype
  733. if not was_frame and isinstance(other, pd.Series):
  734. # i.e. test_arith_series_with_array
  735. if not (
  736. pa.types.is_floating(orig_pa_type)
  737. or (
  738. pa.types.is_integer(orig_pa_type)
  739. and op_name not in ["__truediv__", "__rtruediv__"]
  740. )
  741. or pa.types.is_duration(orig_pa_type)
  742. or pa.types.is_timestamp(orig_pa_type)
  743. or pa.types.is_date(orig_pa_type)
  744. or pa.types.is_decimal(orig_pa_type)
  745. ):
  746. # base class _combine always returns int64, while
  747. # ArrowExtensionArray does not upcast
  748. return expected
  749. elif not (
  750. (op_name == "__floordiv__" and pa.types.is_integer(orig_pa_type))
  751. or pa.types.is_duration(orig_pa_type)
  752. or pa.types.is_timestamp(orig_pa_type)
  753. or pa.types.is_date(orig_pa_type)
  754. or pa.types.is_decimal(orig_pa_type)
  755. ):
  756. # base class _combine always returns int64, while
  757. # ArrowExtensionArray does not upcast
  758. return expected
  759. pa_expected = pa.array(expected_data._values)
  760. if pa.types.is_duration(pa_expected.type):
  761. if pa.types.is_date(orig_pa_type):
  762. if pa.types.is_date64(orig_pa_type):
  763. # TODO: why is this different vs date32?
  764. unit = "ms"
  765. else:
  766. unit = "s"
  767. else:
  768. # pyarrow sees sequence of datetime/timedelta objects and defaults
  769. # to "us" but the non-pointwise op retains unit
  770. # timestamp or duration
  771. unit = orig_pa_type.unit
  772. if type(other) in [datetime, timedelta] and unit in ["s", "ms"]:
  773. # pydatetime/pytimedelta objects have microsecond reso, so we
  774. # take the higher reso of the original and microsecond. Note
  775. # this matches what we would do with DatetimeArray/TimedeltaArray
  776. unit = "us"
  777. pa_expected = pa_expected.cast(f"duration[{unit}]")
  778. elif pa.types.is_decimal(pa_expected.type) and pa.types.is_decimal(
  779. orig_pa_type
  780. ):
  781. # decimal precision can resize in the result type depending on data
  782. # just compare the float values
  783. alt = getattr(obj, op_name)(other)
  784. alt_dtype = tm.get_dtype(alt)
  785. assert isinstance(alt_dtype, ArrowDtype)
  786. if op_name == "__pow__" and isinstance(other, Decimal):
  787. # TODO: would it make more sense to retain Decimal here?
  788. alt_dtype = ArrowDtype(pa.float64())
  789. elif (
  790. op_name == "__pow__"
  791. and isinstance(other, pd.Series)
  792. and other.dtype == original_dtype
  793. ):
  794. # TODO: would it make more sense to retain Decimal here?
  795. alt_dtype = ArrowDtype(pa.float64())
  796. else:
  797. assert pa.types.is_decimal(alt_dtype.pyarrow_dtype)
  798. return expected.astype(alt_dtype)
  799. else:
  800. pa_expected = pa_expected.cast(orig_pa_type)
  801. pd_expected = type(expected_data._values)(pa_expected)
  802. if was_frame:
  803. expected = pd.DataFrame(
  804. pd_expected, index=expected.index, columns=expected.columns
  805. )
  806. else:
  807. expected = pd.Series(pd_expected)
  808. return expected
  809. def _is_temporal_supported(self, opname, pa_dtype):
  810. return (
  811. (
  812. opname in ("__add__", "__radd__")
  813. or (
  814. opname
  815. in ("__truediv__", "__rtruediv__", "__floordiv__", "__rfloordiv__")
  816. and not pa_version_under14p0
  817. )
  818. )
  819. and pa.types.is_duration(pa_dtype)
  820. or opname in ("__sub__", "__rsub__")
  821. and pa.types.is_temporal(pa_dtype)
  822. )
  823. def _get_expected_exception(
  824. self, op_name: str, obj, other
  825. ) -> type[Exception] | tuple[type[Exception], ...] | None:
  826. if op_name in ("__divmod__", "__rdivmod__"):
  827. return (NotImplementedError, TypeError)
  828. exc: type[Exception] | tuple[type[Exception], ...] | None
  829. dtype = tm.get_dtype(obj)
  830. # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no
  831. # attribute "pyarrow_dtype"
  832. pa_dtype = dtype.pyarrow_dtype # type: ignore[union-attr]
  833. arrow_temporal_supported = self._is_temporal_supported(op_name, pa_dtype)
  834. if op_name in {
  835. "__mod__",
  836. "__rmod__",
  837. }:
  838. exc = (NotImplementedError, TypeError)
  839. elif arrow_temporal_supported:
  840. exc = None
  841. elif op_name in ["__add__", "__radd__"] and (
  842. pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
  843. ):
  844. exc = None
  845. elif not (
  846. pa.types.is_floating(pa_dtype)
  847. or pa.types.is_integer(pa_dtype)
  848. or pa.types.is_decimal(pa_dtype)
  849. ):
  850. exc = TypeError
  851. else:
  852. exc = None
  853. return exc
  854. def _get_arith_xfail_marker(self, opname, pa_dtype):
  855. mark = None
  856. arrow_temporal_supported = self._is_temporal_supported(opname, pa_dtype)
  857. if opname == "__rpow__" and (
  858. pa.types.is_floating(pa_dtype)
  859. or pa.types.is_integer(pa_dtype)
  860. or pa.types.is_decimal(pa_dtype)
  861. ):
  862. mark = pytest.mark.xfail(
  863. reason=(
  864. f"GH#29997: 1**pandas.NA == 1 while 1**pyarrow.NA == NULL "
  865. f"for {pa_dtype}"
  866. )
  867. )
  868. elif arrow_temporal_supported and (
  869. pa.types.is_time(pa_dtype)
  870. or (
  871. opname
  872. in ("__truediv__", "__rtruediv__", "__floordiv__", "__rfloordiv__")
  873. and pa.types.is_duration(pa_dtype)
  874. )
  875. ):
  876. mark = pytest.mark.xfail(
  877. raises=TypeError,
  878. reason=(
  879. f"{opname} not supported between"
  880. f"pd.NA and {pa_dtype} Python scalar"
  881. ),
  882. )
  883. elif opname == "__rfloordiv__" and (
  884. pa.types.is_integer(pa_dtype) or pa.types.is_decimal(pa_dtype)
  885. ):
  886. mark = pytest.mark.xfail(
  887. raises=pa.ArrowInvalid,
  888. reason="divide by 0",
  889. )
  890. elif opname == "__rtruediv__" and pa.types.is_decimal(pa_dtype):
  891. mark = pytest.mark.xfail(
  892. raises=pa.ArrowInvalid,
  893. reason="divide by 0",
  894. )
  895. return mark
  896. def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request):
  897. pa_dtype = data.dtype.pyarrow_dtype
  898. if all_arithmetic_operators == "__rmod__" and pa.types.is_binary(pa_dtype):
  899. pytest.skip("Skip testing Python string formatting")
  900. mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
  901. if mark is not None:
  902. request.applymarker(mark)
  903. super().test_arith_series_with_scalar(data, all_arithmetic_operators)
  904. def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
  905. pa_dtype = data.dtype.pyarrow_dtype
  906. if all_arithmetic_operators == "__rmod__" and (
  907. pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
  908. ):
  909. pytest.skip("Skip testing Python string formatting")
  910. mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
  911. if mark is not None:
  912. request.applymarker(mark)
  913. super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
  914. def test_arith_series_with_array(self, data, all_arithmetic_operators, request):
  915. pa_dtype = data.dtype.pyarrow_dtype
  916. if all_arithmetic_operators in (
  917. "__sub__",
  918. "__rsub__",
  919. ) and pa.types.is_unsigned_integer(pa_dtype):
  920. request.applymarker(
  921. pytest.mark.xfail(
  922. raises=pa.ArrowInvalid,
  923. reason=(
  924. f"Implemented pyarrow.compute.subtract_checked "
  925. f"which raises on overflow for {pa_dtype}"
  926. ),
  927. )
  928. )
  929. mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
  930. if mark is not None:
  931. request.applymarker(mark)
  932. op_name = all_arithmetic_operators
  933. ser = pd.Series(data)
  934. # pd.Series([ser.iloc[0]] * len(ser)) may not return ArrowExtensionArray
  935. # since ser.iloc[0] is a python scalar
  936. other = pd.Series(pd.array([ser.iloc[0]] * len(ser), dtype=data.dtype))
  937. self.check_opname(ser, op_name, other)
  938. def test_add_series_with_extension_array(self, data, request):
  939. pa_dtype = data.dtype.pyarrow_dtype
  940. if pa_dtype.equals("int8"):
  941. request.applymarker(
  942. pytest.mark.xfail(
  943. raises=pa.ArrowInvalid,
  944. reason=f"raises on overflow for {pa_dtype}",
  945. )
  946. )
  947. super().test_add_series_with_extension_array(data)
  948. def test_invalid_other_comp(self, data, comparison_op):
  949. # GH 48833
  950. with pytest.raises(
  951. NotImplementedError, match=".* not implemented for <class 'object'>"
  952. ):
  953. comparison_op(data, object())
  954. @pytest.mark.parametrize("masked_dtype", ["boolean", "Int64", "Float64"])
  955. def test_comp_masked_numpy(self, masked_dtype, comparison_op):
  956. # GH 52625
  957. data = [1, 0, None]
  958. ser_masked = pd.Series(data, dtype=masked_dtype)
  959. ser_pa = pd.Series(data, dtype=f"{masked_dtype.lower()}[pyarrow]")
  960. result = comparison_op(ser_pa, ser_masked)
  961. if comparison_op in [operator.lt, operator.gt, operator.ne]:
  962. exp = [False, False, None]
  963. else:
  964. exp = [True, True, None]
  965. expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
  966. tm.assert_series_equal(result, expected)
  967. class TestLogicalOps:
  968. """Various Series and DataFrame logical ops methods."""
  969. def test_kleene_or(self):
  970. a = pd.Series([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean[pyarrow]")
  971. b = pd.Series([True, False, None] * 3, dtype="boolean[pyarrow]")
  972. result = a | b
  973. expected = pd.Series(
  974. [True, True, True, True, False, None, True, None, None],
  975. dtype="boolean[pyarrow]",
  976. )
  977. tm.assert_series_equal(result, expected)
  978. result = b | a
  979. tm.assert_series_equal(result, expected)
  980. # ensure we haven't mutated anything inplace
  981. tm.assert_series_equal(
  982. a,
  983. pd.Series([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean[pyarrow]"),
  984. )
  985. tm.assert_series_equal(
  986. b, pd.Series([True, False, None] * 3, dtype="boolean[pyarrow]")
  987. )
  988. @pytest.mark.parametrize(
  989. "other, expected",
  990. [
  991. (None, [True, None, None]),
  992. (pd.NA, [True, None, None]),
  993. (True, [True, True, True]),
  994. (np.bool_(True), [True, True, True]),
  995. (False, [True, False, None]),
  996. (np.bool_(False), [True, False, None]),
  997. ],
  998. )
  999. def test_kleene_or_scalar(self, other, expected):
  1000. a = pd.Series([True, False, None], dtype="boolean[pyarrow]")
  1001. result = a | other
  1002. expected = pd.Series(expected, dtype="boolean[pyarrow]")
  1003. tm.assert_series_equal(result, expected)
  1004. result = other | a
  1005. tm.assert_series_equal(result, expected)
  1006. # ensure we haven't mutated anything inplace
  1007. tm.assert_series_equal(
  1008. a, pd.Series([True, False, None], dtype="boolean[pyarrow]")
  1009. )
  1010. def test_kleene_and(self):
  1011. a = pd.Series([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean[pyarrow]")
  1012. b = pd.Series([True, False, None] * 3, dtype="boolean[pyarrow]")
  1013. result = a & b
  1014. expected = pd.Series(
  1015. [True, False, None, False, False, False, None, False, None],
  1016. dtype="boolean[pyarrow]",
  1017. )
  1018. tm.assert_series_equal(result, expected)
  1019. result = b & a
  1020. tm.assert_series_equal(result, expected)
  1021. # ensure we haven't mutated anything inplace
  1022. tm.assert_series_equal(
  1023. a,
  1024. pd.Series([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean[pyarrow]"),
  1025. )
  1026. tm.assert_series_equal(
  1027. b, pd.Series([True, False, None] * 3, dtype="boolean[pyarrow]")
  1028. )
  1029. @pytest.mark.parametrize(
  1030. "other, expected",
  1031. [
  1032. (None, [None, False, None]),
  1033. (pd.NA, [None, False, None]),
  1034. (True, [True, False, None]),
  1035. (False, [False, False, False]),
  1036. (np.bool_(True), [True, False, None]),
  1037. (np.bool_(False), [False, False, False]),
  1038. ],
  1039. )
  1040. def test_kleene_and_scalar(self, other, expected):
  1041. a = pd.Series([True, False, None], dtype="boolean[pyarrow]")
  1042. result = a & other
  1043. expected = pd.Series(expected, dtype="boolean[pyarrow]")
  1044. tm.assert_series_equal(result, expected)
  1045. result = other & a
  1046. tm.assert_series_equal(result, expected)
  1047. # ensure we haven't mutated anything inplace
  1048. tm.assert_series_equal(
  1049. a, pd.Series([True, False, None], dtype="boolean[pyarrow]")
  1050. )
  1051. def test_kleene_xor(self):
  1052. a = pd.Series([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean[pyarrow]")
  1053. b = pd.Series([True, False, None] * 3, dtype="boolean[pyarrow]")
  1054. result = a ^ b
  1055. expected = pd.Series(
  1056. [False, True, None, True, False, None, None, None, None],
  1057. dtype="boolean[pyarrow]",
  1058. )
  1059. tm.assert_series_equal(result, expected)
  1060. result = b ^ a
  1061. tm.assert_series_equal(result, expected)
  1062. # ensure we haven't mutated anything inplace
  1063. tm.assert_series_equal(
  1064. a,
  1065. pd.Series([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean[pyarrow]"),
  1066. )
  1067. tm.assert_series_equal(
  1068. b, pd.Series([True, False, None] * 3, dtype="boolean[pyarrow]")
  1069. )
  1070. @pytest.mark.parametrize(
  1071. "other, expected",
  1072. [
  1073. (None, [None, None, None]),
  1074. (pd.NA, [None, None, None]),
  1075. (True, [False, True, None]),
  1076. (np.bool_(True), [False, True, None]),
  1077. (np.bool_(False), [True, False, None]),
  1078. ],
  1079. )
  1080. def test_kleene_xor_scalar(self, other, expected):
  1081. a = pd.Series([True, False, None], dtype="boolean[pyarrow]")
  1082. result = a ^ other
  1083. expected = pd.Series(expected, dtype="boolean[pyarrow]")
  1084. tm.assert_series_equal(result, expected)
  1085. result = other ^ a
  1086. tm.assert_series_equal(result, expected)
  1087. # ensure we haven't mutated anything inplace
  1088. tm.assert_series_equal(
  1089. a, pd.Series([True, False, None], dtype="boolean[pyarrow]")
  1090. )
  1091. @pytest.mark.parametrize(
  1092. "op, exp",
  1093. [
  1094. ["__and__", True],
  1095. ["__or__", True],
  1096. ["__xor__", False],
  1097. ],
  1098. )
  1099. def test_logical_masked_numpy(self, op, exp):
  1100. # GH 52625
  1101. data = [True, False, None]
  1102. ser_masked = pd.Series(data, dtype="boolean")
  1103. ser_pa = pd.Series(data, dtype="boolean[pyarrow]")
  1104. result = getattr(ser_pa, op)(ser_masked)
  1105. expected = pd.Series([exp, False, None], dtype=ArrowDtype(pa.bool_()))
  1106. tm.assert_series_equal(result, expected)
  1107. @pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES)
  1108. def test_bitwise(pa_type):
  1109. # GH 54495
  1110. dtype = ArrowDtype(pa_type)
  1111. left = pd.Series([1, None, 3, 4], dtype=dtype)
  1112. right = pd.Series([None, 3, 5, 4], dtype=dtype)
  1113. result = left | right
  1114. expected = pd.Series([None, None, 3 | 5, 4 | 4], dtype=dtype)
  1115. tm.assert_series_equal(result, expected)
  1116. result = left & right
  1117. expected = pd.Series([None, None, 3 & 5, 4 & 4], dtype=dtype)
  1118. tm.assert_series_equal(result, expected)
  1119. result = left ^ right
  1120. expected = pd.Series([None, None, 3 ^ 5, 4 ^ 4], dtype=dtype)
  1121. tm.assert_series_equal(result, expected)
  1122. result = ~left
  1123. expected = ~(left.fillna(0).to_numpy())
  1124. expected = pd.Series(expected, dtype=dtype).mask(left.isnull())
  1125. tm.assert_series_equal(result, expected)
  1126. def test_arrowdtype_construct_from_string_type_with_unsupported_parameters():
  1127. with pytest.raises(NotImplementedError, match="Passing pyarrow type"):
  1128. ArrowDtype.construct_from_string("not_a_real_dype[s, tz=UTC][pyarrow]")
  1129. with pytest.raises(NotImplementedError, match="Passing pyarrow type"):
  1130. ArrowDtype.construct_from_string("decimal(7, 2)[pyarrow]")
  1131. def test_arrowdtype_construct_from_string_supports_dt64tz():
  1132. # as of GH#50689, timestamptz is supported
  1133. dtype = ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]")
  1134. expected = ArrowDtype(pa.timestamp("s", "UTC"))
  1135. assert dtype == expected
  1136. def test_arrowdtype_construct_from_string_type_only_one_pyarrow():
  1137. # GH#51225
  1138. invalid = "int64[pyarrow]foobar[pyarrow]"
  1139. msg = (
  1140. r"Passing pyarrow type specific parameters \(\[pyarrow\]\) in the "
  1141. r"string is not supported\."
  1142. )
  1143. with pytest.raises(NotImplementedError, match=msg):
  1144. pd.Series(range(3), dtype=invalid)
  1145. def test_arrow_string_multiplication():
  1146. # GH 56537
  1147. binary = pd.Series(["abc", "defg"], dtype=ArrowDtype(pa.string()))
  1148. repeat = pd.Series([2, -2], dtype="int64[pyarrow]")
  1149. result = binary * repeat
  1150. expected = pd.Series(["abcabc", ""], dtype=ArrowDtype(pa.string()))
  1151. tm.assert_series_equal(result, expected)
  1152. reflected_result = repeat * binary
  1153. tm.assert_series_equal(result, reflected_result)
  1154. def test_arrow_string_multiplication_scalar_repeat():
  1155. binary = pd.Series(["abc", "defg"], dtype=ArrowDtype(pa.string()))
  1156. result = binary * 2
  1157. expected = pd.Series(["abcabc", "defgdefg"], dtype=ArrowDtype(pa.string()))
  1158. tm.assert_series_equal(result, expected)
  1159. reflected_result = 2 * binary
  1160. tm.assert_series_equal(reflected_result, expected)
  1161. @pytest.mark.parametrize(
  1162. "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
  1163. )
  1164. @pytest.mark.parametrize("quantile", [0.5, [0.5, 0.5]])
  1165. def test_quantile(data, interpolation, quantile, request):
  1166. pa_dtype = data.dtype.pyarrow_dtype
  1167. data = data.take([0, 0, 0])
  1168. ser = pd.Series(data)
  1169. if (
  1170. pa.types.is_string(pa_dtype)
  1171. or pa.types.is_binary(pa_dtype)
  1172. or pa.types.is_boolean(pa_dtype)
  1173. ):
  1174. # For string, bytes, and bool, we don't *expect* to have quantile work
  1175. # Note this matches the non-pyarrow behavior
  1176. msg = r"Function 'quantile' has no kernel matching input types \(.*\)"
  1177. with pytest.raises(pa.ArrowNotImplementedError, match=msg):
  1178. ser.quantile(q=quantile, interpolation=interpolation)
  1179. return
  1180. if (
  1181. pa.types.is_integer(pa_dtype)
  1182. or pa.types.is_floating(pa_dtype)
  1183. or pa.types.is_decimal(pa_dtype)
  1184. ):
  1185. pass
  1186. elif pa.types.is_temporal(data._pa_array.type):
  1187. pass
  1188. else:
  1189. request.applymarker(
  1190. pytest.mark.xfail(
  1191. raises=pa.ArrowNotImplementedError,
  1192. reason=f"quantile not supported by pyarrow for {pa_dtype}",
  1193. )
  1194. )
  1195. data = data.take([0, 0, 0])
  1196. ser = pd.Series(data)
  1197. result = ser.quantile(q=quantile, interpolation=interpolation)
  1198. if pa.types.is_timestamp(pa_dtype) and interpolation not in ["lower", "higher"]:
  1199. # rounding error will make the check below fail
  1200. # (e.g. '2020-01-01 01:01:01.000001' vs '2020-01-01 01:01:01.000001024'),
  1201. # so we'll check for now that we match the numpy analogue
  1202. if pa_dtype.tz:
  1203. pd_dtype = f"M8[{pa_dtype.unit}, {pa_dtype.tz}]"
  1204. else:
  1205. pd_dtype = f"M8[{pa_dtype.unit}]"
  1206. ser_np = ser.astype(pd_dtype)
  1207. expected = ser_np.quantile(q=quantile, interpolation=interpolation)
  1208. if quantile == 0.5:
  1209. if pa_dtype.unit == "us":
  1210. expected = expected.to_pydatetime(warn=False)
  1211. assert result == expected
  1212. else:
  1213. if pa_dtype.unit == "us":
  1214. expected = expected.dt.floor("us")
  1215. tm.assert_series_equal(result, expected.astype(data.dtype))
  1216. return
  1217. if quantile == 0.5:
  1218. assert result == data[0]
  1219. else:
  1220. # Just check the values
  1221. expected = pd.Series(data.take([0, 0]), index=[0.5, 0.5])
  1222. if (
  1223. pa.types.is_integer(pa_dtype)
  1224. or pa.types.is_floating(pa_dtype)
  1225. or pa.types.is_decimal(pa_dtype)
  1226. ):
  1227. expected = expected.astype("float64[pyarrow]")
  1228. result = result.astype("float64[pyarrow]")
  1229. tm.assert_series_equal(result, expected)
  1230. @pytest.mark.parametrize(
  1231. "take_idx, exp_idx",
  1232. [[[0, 0, 2, 2, 4, 4], [4, 0]], [[0, 0, 0, 2, 4, 4], [0]]],
  1233. ids=["multi_mode", "single_mode"],
  1234. )
  1235. def test_mode_dropna_true(data_for_grouping, take_idx, exp_idx):
  1236. data = data_for_grouping.take(take_idx)
  1237. ser = pd.Series(data)
  1238. result = ser.mode(dropna=True)
  1239. expected = pd.Series(data_for_grouping.take(exp_idx))
  1240. tm.assert_series_equal(result, expected)
  1241. def test_mode_dropna_false_mode_na(data):
  1242. # GH 50982
  1243. more_nans = pd.Series([None, None, data[0]], dtype=data.dtype)
  1244. result = more_nans.mode(dropna=False)
  1245. expected = pd.Series([None], dtype=data.dtype)
  1246. tm.assert_series_equal(result, expected)
  1247. expected = pd.Series([data[0], None], dtype=data.dtype)
  1248. result = expected.mode(dropna=False)
  1249. tm.assert_series_equal(result, expected)
  1250. @pytest.mark.parametrize(
  1251. "arrow_dtype, expected_type",
  1252. [
  1253. [pa.binary(), bytes],
  1254. [pa.binary(16), bytes],
  1255. [pa.large_binary(), bytes],
  1256. [pa.large_string(), str],
  1257. [pa.list_(pa.int64()), list],
  1258. [pa.large_list(pa.int64()), list],
  1259. [pa.map_(pa.string(), pa.int64()), list],
  1260. [pa.struct([("f1", pa.int8()), ("f2", pa.string())]), dict],
  1261. [pa.dictionary(pa.int64(), pa.int64()), CategoricalDtypeType],
  1262. ],
  1263. )
  1264. def test_arrow_dtype_type(arrow_dtype, expected_type):
  1265. # GH 51845
  1266. # TODO: Redundant with test_getitem_scalar once arrow_dtype exists in data fixture
  1267. assert ArrowDtype(arrow_dtype).type == expected_type
  1268. def test_is_bool_dtype():
  1269. # GH 22667
  1270. data = ArrowExtensionArray(pa.array([True, False, True]))
  1271. assert is_bool_dtype(data)
  1272. assert pd.core.common.is_bool_indexer(data)
  1273. s = pd.Series(range(len(data)))
  1274. result = s[data]
  1275. expected = s[np.asarray(data)]
  1276. tm.assert_series_equal(result, expected)
  1277. def test_is_numeric_dtype(data):
  1278. # GH 50563
  1279. pa_type = data.dtype.pyarrow_dtype
  1280. if (
  1281. pa.types.is_floating(pa_type)
  1282. or pa.types.is_integer(pa_type)
  1283. or pa.types.is_decimal(pa_type)
  1284. ):
  1285. assert is_numeric_dtype(data)
  1286. else:
  1287. assert not is_numeric_dtype(data)
  1288. def test_is_integer_dtype(data):
  1289. # GH 50667
  1290. pa_type = data.dtype.pyarrow_dtype
  1291. if pa.types.is_integer(pa_type):
  1292. assert is_integer_dtype(data)
  1293. else:
  1294. assert not is_integer_dtype(data)
  1295. def test_is_signed_integer_dtype(data):
  1296. pa_type = data.dtype.pyarrow_dtype
  1297. if pa.types.is_signed_integer(pa_type):
  1298. assert is_signed_integer_dtype(data)
  1299. else:
  1300. assert not is_signed_integer_dtype(data)
  1301. def test_is_unsigned_integer_dtype(data):
  1302. pa_type = data.dtype.pyarrow_dtype
  1303. if pa.types.is_unsigned_integer(pa_type):
  1304. assert is_unsigned_integer_dtype(data)
  1305. else:
  1306. assert not is_unsigned_integer_dtype(data)
  1307. def test_is_float_dtype(data):
  1308. pa_type = data.dtype.pyarrow_dtype
  1309. if pa.types.is_floating(pa_type):
  1310. assert is_float_dtype(data)
  1311. else:
  1312. assert not is_float_dtype(data)
  1313. def test_pickle_roundtrip(data):
  1314. # GH 42600
  1315. expected = pd.Series(data)
  1316. expected_sliced = expected.head(2)
  1317. full_pickled = pickle.dumps(expected)
  1318. sliced_pickled = pickle.dumps(expected_sliced)
  1319. assert len(full_pickled) > len(sliced_pickled)
  1320. result = pickle.loads(full_pickled)
  1321. tm.assert_series_equal(result, expected)
  1322. result_sliced = pickle.loads(sliced_pickled)
  1323. tm.assert_series_equal(result_sliced, expected_sliced)
  1324. def test_astype_from_non_pyarrow(data):
  1325. # GH49795
  1326. pd_array = data._pa_array.to_pandas().array
  1327. result = pd_array.astype(data.dtype)
  1328. assert not isinstance(pd_array.dtype, ArrowDtype)
  1329. assert isinstance(result.dtype, ArrowDtype)
  1330. tm.assert_extension_array_equal(result, data)
  1331. def test_astype_float_from_non_pyarrow_str():
  1332. # GH50430
  1333. ser = pd.Series(["1.0"])
  1334. result = ser.astype("float64[pyarrow]")
  1335. expected = pd.Series([1.0], dtype="float64[pyarrow]")
  1336. tm.assert_series_equal(result, expected)
  1337. def test_astype_errors_ignore():
  1338. # GH 55399
  1339. expected = pd.DataFrame({"col": [17000000]}, dtype="int32[pyarrow]")
  1340. result = expected.astype("float[pyarrow]", errors="ignore")
  1341. tm.assert_frame_equal(result, expected)
  1342. def test_to_numpy_with_defaults(data):
  1343. # GH49973
  1344. result = data.to_numpy()
  1345. pa_type = data._pa_array.type
  1346. if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type):
  1347. pytest.skip("Tested in test_to_numpy_temporal")
  1348. elif pa.types.is_date(pa_type):
  1349. expected = np.array(list(data))
  1350. else:
  1351. expected = np.array(data._pa_array)
  1352. if data._hasna and not is_numeric_dtype(data.dtype):
  1353. expected = expected.astype(object)
  1354. expected[pd.isna(data)] = pd.NA
  1355. tm.assert_numpy_array_equal(result, expected)
  1356. def test_to_numpy_int_with_na():
  1357. # GH51227: ensure to_numpy does not convert int to float
  1358. data = [1, None]
  1359. arr = pd.array(data, dtype="int64[pyarrow]")
  1360. result = arr.to_numpy()
  1361. expected = np.array([1, np.nan])
  1362. assert isinstance(result[0], float)
  1363. tm.assert_numpy_array_equal(result, expected)
  1364. @pytest.mark.parametrize("na_val, exp", [(lib.no_default, np.nan), (1, 1)])
  1365. def test_to_numpy_null_array(na_val, exp):
  1366. # GH#52443
  1367. arr = pd.array([pd.NA, pd.NA], dtype="null[pyarrow]")
  1368. result = arr.to_numpy(dtype="float64", na_value=na_val)
  1369. expected = np.array([exp] * 2, dtype="float64")
  1370. tm.assert_numpy_array_equal(result, expected)
  1371. def test_to_numpy_null_array_no_dtype():
  1372. # GH#52443
  1373. arr = pd.array([pd.NA, pd.NA], dtype="null[pyarrow]")
  1374. result = arr.to_numpy(dtype=None)
  1375. expected = np.array([pd.NA] * 2, dtype="object")
  1376. tm.assert_numpy_array_equal(result, expected)
  1377. def test_to_numpy_without_dtype():
  1378. # GH 54808
  1379. arr = pd.array([True, pd.NA], dtype="boolean[pyarrow]")
  1380. result = arr.to_numpy(na_value=False)
  1381. expected = np.array([True, False], dtype=np.bool_)
  1382. tm.assert_numpy_array_equal(result, expected)
  1383. arr = pd.array([1.0, pd.NA], dtype="float32[pyarrow]")
  1384. result = arr.to_numpy(na_value=0.0)
  1385. expected = np.array([1.0, 0.0], dtype=np.float32)
  1386. tm.assert_numpy_array_equal(result, expected)
  1387. def test_setitem_null_slice(data):
  1388. # GH50248
  1389. orig = data.copy()
  1390. result = orig.copy()
  1391. result[:] = data[0]
  1392. expected = ArrowExtensionArray._from_sequence(
  1393. [data[0]] * len(data),
  1394. dtype=data.dtype,
  1395. )
  1396. tm.assert_extension_array_equal(result, expected)
  1397. result = orig.copy()
  1398. result[:] = data[::-1]
  1399. expected = data[::-1]
  1400. tm.assert_extension_array_equal(result, expected)
  1401. result = orig.copy()
  1402. result[:] = data.tolist()
  1403. expected = data
  1404. tm.assert_extension_array_equal(result, expected)
  1405. def test_setitem_invalid_dtype(data):
  1406. # GH50248
  1407. pa_type = data._pa_array.type
  1408. if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
  1409. fill_value = 123
  1410. err = TypeError
  1411. msg = "Invalid value '123' for dtype"
  1412. elif (
  1413. pa.types.is_integer(pa_type)
  1414. or pa.types.is_floating(pa_type)
  1415. or pa.types.is_boolean(pa_type)
  1416. ):
  1417. fill_value = "foo"
  1418. err = pa.ArrowInvalid
  1419. msg = "Could not convert"
  1420. else:
  1421. fill_value = "foo"
  1422. err = TypeError
  1423. msg = "Invalid value 'foo' for dtype"
  1424. with pytest.raises(err, match=msg):
  1425. data[:] = fill_value
  1426. def test_from_arrow_respecting_given_dtype():
  1427. date_array = pa.array(
  1428. [pd.Timestamp("2019-12-31"), pd.Timestamp("2019-12-31")], type=pa.date32()
  1429. )
  1430. result = date_array.to_pandas(
  1431. types_mapper={pa.date32(): ArrowDtype(pa.date64())}.get
  1432. )
  1433. expected = pd.Series(
  1434. [pd.Timestamp("2019-12-31"), pd.Timestamp("2019-12-31")],
  1435. dtype=ArrowDtype(pa.date64()),
  1436. )
  1437. tm.assert_series_equal(result, expected)
  1438. def test_from_arrow_respecting_given_dtype_unsafe():
  1439. array = pa.array([1.5, 2.5], type=pa.float64())
  1440. with tm.external_error_raised(pa.ArrowInvalid):
  1441. array.to_pandas(types_mapper={pa.float64(): ArrowDtype(pa.int64())}.get)
  1442. def test_round():
  1443. dtype = "float64[pyarrow]"
  1444. ser = pd.Series([0.0, 1.23, 2.56, pd.NA], dtype=dtype)
  1445. result = ser.round(1)
  1446. expected = pd.Series([0.0, 1.2, 2.6, pd.NA], dtype=dtype)
  1447. tm.assert_series_equal(result, expected)
  1448. ser = pd.Series([123.4, pd.NA, 56.78], dtype=dtype)
  1449. result = ser.round(-1)
  1450. expected = pd.Series([120.0, pd.NA, 60.0], dtype=dtype)
  1451. tm.assert_series_equal(result, expected)
  1452. def test_searchsorted_with_na_raises(data_for_sorting, as_series):
  1453. # GH50447
  1454. b, c, a = data_for_sorting
  1455. arr = data_for_sorting.take([2, 0, 1]) # to get [a, b, c]
  1456. arr[-1] = pd.NA
  1457. if as_series:
  1458. arr = pd.Series(arr)
  1459. msg = (
  1460. "searchsorted requires array to be sorted, "
  1461. "which is impossible with NAs present."
  1462. )
  1463. with pytest.raises(ValueError, match=msg):
  1464. arr.searchsorted(b)
  1465. def test_sort_values_dictionary():
  1466. df = pd.DataFrame(
  1467. {
  1468. "a": pd.Series(
  1469. ["x", "y"], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.string()))
  1470. ),
  1471. "b": [1, 2],
  1472. },
  1473. )
  1474. expected = df.copy()
  1475. result = df.sort_values(by=["a", "b"])
  1476. tm.assert_frame_equal(result, expected)
  1477. @pytest.mark.parametrize("pat", ["abc", "a[a-z]{2}"])
  1478. def test_str_count(pat):
  1479. ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1480. result = ser.str.count(pat)
  1481. expected = pd.Series([1, None], dtype=ArrowDtype(pa.int32()))
  1482. tm.assert_series_equal(result, expected)
  1483. def test_str_count_flags_unsupported():
  1484. ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1485. with pytest.raises(NotImplementedError, match="count not"):
  1486. ser.str.count("abc", flags=1)
  1487. @pytest.mark.parametrize(
  1488. "side, str_func", [["left", "rjust"], ["right", "ljust"], ["both", "center"]]
  1489. )
  1490. def test_str_pad(side, str_func):
  1491. ser = pd.Series(["a", None], dtype=ArrowDtype(pa.string()))
  1492. result = ser.str.pad(width=3, side=side, fillchar="x")
  1493. expected = pd.Series(
  1494. [getattr("a", str_func)(3, "x"), None], dtype=ArrowDtype(pa.string())
  1495. )
  1496. tm.assert_series_equal(result, expected)
  1497. def test_str_pad_invalid_side():
  1498. ser = pd.Series(["a", None], dtype=ArrowDtype(pa.string()))
  1499. with pytest.raises(ValueError, match="Invalid side: foo"):
  1500. ser.str.pad(3, "foo", "x")
  1501. @pytest.mark.parametrize(
  1502. "pat, case, na, regex, exp",
  1503. [
  1504. ["ab", False, None, False, [True, None]],
  1505. ["Ab", True, None, False, [False, None]],
  1506. ["ab", False, True, False, [True, True]],
  1507. ["a[a-z]{1}", False, None, True, [True, None]],
  1508. ["A[a-z]{1}", True, None, True, [False, None]],
  1509. ],
  1510. )
  1511. def test_str_contains(pat, case, na, regex, exp):
  1512. ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1513. result = ser.str.contains(pat, case=case, na=na, regex=regex)
  1514. expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
  1515. tm.assert_series_equal(result, expected)
  1516. def test_str_contains_flags_unsupported():
  1517. ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1518. with pytest.raises(NotImplementedError, match="contains not"):
  1519. ser.str.contains("a", flags=1)
  1520. @pytest.mark.parametrize(
  1521. "side, pat, na, exp",
  1522. [
  1523. ["startswith", "ab", None, [True, None, False]],
  1524. ["startswith", "b", False, [False, False, False]],
  1525. ["endswith", "b", True, [False, True, False]],
  1526. ["endswith", "bc", None, [True, None, False]],
  1527. ["startswith", ("a", "e", "g"), None, [True, None, True]],
  1528. ["endswith", ("a", "c", "g"), None, [True, None, True]],
  1529. ["startswith", (), None, [False, None, False]],
  1530. ["endswith", (), None, [False, None, False]],
  1531. ],
  1532. )
  1533. def test_str_start_ends_with(side, pat, na, exp):
  1534. ser = pd.Series(["abc", None, "efg"], dtype=ArrowDtype(pa.string()))
  1535. result = getattr(ser.str, side)(pat, na=na)
  1536. expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
  1537. tm.assert_series_equal(result, expected)
  1538. @pytest.mark.parametrize("side", ("startswith", "endswith"))
  1539. def test_str_starts_ends_with_all_nulls_empty_tuple(side):
  1540. ser = pd.Series([None, None], dtype=ArrowDtype(pa.string()))
  1541. result = getattr(ser.str, side)(())
  1542. # bool datatype preserved for all nulls.
  1543. expected = pd.Series([None, None], dtype=ArrowDtype(pa.bool_()))
  1544. tm.assert_series_equal(result, expected)
  1545. @pytest.mark.parametrize(
  1546. "arg_name, arg",
  1547. [["pat", re.compile("b")], ["repl", str], ["case", False], ["flags", 1]],
  1548. )
  1549. def test_str_replace_unsupported(arg_name, arg):
  1550. ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1551. kwargs = {"pat": "b", "repl": "x", "regex": True}
  1552. kwargs[arg_name] = arg
  1553. with pytest.raises(NotImplementedError, match="replace is not supported"):
  1554. ser.str.replace(**kwargs)
  1555. @pytest.mark.parametrize(
  1556. "pat, repl, n, regex, exp",
  1557. [
  1558. ["a", "x", -1, False, ["xbxc", None]],
  1559. ["a", "x", 1, False, ["xbac", None]],
  1560. ["[a-b]", "x", -1, True, ["xxxc", None]],
  1561. ],
  1562. )
  1563. def test_str_replace(pat, repl, n, regex, exp):
  1564. ser = pd.Series(["abac", None], dtype=ArrowDtype(pa.string()))
  1565. result = ser.str.replace(pat, repl, n=n, regex=regex)
  1566. expected = pd.Series(exp, dtype=ArrowDtype(pa.string()))
  1567. tm.assert_series_equal(result, expected)
  1568. def test_str_replace_negative_n():
  1569. # GH 56404
  1570. ser = pd.Series(["abc", "aaaaaa"], dtype=ArrowDtype(pa.string()))
  1571. actual = ser.str.replace("a", "", -3, True)
  1572. expected = pd.Series(["bc", ""], dtype=ArrowDtype(pa.string()))
  1573. tm.assert_series_equal(expected, actual)
  1574. # Same bug for pyarrow-backed StringArray GH#59628
  1575. ser2 = ser.astype(pd.StringDtype(storage="pyarrow"))
  1576. actual2 = ser2.str.replace("a", "", -3, True)
  1577. expected2 = expected.astype(ser2.dtype)
  1578. tm.assert_series_equal(expected2, actual2)
  1579. ser3 = ser.astype(pd.StringDtype(storage="pyarrow", na_value=np.nan))
  1580. actual3 = ser3.str.replace("a", "", -3, True)
  1581. expected3 = expected.astype(ser3.dtype)
  1582. tm.assert_series_equal(expected3, actual3)
  1583. def test_str_repeat_unsupported():
  1584. ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1585. with pytest.raises(NotImplementedError, match="repeat is not"):
  1586. ser.str.repeat([1, 2])
  1587. def test_str_repeat():
  1588. ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1589. result = ser.str.repeat(2)
  1590. expected = pd.Series(["abcabc", None], dtype=ArrowDtype(pa.string()))
  1591. tm.assert_series_equal(result, expected)
  1592. @pytest.mark.parametrize(
  1593. "pat, case, na, exp",
  1594. [
  1595. ["ab", False, None, [True, None]],
  1596. ["Ab", True, None, [False, None]],
  1597. ["bc", True, None, [False, None]],
  1598. ["ab", False, True, [True, True]],
  1599. ["a[a-z]{1}", False, None, [True, None]],
  1600. ["A[a-z]{1}", True, None, [False, None]],
  1601. ],
  1602. )
  1603. def test_str_match(pat, case, na, exp):
  1604. ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1605. result = ser.str.match(pat, case=case, na=na)
  1606. expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
  1607. tm.assert_series_equal(result, expected)
  1608. @pytest.mark.parametrize(
  1609. "pat, case, na, exp",
  1610. # Note: keep cases in sync with
  1611. # pandas/tests/strings/test_find_replace.py::test_str_fullmatch_extra_cases
  1612. [
  1613. ["abc", False, None, [True, False, False, None]],
  1614. ["Abc", True, None, [False, False, False, None]],
  1615. ["bc", True, None, [False, False, False, None]],
  1616. ["ab", False, None, [False, False, False, None]],
  1617. ["a[a-z]{2}", False, None, [True, False, False, None]],
  1618. ["A[a-z]{1}", True, None, [False, False, False, None]],
  1619. # GH Issue: #56652
  1620. ["abc$", False, None, [True, False, False, None]],
  1621. ["abc\\$", False, None, [False, True, False, None]],
  1622. ["Abc$", True, None, [False, False, False, None]],
  1623. ["Abc\\$", True, None, [False, False, False, None]],
  1624. # https://github.com/pandas-dev/pandas/issues/61072
  1625. ["(abc)|(abx)", True, None, [True, False, False, None]],
  1626. ["((abc)|(abx))", True, None, [True, False, False, None]],
  1627. ],
  1628. )
  1629. def test_str_fullmatch(pat, case, na, exp):
  1630. ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string()))
  1631. result = ser.str.fullmatch(pat, case=case, na=na)
  1632. expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
  1633. tm.assert_series_equal(result, expected)
  1634. @pytest.mark.parametrize(
  1635. "sub, start, end, exp, exp_typ",
  1636. [["ab", 0, None, [0, None], pa.int32()], ["bc", 1, 3, [1, None], pa.int64()]],
  1637. )
  1638. def test_str_find(sub, start, end, exp, exp_typ):
  1639. ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1640. result = ser.str.find(sub, start=start, end=end)
  1641. expected = pd.Series(exp, dtype=ArrowDtype(exp_typ))
  1642. tm.assert_series_equal(result, expected)
  1643. def test_str_find_negative_start():
  1644. # GH 56411
  1645. ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1646. result = ser.str.find(sub="b", start=-1000, end=3)
  1647. expected = pd.Series([1, None], dtype=ArrowDtype(pa.int64()))
  1648. tm.assert_series_equal(result, expected)
  1649. def test_str_find_no_end():
  1650. ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1651. result = ser.str.find("ab", start=1)
  1652. expected = pd.Series([-1, None], dtype="int64[pyarrow]")
  1653. tm.assert_series_equal(result, expected)
  1654. def test_str_find_negative_start_negative_end():
  1655. # GH 56791
  1656. ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
  1657. result = ser.str.find(sub="d", start=-6, end=-3)
  1658. expected = pd.Series([3, None], dtype=ArrowDtype(pa.int64()))
  1659. tm.assert_series_equal(result, expected)
  1660. def test_str_find_large_start():
  1661. # GH 56791
  1662. ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
  1663. result = ser.str.find(sub="d", start=16)
  1664. expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64()))
  1665. tm.assert_series_equal(result, expected)
  1666. @pytest.mark.skipif(
  1667. pa_version_under13p0, reason="https://github.com/apache/arrow/issues/36311"
  1668. )
  1669. @pytest.mark.parametrize("start", [-15, -3, 0, 1, 15, None])
  1670. @pytest.mark.parametrize("end", [-15, -1, 0, 3, 15, None])
  1671. @pytest.mark.parametrize("sub", ["", "az", "abce", "a", "caa"])
  1672. def test_str_find_e2e(start, end, sub):
  1673. s = pd.Series(
  1674. ["abcaadef", "abc", "abcdeddefgj8292", "ab", "a", ""],
  1675. dtype=ArrowDtype(pa.string()),
  1676. )
  1677. object_series = s.astype(pd.StringDtype(storage="python"))
  1678. result = s.str.find(sub, start, end)
  1679. expected = object_series.str.find(sub, start, end).astype(result.dtype)
  1680. tm.assert_series_equal(result, expected)
  1681. arrow_str_series = s.astype(pd.StringDtype(storage="pyarrow"))
  1682. result2 = arrow_str_series.str.find(sub, start, end).astype(result.dtype)
  1683. tm.assert_series_equal(result2, expected)
  1684. def test_str_find_negative_start_negative_end_no_match():
  1685. # GH 56791
  1686. ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
  1687. result = ser.str.find(sub="d", start=-3, end=-6)
  1688. expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64()))
  1689. tm.assert_series_equal(result, expected)
  1690. @pytest.mark.parametrize(
  1691. "i, exp",
  1692. [
  1693. [1, ["b", "e", None]],
  1694. [-1, ["c", "e", None]],
  1695. [2, ["c", None, None]],
  1696. [-3, ["a", None, None]],
  1697. [4, [None, None, None]],
  1698. ],
  1699. )
  1700. def test_str_get(i, exp):
  1701. ser = pd.Series(["abc", "de", None], dtype=ArrowDtype(pa.string()))
  1702. result = ser.str.get(i)
  1703. expected = pd.Series(exp, dtype=ArrowDtype(pa.string()))
  1704. tm.assert_series_equal(result, expected)
  1705. @pytest.mark.xfail(
  1706. reason="TODO: StringMethods._validate should support Arrow list types",
  1707. raises=AttributeError,
  1708. )
  1709. def test_str_join():
  1710. ser = pd.Series(ArrowExtensionArray(pa.array([list("abc"), list("123"), None])))
  1711. result = ser.str.join("=")
  1712. expected = pd.Series(["a=b=c", "1=2=3", None], dtype=ArrowDtype(pa.string()))
  1713. tm.assert_series_equal(result, expected)
  1714. def test_str_join_string_type():
  1715. ser = pd.Series(ArrowExtensionArray(pa.array(["abc", "123", None])))
  1716. result = ser.str.join("=")
  1717. expected = pd.Series(["a=b=c", "1=2=3", None], dtype=ArrowDtype(pa.string()))
  1718. tm.assert_series_equal(result, expected)
  1719. @pytest.mark.parametrize(
  1720. "start, stop, step, exp",
  1721. [
  1722. [None, 2, None, ["ab", None]],
  1723. [None, 2, 1, ["ab", None]],
  1724. [1, 3, 1, ["bc", None]],
  1725. (None, None, -1, ["dcba", None]),
  1726. ],
  1727. )
  1728. def test_str_slice(start, stop, step, exp):
  1729. ser = pd.Series(["abcd", None], dtype=ArrowDtype(pa.string()))
  1730. result = ser.str.slice(start, stop, step)
  1731. expected = pd.Series(exp, dtype=ArrowDtype(pa.string()))
  1732. tm.assert_series_equal(result, expected)
  1733. @pytest.mark.parametrize(
  1734. "start, stop, repl, exp",
  1735. [
  1736. [1, 2, "x", ["axcd", None]],
  1737. [None, 2, "x", ["xcd", None]],
  1738. [None, 2, None, ["cd", None]],
  1739. ],
  1740. )
  1741. def test_str_slice_replace(start, stop, repl, exp):
  1742. ser = pd.Series(["abcd", None], dtype=ArrowDtype(pa.string()))
  1743. result = ser.str.slice_replace(start, stop, repl)
  1744. expected = pd.Series(exp, dtype=ArrowDtype(pa.string()))
  1745. tm.assert_series_equal(result, expected)
  1746. @pytest.mark.parametrize(
  1747. "value, method, exp",
  1748. [
  1749. ["a1c", "isalnum", True],
  1750. ["!|,", "isalnum", False],
  1751. ["aaa", "isalpha", True],
  1752. ["!!!", "isalpha", False],
  1753. ["٠", "isdecimal", True], # noqa: RUF001
  1754. ["~!", "isdecimal", False],
  1755. ["2", "isdigit", True],
  1756. ["~", "isdigit", False],
  1757. ["aaa", "islower", True],
  1758. ["aaA", "islower", False],
  1759. ["123", "isnumeric", True],
  1760. ["11I", "isnumeric", False],
  1761. [" ", "isspace", True],
  1762. ["", "isspace", False],
  1763. ["The That", "istitle", True],
  1764. ["the That", "istitle", False],
  1765. ["AAA", "isupper", True],
  1766. ["AAc", "isupper", False],
  1767. ],
  1768. )
  1769. def test_str_is_functions(value, method, exp):
  1770. ser = pd.Series([value, None], dtype=ArrowDtype(pa.string()))
  1771. result = getattr(ser.str, method)()
  1772. expected = pd.Series([exp, None], dtype=ArrowDtype(pa.bool_()))
  1773. tm.assert_series_equal(result, expected)
  1774. @pytest.mark.parametrize(
  1775. "method, exp",
  1776. [
  1777. ["capitalize", "Abc def"],
  1778. ["title", "Abc Def"],
  1779. ["swapcase", "AbC Def"],
  1780. ["lower", "abc def"],
  1781. ["upper", "ABC DEF"],
  1782. ["casefold", "abc def"],
  1783. ],
  1784. )
  1785. def test_str_transform_functions(method, exp):
  1786. ser = pd.Series(["aBc dEF", None], dtype=ArrowDtype(pa.string()))
  1787. result = getattr(ser.str, method)()
  1788. expected = pd.Series([exp, None], dtype=ArrowDtype(pa.string()))
  1789. tm.assert_series_equal(result, expected)
  1790. def test_str_len():
  1791. ser = pd.Series(["abcd", None], dtype=ArrowDtype(pa.string()))
  1792. result = ser.str.len()
  1793. expected = pd.Series([4, None], dtype=ArrowDtype(pa.int32()))
  1794. tm.assert_series_equal(result, expected)
  1795. @pytest.mark.parametrize(
  1796. "method, to_strip, val",
  1797. [
  1798. ["strip", None, " abc "],
  1799. ["strip", "x", "xabcx"],
  1800. ["lstrip", None, " abc"],
  1801. ["lstrip", "x", "xabc"],
  1802. ["rstrip", None, "abc "],
  1803. ["rstrip", "x", "abcx"],
  1804. ],
  1805. )
  1806. def test_str_strip(method, to_strip, val):
  1807. ser = pd.Series([val, None], dtype=ArrowDtype(pa.string()))
  1808. result = getattr(ser.str, method)(to_strip=to_strip)
  1809. expected = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1810. tm.assert_series_equal(result, expected)
  1811. @pytest.mark.parametrize("val", ["abc123", "abc"])
  1812. def test_str_removesuffix(val):
  1813. ser = pd.Series([val, None], dtype=ArrowDtype(pa.string()))
  1814. result = ser.str.removesuffix("123")
  1815. expected = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1816. tm.assert_series_equal(result, expected)
  1817. @pytest.mark.parametrize("val", ["123abc", "abc"])
  1818. def test_str_removeprefix(val):
  1819. ser = pd.Series([val, None], dtype=ArrowDtype(pa.string()))
  1820. result = ser.str.removeprefix("123")
  1821. expected = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1822. tm.assert_series_equal(result, expected)
  1823. @pytest.mark.parametrize("errors", ["ignore", "strict"])
  1824. @pytest.mark.parametrize(
  1825. "encoding, exp",
  1826. [
  1827. ["utf8", b"abc"],
  1828. ["utf32", b"\xff\xfe\x00\x00a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00"],
  1829. ],
  1830. )
  1831. def test_str_encode(errors, encoding, exp):
  1832. ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1833. result = ser.str.encode(encoding, errors)
  1834. expected = pd.Series([exp, None], dtype=ArrowDtype(pa.binary()))
  1835. tm.assert_series_equal(result, expected)
  1836. @pytest.mark.parametrize("flags", [0, 2])
  1837. def test_str_findall(flags):
  1838. ser = pd.Series(["abc", "efg", None], dtype=ArrowDtype(pa.string()))
  1839. result = ser.str.findall("b", flags=flags)
  1840. expected = pd.Series([["b"], [], None], dtype=ArrowDtype(pa.list_(pa.string())))
  1841. tm.assert_series_equal(result, expected)
  1842. @pytest.mark.parametrize("method", ["index", "rindex"])
  1843. @pytest.mark.parametrize(
  1844. "start, end",
  1845. [
  1846. [0, None],
  1847. [1, 4],
  1848. ],
  1849. )
  1850. def test_str_r_index(method, start, end):
  1851. ser = pd.Series(["abcba", None], dtype=ArrowDtype(pa.string()))
  1852. result = getattr(ser.str, method)("c", start, end)
  1853. expected = pd.Series([2, None], dtype=ArrowDtype(pa.int64()))
  1854. tm.assert_series_equal(result, expected)
  1855. with pytest.raises(ValueError, match="substring not found"):
  1856. getattr(ser.str, method)("foo", start, end)
  1857. @pytest.mark.parametrize("form", ["NFC", "NFKC"])
  1858. def test_str_normalize(form):
  1859. ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
  1860. result = ser.str.normalize(form)
  1861. expected = ser.copy()
  1862. tm.assert_series_equal(result, expected)
  1863. @pytest.mark.parametrize(
  1864. "start, end",
  1865. [
  1866. [0, None],
  1867. [1, 4],
  1868. ],
  1869. )
  1870. def test_str_rfind(start, end):
  1871. ser = pd.Series(["abcba", "foo", None], dtype=ArrowDtype(pa.string()))
  1872. result = ser.str.rfind("c", start, end)
  1873. expected = pd.Series([2, -1, None], dtype=ArrowDtype(pa.int64()))
  1874. tm.assert_series_equal(result, expected)
  1875. def test_str_translate():
  1876. ser = pd.Series(["abcba", None], dtype=ArrowDtype(pa.string()))
  1877. result = ser.str.translate({97: "b"})
  1878. expected = pd.Series(["bbcbb", None], dtype=ArrowDtype(pa.string()))
  1879. tm.assert_series_equal(result, expected)
  1880. def test_str_wrap():
  1881. ser = pd.Series(["abcba", None], dtype=ArrowDtype(pa.string()))
  1882. result = ser.str.wrap(3)
  1883. expected = pd.Series(["abc\nba", None], dtype=ArrowDtype(pa.string()))
  1884. tm.assert_series_equal(result, expected)
  1885. def test_get_dummies():
  1886. ser = pd.Series(["a|b", None, "a|c"], dtype=ArrowDtype(pa.string()))
  1887. result = ser.str.get_dummies()
  1888. expected = pd.DataFrame(
  1889. [[True, True, False], [False, False, False], [True, False, True]],
  1890. dtype=ArrowDtype(pa.bool_()),
  1891. columns=["a", "b", "c"],
  1892. )
  1893. tm.assert_frame_equal(result, expected)
  1894. def test_str_partition():
  1895. ser = pd.Series(["abcba", None], dtype=ArrowDtype(pa.string()))
  1896. result = ser.str.partition("b")
  1897. expected = pd.DataFrame(
  1898. [["a", "b", "cba"], [None, None, None]], dtype=ArrowDtype(pa.string())
  1899. )
  1900. tm.assert_frame_equal(result, expected)
  1901. result = ser.str.partition("b", expand=False)
  1902. expected = pd.Series(ArrowExtensionArray(pa.array([["a", "b", "cba"], None])))
  1903. tm.assert_series_equal(result, expected)
  1904. result = ser.str.rpartition("b")
  1905. expected = pd.DataFrame(
  1906. [["abc", "b", "a"], [None, None, None]], dtype=ArrowDtype(pa.string())
  1907. )
  1908. tm.assert_frame_equal(result, expected)
  1909. result = ser.str.rpartition("b", expand=False)
  1910. expected = pd.Series(ArrowExtensionArray(pa.array([["abc", "b", "a"], None])))
  1911. tm.assert_series_equal(result, expected)
  1912. @pytest.mark.parametrize("method", ["rsplit", "split"])
  1913. def test_str_split_pat_none(method):
  1914. # GH 56271
  1915. ser = pd.Series(["a1 cbc\nb", None], dtype=ArrowDtype(pa.string()))
  1916. result = getattr(ser.str, method)()
  1917. expected = pd.Series(ArrowExtensionArray(pa.array([["a1", "cbc", "b"], None])))
  1918. tm.assert_series_equal(result, expected)
  1919. def test_str_split():
  1920. # GH 52401
  1921. ser = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string()))
  1922. result = ser.str.split("c")
  1923. expected = pd.Series(
  1924. ArrowExtensionArray(pa.array([["a1", "b", "b"], ["a2", "b", "b"], None]))
  1925. )
  1926. tm.assert_series_equal(result, expected)
  1927. result = ser.str.split("c", n=1)
  1928. expected = pd.Series(
  1929. ArrowExtensionArray(pa.array([["a1", "bcb"], ["a2", "bcb"], None]))
  1930. )
  1931. tm.assert_series_equal(result, expected)
  1932. result = ser.str.split("[1-2]", regex=True)
  1933. expected = pd.Series(
  1934. ArrowExtensionArray(pa.array([["a", "cbcb"], ["a", "cbcb"], None]))
  1935. )
  1936. tm.assert_series_equal(result, expected)
  1937. result = ser.str.split("[1-2]", regex=True, expand=True)
  1938. expected = pd.DataFrame(
  1939. {
  1940. 0: ArrowExtensionArray(pa.array(["a", "a", None])),
  1941. 1: ArrowExtensionArray(pa.array(["cbcb", "cbcb", None])),
  1942. }
  1943. )
  1944. tm.assert_frame_equal(result, expected)
  1945. result = ser.str.split("1", expand=True)
  1946. expected = pd.DataFrame(
  1947. {
  1948. 0: ArrowExtensionArray(pa.array(["a", "a2cbcb", None])),
  1949. 1: ArrowExtensionArray(pa.array(["cbcb", None, None])),
  1950. }
  1951. )
  1952. tm.assert_frame_equal(result, expected)
  1953. def test_str_rsplit():
  1954. # GH 52401
  1955. ser = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string()))
  1956. result = ser.str.rsplit("c")
  1957. expected = pd.Series(
  1958. ArrowExtensionArray(pa.array([["a1", "b", "b"], ["a2", "b", "b"], None]))
  1959. )
  1960. tm.assert_series_equal(result, expected)
  1961. result = ser.str.rsplit("c", n=1)
  1962. expected = pd.Series(
  1963. ArrowExtensionArray(pa.array([["a1cb", "b"], ["a2cb", "b"], None]))
  1964. )
  1965. tm.assert_series_equal(result, expected)
  1966. result = ser.str.rsplit("c", n=1, expand=True)
  1967. expected = pd.DataFrame(
  1968. {
  1969. 0: ArrowExtensionArray(pa.array(["a1cb", "a2cb", None])),
  1970. 1: ArrowExtensionArray(pa.array(["b", "b", None])),
  1971. }
  1972. )
  1973. tm.assert_frame_equal(result, expected)
  1974. result = ser.str.rsplit("1", expand=True)
  1975. expected = pd.DataFrame(
  1976. {
  1977. 0: ArrowExtensionArray(pa.array(["a", "a2cbcb", None])),
  1978. 1: ArrowExtensionArray(pa.array(["cbcb", None, None])),
  1979. }
  1980. )
  1981. tm.assert_frame_equal(result, expected)
  1982. def test_str_extract_non_symbolic():
  1983. ser = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string()))
  1984. with pytest.raises(ValueError, match="pat=.* must contain a symbolic group name."):
  1985. ser.str.extract(r"[ab](\d)")
  1986. @pytest.mark.parametrize("expand", [True, False])
  1987. def test_str_extract(expand):
  1988. ser = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string()))
  1989. result = ser.str.extract(r"(?P<letter>[ab])(?P<digit>\d)", expand=expand)
  1990. expected = pd.DataFrame(
  1991. {
  1992. "letter": ArrowExtensionArray(pa.array(["a", "b", None])),
  1993. "digit": ArrowExtensionArray(pa.array(["1", "2", None])),
  1994. }
  1995. )
  1996. tm.assert_frame_equal(result, expected)
  1997. def test_str_extract_expand():
  1998. ser = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string()))
  1999. result = ser.str.extract(r"[ab](?P<digit>\d)", expand=True)
  2000. expected = pd.DataFrame(
  2001. {
  2002. "digit": ArrowExtensionArray(pa.array(["1", "2", None])),
  2003. }
  2004. )
  2005. tm.assert_frame_equal(result, expected)
  2006. result = ser.str.extract(r"[ab](?P<digit>\d)", expand=False)
  2007. expected = pd.Series(ArrowExtensionArray(pa.array(["1", "2", None])), name="digit")
  2008. tm.assert_series_equal(result, expected)
  2009. @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"])
  2010. def test_duration_from_strings_with_nat(unit):
  2011. # GH51175
  2012. strings = ["1000", "NaT"]
  2013. pa_type = pa.duration(unit)
  2014. result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa_type)
  2015. expected = ArrowExtensionArray(pa.array([1000, None], type=pa_type))
  2016. tm.assert_extension_array_equal(result, expected)
  2017. def test_unsupported_dt(data):
  2018. pa_dtype = data.dtype.pyarrow_dtype
  2019. if not pa.types.is_temporal(pa_dtype):
  2020. with pytest.raises(
  2021. AttributeError, match="Can only use .dt accessor with datetimelike values"
  2022. ):
  2023. pd.Series(data).dt
  2024. @pytest.mark.parametrize(
  2025. "prop, expected",
  2026. [
  2027. ["year", 2023],
  2028. ["day", 2],
  2029. ["day_of_week", 0],
  2030. ["dayofweek", 0],
  2031. ["weekday", 0],
  2032. ["day_of_year", 2],
  2033. ["dayofyear", 2],
  2034. ["hour", 3],
  2035. ["minute", 4],
  2036. ["is_leap_year", False],
  2037. ["microsecond", 5],
  2038. ["month", 1],
  2039. ["nanosecond", 6],
  2040. ["quarter", 1],
  2041. ["second", 7],
  2042. ["date", date(2023, 1, 2)],
  2043. ["time", time(3, 4, 7, 5)],
  2044. ],
  2045. )
  2046. def test_dt_properties(prop, expected):
  2047. ser = pd.Series(
  2048. [
  2049. pd.Timestamp(
  2050. year=2023,
  2051. month=1,
  2052. day=2,
  2053. hour=3,
  2054. minute=4,
  2055. second=7,
  2056. microsecond=5,
  2057. nanosecond=6,
  2058. ),
  2059. None,
  2060. ],
  2061. dtype=ArrowDtype(pa.timestamp("ns")),
  2062. )
  2063. result = getattr(ser.dt, prop)
  2064. exp_type = None
  2065. if isinstance(expected, date):
  2066. exp_type = pa.date32()
  2067. elif isinstance(expected, time):
  2068. exp_type = pa.time64("ns")
  2069. expected = pd.Series(ArrowExtensionArray(pa.array([expected, None], type=exp_type)))
  2070. tm.assert_series_equal(result, expected)
  2071. def test_dt_is_month_start_end():
  2072. ser = pd.Series(
  2073. [
  2074. datetime(year=2023, month=12, day=2, hour=3),
  2075. datetime(year=2023, month=1, day=1, hour=3),
  2076. datetime(year=2023, month=3, day=31, hour=3),
  2077. None,
  2078. ],
  2079. dtype=ArrowDtype(pa.timestamp("us")),
  2080. )
  2081. result = ser.dt.is_month_start
  2082. expected = pd.Series([False, True, False, None], dtype=ArrowDtype(pa.bool_()))
  2083. tm.assert_series_equal(result, expected)
  2084. result = ser.dt.is_month_end
  2085. expected = pd.Series([False, False, True, None], dtype=ArrowDtype(pa.bool_()))
  2086. tm.assert_series_equal(result, expected)
  2087. def test_dt_is_year_start_end():
  2088. ser = pd.Series(
  2089. [
  2090. datetime(year=2023, month=12, day=31, hour=3),
  2091. datetime(year=2023, month=1, day=1, hour=3),
  2092. datetime(year=2023, month=3, day=31, hour=3),
  2093. None,
  2094. ],
  2095. dtype=ArrowDtype(pa.timestamp("us")),
  2096. )
  2097. result = ser.dt.is_year_start
  2098. expected = pd.Series([False, True, False, None], dtype=ArrowDtype(pa.bool_()))
  2099. tm.assert_series_equal(result, expected)
  2100. result = ser.dt.is_year_end
  2101. expected = pd.Series([True, False, False, None], dtype=ArrowDtype(pa.bool_()))
  2102. tm.assert_series_equal(result, expected)
  2103. def test_dt_is_quarter_start_end():
  2104. ser = pd.Series(
  2105. [
  2106. datetime(year=2023, month=11, day=30, hour=3),
  2107. datetime(year=2023, month=1, day=1, hour=3),
  2108. datetime(year=2023, month=3, day=31, hour=3),
  2109. None,
  2110. ],
  2111. dtype=ArrowDtype(pa.timestamp("us")),
  2112. )
  2113. result = ser.dt.is_quarter_start
  2114. expected = pd.Series([False, True, False, None], dtype=ArrowDtype(pa.bool_()))
  2115. tm.assert_series_equal(result, expected)
  2116. result = ser.dt.is_quarter_end
  2117. expected = pd.Series([False, False, True, None], dtype=ArrowDtype(pa.bool_()))
  2118. tm.assert_series_equal(result, expected)
  2119. @pytest.mark.parametrize("method", ["days_in_month", "daysinmonth"])
  2120. def test_dt_days_in_month(method):
  2121. ser = pd.Series(
  2122. [
  2123. datetime(year=2023, month=3, day=30, hour=3),
  2124. datetime(year=2023, month=4, day=1, hour=3),
  2125. datetime(year=2023, month=2, day=3, hour=3),
  2126. None,
  2127. ],
  2128. dtype=ArrowDtype(pa.timestamp("us")),
  2129. )
  2130. result = getattr(ser.dt, method)
  2131. expected = pd.Series([31, 30, 28, None], dtype=ArrowDtype(pa.int64()))
  2132. tm.assert_series_equal(result, expected)
  2133. def test_dt_normalize():
  2134. ser = pd.Series(
  2135. [
  2136. datetime(year=2023, month=3, day=30),
  2137. datetime(year=2023, month=4, day=1, hour=3),
  2138. datetime(year=2023, month=2, day=3, hour=23, minute=59, second=59),
  2139. None,
  2140. ],
  2141. dtype=ArrowDtype(pa.timestamp("us")),
  2142. )
  2143. result = ser.dt.normalize()
  2144. expected = pd.Series(
  2145. [
  2146. datetime(year=2023, month=3, day=30),
  2147. datetime(year=2023, month=4, day=1),
  2148. datetime(year=2023, month=2, day=3),
  2149. None,
  2150. ],
  2151. dtype=ArrowDtype(pa.timestamp("us")),
  2152. )
  2153. tm.assert_series_equal(result, expected)
  2154. @pytest.mark.parametrize("unit", ["us", "ns"])
  2155. def test_dt_time_preserve_unit(unit):
  2156. ser = pd.Series(
  2157. [datetime(year=2023, month=1, day=2, hour=3), None],
  2158. dtype=ArrowDtype(pa.timestamp(unit)),
  2159. )
  2160. assert ser.dt.unit == unit
  2161. result = ser.dt.time
  2162. expected = pd.Series(
  2163. ArrowExtensionArray(pa.array([time(3, 0), None], type=pa.time64(unit)))
  2164. )
  2165. tm.assert_series_equal(result, expected)
  2166. @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
  2167. def test_dt_tz(tz):
  2168. ser = pd.Series(
  2169. [datetime(year=2023, month=1, day=2, hour=3), None],
  2170. dtype=ArrowDtype(pa.timestamp("ns", tz=tz)),
  2171. )
  2172. result = ser.dt.tz
  2173. assert result == timezones.maybe_get_tz(tz)
  2174. def test_dt_isocalendar():
  2175. ser = pd.Series(
  2176. [datetime(year=2023, month=1, day=2, hour=3), None],
  2177. dtype=ArrowDtype(pa.timestamp("ns")),
  2178. )
  2179. result = ser.dt.isocalendar()
  2180. expected = pd.DataFrame(
  2181. [[2023, 1, 1], [0, 0, 0]],
  2182. columns=["year", "week", "day"],
  2183. dtype="int64[pyarrow]",
  2184. )
  2185. tm.assert_frame_equal(result, expected)
  2186. @pytest.mark.parametrize(
  2187. "method, exp", [["day_name", "Sunday"], ["month_name", "January"]]
  2188. )
  2189. def test_dt_day_month_name(method, exp, request):
  2190. # GH 52388
  2191. _require_timezone_database(request)
  2192. ser = pd.Series([datetime(2023, 1, 1), None], dtype=ArrowDtype(pa.timestamp("ms")))
  2193. result = getattr(ser.dt, method)()
  2194. expected = pd.Series([exp, None], dtype=ArrowDtype(pa.string()))
  2195. tm.assert_series_equal(result, expected)
  2196. def test_dt_strftime(request):
  2197. _require_timezone_database(request)
  2198. ser = pd.Series(
  2199. [datetime(year=2023, month=1, day=2, hour=3), None],
  2200. dtype=ArrowDtype(pa.timestamp("ns")),
  2201. )
  2202. result = ser.dt.strftime("%Y-%m-%dT%H:%M:%S")
  2203. expected = pd.Series(
  2204. ["2023-01-02T03:00:00.000000000", None], dtype=ArrowDtype(pa.string())
  2205. )
  2206. tm.assert_series_equal(result, expected)
  2207. @pytest.mark.parametrize("method", ["ceil", "floor", "round"])
  2208. def test_dt_roundlike_tz_options_not_supported(method):
  2209. ser = pd.Series(
  2210. [datetime(year=2023, month=1, day=2, hour=3), None],
  2211. dtype=ArrowDtype(pa.timestamp("ns")),
  2212. )
  2213. with pytest.raises(NotImplementedError, match="ambiguous is not supported."):
  2214. getattr(ser.dt, method)("1h", ambiguous="NaT")
  2215. with pytest.raises(NotImplementedError, match="nonexistent is not supported."):
  2216. getattr(ser.dt, method)("1h", nonexistent="NaT")
  2217. @pytest.mark.parametrize("method", ["ceil", "floor", "round"])
  2218. def test_dt_roundlike_unsupported_freq(method):
  2219. ser = pd.Series(
  2220. [datetime(year=2023, month=1, day=2, hour=3), None],
  2221. dtype=ArrowDtype(pa.timestamp("ns")),
  2222. )
  2223. with pytest.raises(ValueError, match="freq='1B' is not supported"):
  2224. getattr(ser.dt, method)("1B")
  2225. with pytest.raises(ValueError, match="Must specify a valid frequency: None"):
  2226. getattr(ser.dt, method)(None)
  2227. @pytest.mark.parametrize("freq", ["D", "h", "min", "s", "ms", "us", "ns"])
  2228. @pytest.mark.parametrize("method", ["ceil", "floor", "round"])
  2229. def test_dt_ceil_year_floor(freq, method):
  2230. ser = pd.Series(
  2231. [datetime(year=2023, month=1, day=1), None],
  2232. )
  2233. pa_dtype = ArrowDtype(pa.timestamp("ns"))
  2234. expected = getattr(ser.dt, method)(f"1{freq}").astype(pa_dtype)
  2235. result = getattr(ser.astype(pa_dtype).dt, method)(f"1{freq}")
  2236. tm.assert_series_equal(result, expected)
  2237. def test_dt_to_pydatetime():
  2238. # GH 51859
  2239. data = [datetime(2022, 1, 1), datetime(2023, 1, 1)]
  2240. ser = pd.Series(data, dtype=ArrowDtype(pa.timestamp("ns")))
  2241. msg = "The behavior of ArrowTemporalProperties.to_pydatetime is deprecated"
  2242. with tm.assert_produces_warning(FutureWarning, match=msg):
  2243. result = ser.dt.to_pydatetime()
  2244. expected = np.array(data, dtype=object)
  2245. tm.assert_numpy_array_equal(result, expected)
  2246. assert all(type(res) is datetime for res in result)
  2247. msg = "The behavior of DatetimeProperties.to_pydatetime is deprecated"
  2248. with tm.assert_produces_warning(FutureWarning, match=msg):
  2249. expected = ser.astype("datetime64[ns]").dt.to_pydatetime()
  2250. tm.assert_numpy_array_equal(result, expected)
  2251. @pytest.mark.parametrize("date_type", [32, 64])
  2252. def test_dt_to_pydatetime_date_error(date_type):
  2253. # GH 52812
  2254. ser = pd.Series(
  2255. [date(2022, 12, 31)],
  2256. dtype=ArrowDtype(getattr(pa, f"date{date_type}")()),
  2257. )
  2258. msg = "The behavior of ArrowTemporalProperties.to_pydatetime is deprecated"
  2259. with tm.assert_produces_warning(FutureWarning, match=msg):
  2260. with pytest.raises(ValueError, match="to_pydatetime cannot be called with"):
  2261. ser.dt.to_pydatetime()
  2262. def test_dt_tz_localize_unsupported_tz_options():
  2263. ser = pd.Series(
  2264. [datetime(year=2023, month=1, day=2, hour=3), None],
  2265. dtype=ArrowDtype(pa.timestamp("ns")),
  2266. )
  2267. with pytest.raises(NotImplementedError, match="ambiguous='NaT' is not supported"):
  2268. ser.dt.tz_localize("UTC", ambiguous="NaT")
  2269. with pytest.raises(NotImplementedError, match="nonexistent='NaT' is not supported"):
  2270. ser.dt.tz_localize("UTC", nonexistent="NaT")
  2271. def test_dt_tz_localize_none():
  2272. ser = pd.Series(
  2273. [datetime(year=2023, month=1, day=2, hour=3), None],
  2274. dtype=ArrowDtype(pa.timestamp("ns", tz="US/Pacific")),
  2275. )
  2276. result = ser.dt.tz_localize(None)
  2277. expected = pd.Series(
  2278. [datetime(year=2023, month=1, day=2, hour=3), None],
  2279. dtype=ArrowDtype(pa.timestamp("ns")),
  2280. )
  2281. tm.assert_series_equal(result, expected)
  2282. @pytest.mark.parametrize("unit", ["us", "ns"])
  2283. def test_dt_tz_localize(unit, request):
  2284. _require_timezone_database(request)
  2285. ser = pd.Series(
  2286. [datetime(year=2023, month=1, day=2, hour=3), None],
  2287. dtype=ArrowDtype(pa.timestamp(unit)),
  2288. )
  2289. result = ser.dt.tz_localize("US/Pacific")
  2290. exp_data = pa.array(
  2291. [datetime(year=2023, month=1, day=2, hour=3), None], type=pa.timestamp(unit)
  2292. )
  2293. exp_data = pa.compute.assume_timezone(exp_data, "US/Pacific")
  2294. expected = pd.Series(ArrowExtensionArray(exp_data))
  2295. tm.assert_series_equal(result, expected)
  2296. @pytest.mark.parametrize(
  2297. "nonexistent, exp_date",
  2298. [
  2299. ["shift_forward", datetime(year=2023, month=3, day=12, hour=3)],
  2300. ["shift_backward", pd.Timestamp("2023-03-12 01:59:59.999999999")],
  2301. ],
  2302. )
  2303. def test_dt_tz_localize_nonexistent(nonexistent, exp_date, request):
  2304. _require_timezone_database(request)
  2305. ser = pd.Series(
  2306. [datetime(year=2023, month=3, day=12, hour=2, minute=30), None],
  2307. dtype=ArrowDtype(pa.timestamp("ns")),
  2308. )
  2309. result = ser.dt.tz_localize("US/Pacific", nonexistent=nonexistent)
  2310. exp_data = pa.array([exp_date, None], type=pa.timestamp("ns"))
  2311. exp_data = pa.compute.assume_timezone(exp_data, "US/Pacific")
  2312. expected = pd.Series(ArrowExtensionArray(exp_data))
  2313. tm.assert_series_equal(result, expected)
  2314. def test_dt_tz_convert_not_tz_raises():
  2315. ser = pd.Series(
  2316. [datetime(year=2023, month=1, day=2, hour=3), None],
  2317. dtype=ArrowDtype(pa.timestamp("ns")),
  2318. )
  2319. with pytest.raises(TypeError, match="Cannot convert tz-naive timestamps"):
  2320. ser.dt.tz_convert("UTC")
  2321. def test_dt_tz_convert_none():
  2322. ser = pd.Series(
  2323. [datetime(year=2023, month=1, day=2, hour=3), None],
  2324. dtype=ArrowDtype(pa.timestamp("ns", "US/Pacific")),
  2325. )
  2326. result = ser.dt.tz_convert(None)
  2327. expected = pd.Series(
  2328. [datetime(year=2023, month=1, day=2, hour=3), None],
  2329. dtype=ArrowDtype(pa.timestamp("ns")),
  2330. )
  2331. tm.assert_series_equal(result, expected)
  2332. @pytest.mark.parametrize("unit", ["us", "ns"])
  2333. def test_dt_tz_convert(unit):
  2334. ser = pd.Series(
  2335. [datetime(year=2023, month=1, day=2, hour=3), None],
  2336. dtype=ArrowDtype(pa.timestamp(unit, "US/Pacific")),
  2337. )
  2338. result = ser.dt.tz_convert("US/Eastern")
  2339. expected = pd.Series(
  2340. [datetime(year=2023, month=1, day=2, hour=3), None],
  2341. dtype=ArrowDtype(pa.timestamp(unit, "US/Eastern")),
  2342. )
  2343. tm.assert_series_equal(result, expected)
  2344. @pytest.mark.parametrize("dtype", ["timestamp[ms][pyarrow]", "duration[ms][pyarrow]"])
  2345. def test_as_unit(dtype):
  2346. # GH 52284
  2347. ser = pd.Series([1000, None], dtype=dtype)
  2348. result = ser.dt.as_unit("ns")
  2349. expected = ser.astype(dtype.replace("ms", "ns"))
  2350. tm.assert_series_equal(result, expected)
  2351. @pytest.mark.parametrize(
  2352. "prop, expected",
  2353. [
  2354. ["days", 1],
  2355. ["seconds", 2],
  2356. ["microseconds", 3],
  2357. ["nanoseconds", 4],
  2358. ],
  2359. )
  2360. def test_dt_timedelta_properties(prop, expected):
  2361. # GH 52284
  2362. ser = pd.Series(
  2363. [
  2364. pd.Timedelta(
  2365. days=1,
  2366. seconds=2,
  2367. microseconds=3,
  2368. nanoseconds=4,
  2369. ),
  2370. None,
  2371. ],
  2372. dtype=ArrowDtype(pa.duration("ns")),
  2373. )
  2374. result = getattr(ser.dt, prop)
  2375. expected = pd.Series(
  2376. ArrowExtensionArray(pa.array([expected, None], type=pa.int32()))
  2377. )
  2378. tm.assert_series_equal(result, expected)
  2379. def test_dt_timedelta_total_seconds():
  2380. # GH 52284
  2381. ser = pd.Series(
  2382. [
  2383. pd.Timedelta(
  2384. days=1,
  2385. seconds=2,
  2386. microseconds=3,
  2387. nanoseconds=4,
  2388. ),
  2389. None,
  2390. ],
  2391. dtype=ArrowDtype(pa.duration("ns")),
  2392. )
  2393. result = ser.dt.total_seconds()
  2394. expected = pd.Series(
  2395. ArrowExtensionArray(pa.array([86402.000003, None], type=pa.float64()))
  2396. )
  2397. tm.assert_series_equal(result, expected)
  2398. def test_dt_to_pytimedelta():
  2399. # GH 52284
  2400. data = [timedelta(1, 2, 3), timedelta(1, 2, 4)]
  2401. ser = pd.Series(data, dtype=ArrowDtype(pa.duration("ns")))
  2402. result = ser.dt.to_pytimedelta()
  2403. expected = np.array(data, dtype=object)
  2404. tm.assert_numpy_array_equal(result, expected)
  2405. assert all(type(res) is timedelta for res in result)
  2406. expected = ser.astype("timedelta64[ns]").dt.to_pytimedelta()
  2407. tm.assert_numpy_array_equal(result, expected)
  2408. def test_dt_components():
  2409. # GH 52284
  2410. ser = pd.Series(
  2411. [
  2412. pd.Timedelta(
  2413. days=1,
  2414. seconds=2,
  2415. microseconds=3,
  2416. nanoseconds=4,
  2417. ),
  2418. None,
  2419. ],
  2420. dtype=ArrowDtype(pa.duration("ns")),
  2421. )
  2422. result = ser.dt.components
  2423. expected = pd.DataFrame(
  2424. [[1, 0, 0, 2, 0, 3, 4], [None, None, None, None, None, None, None]],
  2425. columns=[
  2426. "days",
  2427. "hours",
  2428. "minutes",
  2429. "seconds",
  2430. "milliseconds",
  2431. "microseconds",
  2432. "nanoseconds",
  2433. ],
  2434. dtype="int32[pyarrow]",
  2435. )
  2436. tm.assert_frame_equal(result, expected)
  2437. @pytest.mark.parametrize("skipna", [True, False])
  2438. def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna):
  2439. # GH51624
  2440. ser = pd.Series([None], dtype="float64[pyarrow]")
  2441. result = getattr(ser, all_boolean_reductions)(skipna=skipna)
  2442. if skipna:
  2443. expected = all_boolean_reductions == "all"
  2444. else:
  2445. expected = pd.NA
  2446. assert result is expected
  2447. def test_from_sequence_of_strings_boolean():
  2448. true_strings = ["true", "TRUE", "True", "1", "1.0"]
  2449. false_strings = ["false", "FALSE", "False", "0", "0.0"]
  2450. nulls = [None]
  2451. strings = true_strings + false_strings + nulls
  2452. bools = (
  2453. [True] * len(true_strings) + [False] * len(false_strings) + [None] * len(nulls)
  2454. )
  2455. result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa.bool_())
  2456. expected = pd.array(bools, dtype="boolean[pyarrow]")
  2457. tm.assert_extension_array_equal(result, expected)
  2458. strings = ["True", "foo"]
  2459. with pytest.raises(pa.ArrowInvalid, match="Failed to parse"):
  2460. ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa.bool_())
  2461. def test_concat_empty_arrow_backed_series(dtype):
  2462. # GH#51734
  2463. ser = pd.Series([], dtype=dtype)
  2464. expected = ser.copy()
  2465. result = pd.concat([ser[np.array([], dtype=np.bool_)]])
  2466. tm.assert_series_equal(result, expected)
  2467. @pytest.mark.parametrize("dtype", ["string", "string[pyarrow]"])
  2468. def test_series_from_string_array(dtype):
  2469. arr = pa.array("the quick brown fox".split())
  2470. ser = pd.Series(arr, dtype=dtype)
  2471. expected = pd.Series(ArrowExtensionArray(arr), dtype=dtype)
  2472. tm.assert_series_equal(ser, expected)
# _data was renamed to _pa_array
  2474. class OldArrowExtensionArray(ArrowExtensionArray):
  2475. def __getstate__(self):
  2476. state = super().__getstate__()
  2477. state["_data"] = state.pop("_pa_array")
  2478. return state
  2479. def test_pickle_old_arrowextensionarray():
  2480. data = pa.array([1])
  2481. expected = OldArrowExtensionArray(data)
  2482. result = pickle.loads(pickle.dumps(expected))
  2483. tm.assert_extension_array_equal(result, expected)
  2484. assert result._pa_array == pa.chunked_array(data)
  2485. assert not hasattr(result, "_data")
  2486. def test_setitem_boolean_replace_with_mask_segfault():
  2487. # GH#52059
  2488. N = 145_000
  2489. arr = ArrowExtensionArray(pa.chunked_array([np.ones((N,), dtype=np.bool_)]))
  2490. expected = arr.copy()
  2491. arr[np.zeros((N,), dtype=np.bool_)] = False
  2492. assert arr._pa_array == expected._pa_array
  2493. @pytest.mark.parametrize(
  2494. "data, arrow_dtype",
  2495. [
  2496. ([b"a", b"b"], pa.large_binary()),
  2497. (["a", "b"], pa.large_string()),
  2498. ],
  2499. )
  2500. def test_conversion_large_dtypes_from_numpy_array(data, arrow_dtype):
  2501. dtype = ArrowDtype(arrow_dtype)
  2502. result = pd.array(np.array(data), dtype=dtype)
  2503. expected = pd.array(data, dtype=dtype)
  2504. tm.assert_extension_array_equal(result, expected)
  2505. def test_concat_null_array():
  2506. df = pd.DataFrame({"a": [None, None]}, dtype=ArrowDtype(pa.null()))
  2507. df2 = pd.DataFrame({"a": [0, 1]}, dtype="int64[pyarrow]")
  2508. result = pd.concat([df, df2], ignore_index=True)
  2509. expected = pd.DataFrame({"a": [None, None, 0, 1]}, dtype="int64[pyarrow]")
  2510. tm.assert_frame_equal(result, expected)
  2511. @pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES + tm.FLOAT_PYARROW_DTYPES)
  2512. def test_describe_numeric_data(pa_type):
  2513. # GH 52470
  2514. data = pd.Series([1, 2, 3], dtype=ArrowDtype(pa_type))
  2515. result = data.describe()
  2516. expected = pd.Series(
  2517. [3, 2, 1, 1, 1.5, 2.0, 2.5, 3],
  2518. dtype=ArrowDtype(pa.float64()),
  2519. index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  2520. )
  2521. tm.assert_series_equal(result, expected)
  2522. @pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES)
  2523. def test_describe_timedelta_data(pa_type):
  2524. # GH53001
  2525. data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
  2526. result = data.describe()
  2527. expected = pd.Series(
  2528. [9] + pd.to_timedelta([5, 2, 1, 3, 5, 7, 9], unit=pa_type.unit).tolist(),
  2529. dtype=object,
  2530. index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  2531. )
  2532. tm.assert_series_equal(result, expected)
  2533. @pytest.mark.parametrize("pa_type", tm.DATETIME_PYARROW_DTYPES)
  2534. def test_describe_datetime_data(pa_type):
  2535. # GH53001
  2536. data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
  2537. result = data.describe()
  2538. expected = pd.Series(
  2539. [9]
  2540. + [
  2541. pd.Timestamp(v, tz=pa_type.tz, unit=pa_type.unit)
  2542. for v in [5, 1, 3, 5, 7, 9]
  2543. ],
  2544. dtype=object,
  2545. index=["count", "mean", "min", "25%", "50%", "75%", "max"],
  2546. )
  2547. tm.assert_series_equal(result, expected)
  2548. @pytest.mark.parametrize(
  2549. "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
  2550. )
  2551. def test_quantile_temporal(pa_type):
  2552. # GH52678
  2553. data = [1, 2, 3]
  2554. ser = pd.Series(data, dtype=ArrowDtype(pa_type))
  2555. result = ser.quantile(0.1)
  2556. expected = ser[0]
  2557. assert result == expected
  2558. def test_date32_repr():
  2559. # GH48238
  2560. arrow_dt = pa.array([date.fromisoformat("2020-01-01")], type=pa.date32())
  2561. ser = pd.Series(arrow_dt, dtype=ArrowDtype(arrow_dt.type))
  2562. assert repr(ser) == "0 2020-01-01\ndtype: date32[day][pyarrow]"
  2563. def test_duration_overflow_from_ndarray_containing_nat():
  2564. # GH52843
  2565. data_ts = pd.to_datetime([1, None])
  2566. data_td = pd.to_timedelta([1, None])
  2567. ser_ts = pd.Series(data_ts, dtype=ArrowDtype(pa.timestamp("ns")))
  2568. ser_td = pd.Series(data_td, dtype=ArrowDtype(pa.duration("ns")))
  2569. result = ser_ts + ser_td
  2570. expected = pd.Series([2, None], dtype=ArrowDtype(pa.timestamp("ns")))
  2571. tm.assert_series_equal(result, expected)
  2572. def test_infer_dtype_pyarrow_dtype(data, request):
  2573. res = lib.infer_dtype(data)
  2574. assert res != "unknown-array"
  2575. if data._hasna and res in ["floating", "datetime64", "timedelta64"]:
  2576. mark = pytest.mark.xfail(
  2577. reason="in infer_dtype pd.NA is not ignored in these cases "
  2578. "even with skipna=True in the list(data) check below"
  2579. )
  2580. request.applymarker(mark)
  2581. assert res == lib.infer_dtype(list(data), skipna=True)
  2582. @pytest.mark.parametrize(
  2583. "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
  2584. )
  2585. def test_from_sequence_temporal(pa_type):
  2586. # GH 53171
  2587. val = 3
  2588. unit = pa_type.unit
  2589. if pa.types.is_duration(pa_type):
  2590. seq = [pd.Timedelta(val, unit=unit).as_unit(unit)]
  2591. else:
  2592. seq = [pd.Timestamp(val, unit=unit, tz=pa_type.tz).as_unit(unit)]
  2593. result = ArrowExtensionArray._from_sequence(seq, dtype=pa_type)
  2594. expected = ArrowExtensionArray(pa.array([val], type=pa_type))
  2595. tm.assert_extension_array_equal(result, expected)
  2596. @pytest.mark.parametrize(
  2597. "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
  2598. )
  2599. def test_setitem_temporal(pa_type):
  2600. # GH 53171
  2601. unit = pa_type.unit
  2602. if pa.types.is_duration(pa_type):
  2603. val = pd.Timedelta(1, unit=unit).as_unit(unit)
  2604. else:
  2605. val = pd.Timestamp(1, unit=unit, tz=pa_type.tz).as_unit(unit)
  2606. arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))
  2607. result = arr.copy()
  2608. result[:] = val
  2609. expected = ArrowExtensionArray(pa.array([1, 1, 1], type=pa_type))
  2610. tm.assert_extension_array_equal(result, expected)
  2611. @pytest.mark.parametrize(
  2612. "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
  2613. )
  2614. def test_arithmetic_temporal(pa_type, request):
  2615. # GH 53171
  2616. arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))
  2617. unit = pa_type.unit
  2618. result = arr - pd.Timedelta(1, unit=unit).as_unit(unit)
  2619. expected = ArrowExtensionArray(pa.array([0, 1, 2], type=pa_type))
  2620. tm.assert_extension_array_equal(result, expected)
  2621. @pytest.mark.parametrize(
  2622. "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
  2623. )
  2624. def test_comparison_temporal(pa_type):
  2625. # GH 53171
  2626. unit = pa_type.unit
  2627. if pa.types.is_duration(pa_type):
  2628. val = pd.Timedelta(1, unit=unit).as_unit(unit)
  2629. else:
  2630. val = pd.Timestamp(1, unit=unit, tz=pa_type.tz).as_unit(unit)
  2631. arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))
  2632. result = arr > val
  2633. expected = ArrowExtensionArray(pa.array([False, True, True], type=pa.bool_()))
  2634. tm.assert_extension_array_equal(result, expected)
  2635. @pytest.mark.parametrize(
  2636. "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
  2637. )
  2638. def test_getitem_temporal(pa_type):
  2639. # GH 53326
  2640. arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))
  2641. result = arr[1]
  2642. if pa.types.is_duration(pa_type):
  2643. expected = pd.Timedelta(2, unit=pa_type.unit).as_unit(pa_type.unit)
  2644. assert isinstance(result, pd.Timedelta)
  2645. else:
  2646. expected = pd.Timestamp(2, unit=pa_type.unit, tz=pa_type.tz).as_unit(
  2647. pa_type.unit
  2648. )
  2649. assert isinstance(result, pd.Timestamp)
  2650. assert result.unit == expected.unit
  2651. assert result == expected
  2652. @pytest.mark.parametrize(
  2653. "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
  2654. )
  2655. def test_iter_temporal(pa_type):
  2656. # GH 53326
  2657. arr = ArrowExtensionArray(pa.array([1, None], type=pa_type))
  2658. result = list(arr)
  2659. if pa.types.is_duration(pa_type):
  2660. expected = [
  2661. pd.Timedelta(1, unit=pa_type.unit).as_unit(pa_type.unit),
  2662. pd.NA,
  2663. ]
  2664. assert isinstance(result[0], pd.Timedelta)
  2665. else:
  2666. expected = [
  2667. pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit),
  2668. pd.NA,
  2669. ]
  2670. assert isinstance(result[0], pd.Timestamp)
  2671. assert result[0].unit == expected[0].unit
  2672. assert result == expected
  2673. def test_groupby_series_size_returns_pa_int(data):
  2674. # GH 54132
  2675. ser = pd.Series(data[:3], index=["a", "a", "b"])
  2676. result = ser.groupby(level=0).size()
  2677. expected = pd.Series([2, 1], dtype="int64[pyarrow]", index=["a", "b"])
  2678. tm.assert_series_equal(result, expected)
  2679. @pytest.mark.parametrize(
  2680. "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES, ids=repr
  2681. )
  2682. @pytest.mark.parametrize("dtype", [None, object])
  2683. def test_to_numpy_temporal(pa_type, dtype):
  2684. # GH 53326
  2685. # GH 55997: Return datetime64/timedelta64 types with NaT if possible
  2686. arr = ArrowExtensionArray(pa.array([1, None], type=pa_type))
  2687. result = arr.to_numpy(dtype=dtype)
  2688. if pa.types.is_duration(pa_type):
  2689. value = pd.Timedelta(1, unit=pa_type.unit).as_unit(pa_type.unit)
  2690. else:
  2691. value = pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit)
  2692. if dtype == object or (pa.types.is_timestamp(pa_type) and pa_type.tz is not None):
  2693. if dtype == object:
  2694. na = pd.NA
  2695. else:
  2696. na = pd.NaT
  2697. expected = np.array([value, na], dtype=object)
  2698. assert result[0].unit == value.unit
  2699. else:
  2700. na = pa_type.to_pandas_dtype().type("nat", pa_type.unit)
  2701. value = value.to_numpy()
  2702. expected = np.array([value, na])
  2703. assert np.datetime_data(result[0])[0] == pa_type.unit
  2704. tm.assert_numpy_array_equal(result, expected)
  2705. def test_groupby_count_return_arrow_dtype(data_missing):
  2706. df = pd.DataFrame({"A": [1, 1], "B": data_missing, "C": data_missing})
  2707. result = df.groupby("A").count()
  2708. expected = pd.DataFrame(
  2709. [[1, 1]],
  2710. index=pd.Index([1], name="A"),
  2711. columns=["B", "C"],
  2712. dtype="int64[pyarrow]",
  2713. )
  2714. tm.assert_frame_equal(result, expected)
  2715. def test_fixed_size_list():
  2716. # GH#55000
  2717. ser = pd.Series(
  2718. [[1, 2], [3, 4]], dtype=ArrowDtype(pa.list_(pa.int64(), list_size=2))
  2719. )
  2720. result = ser.dtype.type
  2721. assert result == list
  2722. def test_arrowextensiondtype_dataframe_repr():
  2723. # GH 54062
  2724. df = pd.DataFrame(
  2725. pd.period_range("2012", periods=3),
  2726. columns=["col"],
  2727. dtype=ArrowDtype(ArrowPeriodType("D")),
  2728. )
  2729. result = repr(df)
  2730. # TODO: repr value may not be expected; address how
  2731. # pyarrow.ExtensionType values are displayed
  2732. expected = " col\n0 15340\n1 15341\n2 15342"
  2733. assert result == expected
  2734. def test_pow_missing_operand():
  2735. # GH 55512
  2736. k = pd.Series([2, None], dtype="int64[pyarrow]")
  2737. result = k.pow(None, fill_value=3)
  2738. expected = pd.Series([8, None], dtype="int64[pyarrow]")
  2739. tm.assert_series_equal(result, expected)
  2740. @pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES)
  2741. def test_duration_fillna_numpy(pa_type):
  2742. # GH 54707
  2743. ser1 = pd.Series([None, 2], dtype=ArrowDtype(pa_type))
  2744. ser2 = pd.Series(np.array([1, 3], dtype=f"m8[{pa_type.unit}]"))
  2745. result = ser1.fillna(ser2)
  2746. expected = pd.Series([1, 2], dtype=ArrowDtype(pa_type))
  2747. tm.assert_series_equal(result, expected)
  2748. def test_comparison_not_propagating_arrow_error():
  2749. # GH#54944
  2750. a = pd.Series([1 << 63], dtype="uint64[pyarrow]")
  2751. b = pd.Series([None], dtype="int64[pyarrow]")
  2752. with pytest.raises(pa.lib.ArrowInvalid, match="Integer value"):
  2753. a < b
  2754. def test_factorize_chunked_dictionary():
  2755. # GH 54844
  2756. pa_array = pa.chunked_array(
  2757. [pa.array(["a"]).dictionary_encode(), pa.array(["b"]).dictionary_encode()]
  2758. )
  2759. ser = pd.Series(ArrowExtensionArray(pa_array))
  2760. res_indices, res_uniques = ser.factorize()
  2761. exp_indicies = np.array([0, 1], dtype=np.intp)
  2762. exp_uniques = pd.Index(ArrowExtensionArray(pa_array.combine_chunks()))
  2763. tm.assert_numpy_array_equal(res_indices, exp_indicies)
  2764. tm.assert_index_equal(res_uniques, exp_uniques)
  2765. def test_dictionary_astype_categorical():
  2766. # GH#56672
  2767. arrs = [
  2768. pa.array(np.array(["a", "x", "c", "a"])).dictionary_encode(),
  2769. pa.array(np.array(["a", "d", "c"])).dictionary_encode(),
  2770. ]
  2771. ser = pd.Series(ArrowExtensionArray(pa.chunked_array(arrs)))
  2772. result = ser.astype("category")
  2773. categories = pd.Index(["a", "x", "c", "d"], dtype=ArrowDtype(pa.string()))
  2774. expected = pd.Series(
  2775. ["a", "x", "c", "a", "a", "d", "c"],
  2776. dtype=pd.CategoricalDtype(categories=categories),
  2777. )
  2778. tm.assert_series_equal(result, expected)
  2779. def test_arrow_floordiv():
  2780. # GH 55561
  2781. a = pd.Series([-7], dtype="int64[pyarrow]")
  2782. b = pd.Series([4], dtype="int64[pyarrow]")
  2783. expected = pd.Series([-2], dtype="int64[pyarrow]")
  2784. result = a // b
  2785. tm.assert_series_equal(result, expected)
  2786. def test_arrow_floordiv_large_values():
  2787. # GH 56645
  2788. a = pd.Series([1425801600000000000], dtype="int64[pyarrow]")
  2789. expected = pd.Series([1425801600000], dtype="int64[pyarrow]")
  2790. result = a // 1_000_000
  2791. tm.assert_series_equal(result, expected)
  2792. @pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
  2793. def test_arrow_floordiv_large_integral_result(dtype):
  2794. # GH 56676
  2795. a = pd.Series([18014398509481983], dtype=dtype)
  2796. result = a // 1
  2797. tm.assert_series_equal(result, a)
  2798. @pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES)
  2799. def test_arrow_floordiv_larger_divisor(pa_type):
  2800. # GH 56676
  2801. dtype = ArrowDtype(pa_type)
  2802. a = pd.Series([-23], dtype=dtype)
  2803. result = a // 24
  2804. expected = pd.Series([-1], dtype=dtype)
  2805. tm.assert_series_equal(result, expected)
  2806. @pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES)
  2807. def test_arrow_floordiv_integral_invalid(pa_type):
  2808. # GH 56676
  2809. min_value = np.iinfo(pa_type.to_pandas_dtype()).min
  2810. a = pd.Series([min_value], dtype=ArrowDtype(pa_type))
  2811. with pytest.raises(pa.lib.ArrowInvalid, match="overflow|not in range"):
  2812. a // -1
  2813. with pytest.raises(pa.lib.ArrowInvalid, match="divide by zero"):
  2814. a // 0
  2815. @pytest.mark.parametrize("dtype", tm.FLOAT_PYARROW_DTYPES_STR_REPR)
  2816. def test_arrow_floordiv_floating_0_divisor(dtype):
  2817. # GH 56676
  2818. a = pd.Series([2], dtype=dtype)
  2819. result = a // 0
  2820. expected = pd.Series([float("inf")], dtype=dtype)
  2821. tm.assert_series_equal(result, expected)
  2822. @pytest.mark.parametrize("dtype", ["float64", "datetime64[ns]", "timedelta64[ns]"])
  2823. def test_astype_int_with_null_to_numpy_dtype(dtype):
  2824. # GH 57093
  2825. ser = pd.Series([1, None], dtype="int64[pyarrow]")
  2826. result = ser.astype(dtype)
  2827. expected = pd.Series([1, None], dtype=dtype)
  2828. tm.assert_series_equal(result, expected)
  2829. @pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES)
  2830. def test_arrow_integral_floordiv_large_values(pa_type):
  2831. # GH 56676
  2832. max_value = np.iinfo(pa_type.to_pandas_dtype()).max
  2833. dtype = ArrowDtype(pa_type)
  2834. a = pd.Series([max_value], dtype=dtype)
  2835. b = pd.Series([1], dtype=dtype)
  2836. result = a // b
  2837. tm.assert_series_equal(result, a)
  2838. @pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
  2839. def test_arrow_true_division_large_divisor(dtype):
  2840. # GH 56706
  2841. a = pd.Series([0], dtype=dtype)
  2842. b = pd.Series([18014398509481983], dtype=dtype)
  2843. expected = pd.Series([0], dtype="float64[pyarrow]")
  2844. result = a / b
  2845. tm.assert_series_equal(result, expected)
  2846. @pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
  2847. def test_arrow_floor_division_large_divisor(dtype):
  2848. # GH 56706
  2849. a = pd.Series([0], dtype=dtype)
  2850. b = pd.Series([18014398509481983], dtype=dtype)
  2851. expected = pd.Series([0], dtype=dtype)
  2852. result = a // b
  2853. tm.assert_series_equal(result, expected)
  2854. def test_string_to_datetime_parsing_cast():
  2855. # GH 56266
  2856. string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"]
  2857. result = pd.Series(string_dates, dtype="timestamp[ns][pyarrow]")
  2858. expected = pd.Series(
  2859. ArrowExtensionArray(pa.array(pd.to_datetime(string_dates), from_pandas=True))
  2860. )
  2861. tm.assert_series_equal(result, expected)
  2862. @pytest.mark.skipif(
  2863. pa_version_under13p0, reason="pairwise_diff_checked not implemented in pyarrow"
  2864. )
  2865. def test_interpolate_not_numeric(data):
  2866. if not data.dtype._is_numeric:
  2867. ser = pd.Series(data)
  2868. msg = re.escape(f"Cannot interpolate with {ser.dtype} dtype")
  2869. with pytest.raises(TypeError, match=msg):
  2870. pd.Series(data).interpolate()
  2871. def test_string_to_time_parsing_cast():
  2872. # GH 56463
  2873. string_times = ["11:41:43.076160"]
  2874. result = pd.Series(string_times, dtype="time64[us][pyarrow]")
  2875. expected = pd.Series(
  2876. ArrowExtensionArray(pa.array([time(11, 41, 43, 76160)], from_pandas=True))
  2877. )
  2878. tm.assert_series_equal(result, expected)
  2879. def test_to_numpy_float():
  2880. # GH#56267
  2881. ser = pd.Series([32, 40, None], dtype="float[pyarrow]")
  2882. result = ser.astype("float64")
  2883. expected = pd.Series([32, 40, np.nan], dtype="float64")
  2884. tm.assert_series_equal(result, expected)
  2885. def test_to_numpy_timestamp_to_int():
  2886. # GH 55997
  2887. ser = pd.Series(["2020-01-01 04:30:00"], dtype="timestamp[ns][pyarrow]")
  2888. result = ser.to_numpy(dtype=np.int64)
  2889. expected = np.array([1577853000000000000])
  2890. tm.assert_numpy_array_equal(result, expected)
  2891. def test_map_numeric_na_action():
  2892. ser = pd.Series([32, 40, None], dtype="int64[pyarrow]")
  2893. result = ser.map(lambda x: 42, na_action="ignore")
  2894. expected = pd.Series([42.0, 42.0, np.nan], dtype="float64")
  2895. tm.assert_series_equal(result, expected)